# DataFrame Description:

This DataFrame contains synthetic data representing DPI (Deep Packet Inspection) session details, simulating network traffic. 
It includes various attributes such as timestamps, IP addresses, traffic volume, device types, user activities, and risk categories. 
The dataset contains 100,000 randomly generated records covering network usage, application interactions, and security risk classifications.
The data can be used for analysis or simulations related to network traffic, user behavior, and network security.


In [5]:
import random
import pandas as pd
from faker import Faker

# Data generation with Faker
fake = Faker("en_US")

# Seed BOTH random sources. Faker.seed() only seeds Faker's own generator;
# the random.randint/uniform/choice calls used for most fields draw from the
# global `random` module, which would otherwise produce different values on
# every run.
# NOTE(review): the date fields are generated relative to the current date, so
# they may still differ between runs on different days - confirm if that matters.
Faker.seed(42)
random.seed(42)

# Destination domains: construction / home-improvement stores
construction_shop_domains = [
    "homeDepot.com",
    "lowes.com",
    "builderswarehouse.com",
    "wayfair.com",
    "flooranddecor.com",
    "constructionstore.com",
    "bunnings.com.au",
    "plumbingsupply.com",
]

# Destination domains: brand shops (Samsung/Apple) and general retailers
brand_shop_domains = [
    "samsung.com",
    "shop.samsung.com",
    "apple.com",
    "shop.apple.com",
    "applestore.com",
    "samsungstore.com",
    "bestbuy.com",
    "ebay.com",
    "amazon.com",
    "newegg.com",
    "walmart.com",
]

# Traffic categories, including the construction and technical store ones
categories = [
    "social media",
    "streaming",
    "gaming",
    "building",
    "technical",
    "electronics",
    "furniture",
]

# Function to generate DPI (Deep Packet Inspection) data
def generate_dpi_data():
    """Build one synthetic DPI session record.

    Each field is drawn independently, either from the module-level ``fake``
    Faker instance or from the ``random`` module, so fields are not correlated
    with one another (for example, ``port`` does not depend on ``protocol``).

    Returns:
        dict: A single session keyed by column name. Keys are inserted in the
        order the columns should appear in the resulting table.
    """
    pick = random.choice

    # Start is a random moment within this year; the end is drawn from the
    # period beginning at that start, and the duration is derived from both.
    session_start = fake.date_time_this_year()
    session_end = fake.date_time_between(start_date=session_start)

    return {
        # --- timing and location of the session ---
        "start_time": session_start,
        "end_time": session_end,
        "base_station_id": random.randint(1, 1000),
        "latitude": random.uniform(25.0, 49.0),
        "longitude": random.uniform(-125.0, -66.0),
        # --- endpoints ---
        "user_ip": fake.ipv4_public(),
        "destination_ip": fake.ipv4_public(),
        # Destination domain: any construction store or brand shop
        "domain": pick(construction_shop_domains + brand_shop_domains),
        # --- transport and volume ---
        "protocol": pick(["HTTP", "HTTPS", "TCP", "UDP"]),
        "port": pick([80, 443, 8080, 53]),
        "traffic_volume": random.randint(1_000, 10_000_000),  # bytes
        "traffic_direction": pick(["upload", "download"]),
        "packet_count": random.randint(100, 10_000),
        "avg_packet_size": round(random.uniform(50, 1500), 2),  # bytes
        "duration": int((session_end - session_start).total_seconds()),  # whole seconds
        # --- geography and network operator ---
        "country": "United States",
        "region": fake.state(),
        "city": fake.city(),
        "asn": random.randint(1_000, 65_535),
        "isp": fake.company(),
        # --- subscriber and device ---
        "user_id": random.randint(1, 1_000_000),
        "subscriber_id": random.randint(1, 1_000_000),
        "msisdn": fake.msisdn(),
        "device_type": pick(["smartphone", "tablet", "PC"]),
        "os": pick(["Android", "iOS", "Windows", "macOS"]),
        "browser": pick(["Chrome", "Firefox", "Safari", "Edge"]),
        "user_agent": fake.user_agent(),
        # --- application and classification ---
        "application_name": pick(["WhatsApp", "YouTube", "Facebook"]),
        "content_type": pick(["video", "text", "image"]),
        "risk_category": pick(["safe", "suspicious", "malicious"]),
        "category": pick(categories),
        # --- connection flags ---
        "is_vpn": pick([True, False]),
        "connection_type": pick(["4G", "5G", "Wi-Fi"]),
        "throttled": pick([True, False]),
        # HTTP status code; None when no code was recorded
        "error_code": pick([None, 404, 500, 200]),
    }

# Build a pandas DataFrame out of freshly generated DPI session records
def generate_data_to_dataframe(total_records=10_000):
    """Generate DPI session records and collect them into a DataFrame.

    Args:
        total_records: Number of records to generate; ``generate_dpi_data``
            is called once per record.

    Returns:
        pandas.DataFrame: One row per generated record, one column per key of
        the record dictionaries.
    """
    session_rows = [generate_dpi_data() for _ in range(total_records)]
    frame = pd.DataFrame(session_rows)
    return frame

# Build the full synthetic dataset: 100,000 DPI session rows
dpi_data_df = generate_data_to_dataframe(100_000)

# Optionally keep a copy on disk as CSV, written without the index column
dpi_data_df.to_csv(path_or_buf="dpi_data.csv", index=False)

print("Data has been successfully generated and saved into a DataFrame.")


Data has been successfully generated and saved into a DataFrame.


In [6]:
dpi_data_df

Unnamed: 0,start_time,end_time,base_station_id,latitude,longitude,user_ip,destination_ip,domain,protocol,port,...,browser,user_agent,application_name,content_type,risk_category,category,is_vpn,connection_type,throttled,error_code
0,2025-01-16 12:29:35,2025-01-16 20:35:59,215,40.127440,-103.914789,47.172.78.228,70.143.71.233,shop.samsung.com,TCP,8080,...,Safari,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_6_6; r...,Facebook,video,safe,electronics,True,4G,False,200.0
1,2025-01-18 00:50:23,2025-01-19 10:56:32,101,25.504053,-89.272044,200.175.62.248,203.169.212.190,samsungstore.com,TCP,53,...,Chrome,Mozilla/5.0 (compatible; MSIE 6.0; Windows CE;...,YouTube,image,malicious,technical,True,5G,False,
2,2025-01-19 03:10:01,2025-01-19 10:57:58,913,30.357986,-83.429554,215.4.172.247,37.209.117.182,homeDepot.com,UDP,53,...,Firefox,Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7...,WhatsApp,video,safe,social media,True,Wi-Fi,False,500.0
3,2025-01-06 19:29:27,2025-01-08 05:14:04,141,25.266515,-113.187130,18.133.45.128,36.109.250.82,builderswarehouse.com,HTTPS,80,...,Firefox,Mozilla/5.0 (Linux; Android 2.3.5) AppleWebKit...,Facebook,text,suspicious,social media,False,4G,False,200.0
4,2025-01-05 14:55:18,2025-01-07 21:52:13,572,34.373700,-69.481130,54.172.207.183,131.119.138.105,apple.com,HTTPS,8080,...,Firefox,Mozilla/5.0 (iPad; CPU iPad OS 10_3_4 like Mac...,YouTube,text,suspicious,electronics,False,4G,False,500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2025-01-15 14:07:00,2025-01-19 04:01:52,622,35.822603,-98.300873,133.88.156.255,174.159.55.244,homeDepot.com,UDP,53,...,Edge,Opera/8.90.(Windows NT 5.01; tk-TM) Presto/2.9...,Facebook,image,malicious,gaming,True,4G,True,404.0
99996,2025-01-02 17:18:41,2025-01-08 07:37:26,170,25.083679,-101.234792,44.238.97.52,200.189.170.23,bunnings.com.au,HTTPS,80,...,Edge,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,Facebook,image,malicious,electronics,False,5G,False,404.0
99997,2025-01-10 15:56:56,2025-01-14 05:33:01,697,45.799186,-105.755863,94.119.124.138,145.36.2.89,plumbingsupply.com,TCP,8080,...,Chrome,Mozilla/5.0 (compatible; MSIE 7.0; Windows NT ...,YouTube,text,safe,building,False,4G,False,404.0
99998,2025-01-05 00:09:22,2025-01-10 15:28:11,692,40.451469,-84.655184,144.22.159.6,220.245.46.106,newegg.com,TCP,443,...,Chrome,Mozilla/5.0 (X11; Linux x86_64; rv:1.9.7.20) G...,YouTube,text,malicious,social media,True,4G,False,404.0


In [9]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Database connection parameters
# NOTE(review): the credentials are hard-coded in the notebook (redacted here);
# consider loading them from environment variables or a secrets store instead.
db_config = {
    "host": "192.168.1.164",  # Database host IP
    "user": "######",          # Username for the database
    "password": '#########',  # Password for the database
    "database": "raw_data"    # Name of the database
}

# Creating a connection using SQLAlchemy.
# The URL is assembled from its components rather than interpolated into a
# string: URL.create() takes each part verbatim, so special characters in the
# username or password (such as "#", "@", "/" or ":") cannot corrupt the URL.
engine = create_engine(
    URL.create(
        drivername="mysql+pymysql",
        username=db_config["user"],
        password=db_config["password"],
        host=db_config["host"],
        database=db_config["database"],
    )
)

# Testing the connection: open a connection from the engine and let the `with`
# block release it again; no statement is executed on it.
# A failure is only printed, so the code after this block still runs.
try:
    with engine.connect() as connection:
        print("Connection to the database established successfully.")
except Exception as e:
    print(f"Connection error: {e}")

# Target MySQL table for the generated sessions
table_name = "dpi_sessions"

# Append the DataFrame rows to the table in batches of 1000, leaving out the
# DataFrame index. Any failure is reported instead of raised.
try:
    dpi_data_df.to_sql(
        name=table_name,
        con=engine,
        index=False,
        if_exists="append",
        chunksize=1000,
    )
    print(f"Data successfully written to the {table_name} table.")
except Exception as e:
    print(f"Error while writing data to the database: {e}")


Connection to the database established successfully.
Data successfully written to the dpi_sessions table.
