# TON-IoT Dataset Analysis

Comprehensive analysis of the TON-IoT dataset for IoT security research.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Import custom processors
import sys
sys.path.append('../src')
from data_pipeline.ton_iot_processor import TONIoTProcessor

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the un-versioned 'seaborn' name. Pick whichever is
# registered so the notebook does not fail on an older Matplotlib.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('husl')

# Configure pandas display: passing None lifts the column-count limit and
# lets pandas work out the output width itself, so wide frames are not elided.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

## 1. Dataset Loading and Overview

In [None]:
# Point the processor at the sample CSV (path is relative to the notebook).
dataset_path = "../../datasets/samples/ton_iot_sample.csv"
processor = TONIoTProcessor(dataset_path)

# Try the real sample first; if the file is absent, build a seeded synthetic
# frame with TON-IoT-like columns so the rest of the notebook can still run.
try:
    data = processor.load_data()
except FileNotFoundError:
    print("Sample dataset not found. Creating synthetic data for demonstration.")
    # Fixed seed: the draws below are made in column order, so the synthetic
    # data is reproducible from run to run.
    np.random.seed(42)
    n_samples = 10000

    data = pd.DataFrame(dict(
        ts=pd.date_range('2023-01-01', periods=n_samples, freq='1s'),
        src_ip=np.random.choice(['192.168.1.100', '192.168.1.101', '192.168.1.102'], size=n_samples),
        dst_ip=np.random.choice(['8.8.8.8', '1.1.1.1', '192.168.1.1'], size=n_samples),
        src_port=np.random.randint(1024, 65535, size=n_samples),
        dst_port=np.random.choice([80, 443, 53, 22, 1883], size=n_samples),
        proto=np.random.choice(['TCP', 'UDP', 'ICMP'], size=n_samples, p=[0.7, 0.25, 0.05]),
        duration=np.random.exponential(2, size=n_samples),
        src_bytes=np.random.lognormal(6, 2, size=n_samples).astype(int),
        dst_bytes=np.random.lognormal(5, 2, size=n_samples).astype(int),
        src_pkts=np.random.poisson(10, size=n_samples),
        dst_pkts=np.random.poisson(8, size=n_samples),
        label=np.random.choice(
            ['normal', 'dos', 'ddos', 'injection', 'backdoor'],
            size=n_samples,
            p=[0.8, 0.1, 0.05, 0.03, 0.02],
        ),
    ))

    # Hand the synthetic frame to the processor so later cells operate on it.
    processor.data = data
    print(f"Created synthetic dataset with shape: {data.shape}")
else:
    # Basic overview of the loaded file: size, schema, dtypes and null counts.
    print(f"Dataset shape: {data.shape}")
    print(f"\nColumns: {list(data.columns)}")
    print(f"\nData types:\n{data.dtypes}")
    print(f"\nMissing values:\n{data.isnull().sum()}")

## 2. Data Preprocessing and Feature Engineering

In [None]:
# Step 1: run the processor's preprocessing and report the resulting shape.
processed_data = processor.preprocess()
print(f"Processed data shape: {processed_data.shape}")

# Step 2: run the processor's feature extraction and report its shape too.
feature_data = processor.extract_features()
print(f"Feature data shape: {feature_data.shape}")

# Step 3: preview the processed frame; as the last expression in the cell,
# the notebook renders it as a table.
print("\nFirst 5 rows of processed data:")
processed_data.head(5)

## 3. Exploratory Data Analysis

In [None]:
# Attack type distribution.
# NOTE(review): get_attack_distribution() is consumed here as a mapping of
# label -> count (.keys()/.values()/.items()); confirm against the processor.
attack_dist = processor.get_attack_distribution()
attacks = list(attack_dist.keys())
counts = list(attack_dist.values())

# Sum once up front instead of re-summing on every loop iteration.
total_attack_records = sum(counts)

print("Attack Type Distribution:")
for attack, count in attack_dist.items():
    # Guard the division so an all-zero distribution reports 0% rather than raising.
    share = count / total_attack_records * 100 if total_attack_records else 0.0
    print(f"{attack}: {count} ({share:.2f}%)")

# Visualize attack distribution: absolute counts (left) and shares (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Bar plot
ax1.bar(attacks, counts)
ax1.set_title('Attack Type Distribution')
ax1.set_xlabel('Attack Type')
ax1.set_ylabel('Count')
ax1.tick_params(axis='x', rotation=45)

# Pie chart
ax2.pie(counts, labels=attacks, autopct='%1.1f%%')
ax2.set_title('Attack Type Distribution (Percentage)')

plt.tight_layout()
plt.show()

In [None]:
# Protocol distribution.
# NOTE(review): get_protocol_distribution() is consumed here as a mapping of
# protocol -> count; confirm against the processor.
protocol_dist = processor.get_protocol_distribution()
protocols = list(protocol_dist.keys())
counts = list(protocol_dist.values())

# Sum once up front instead of re-summing on every loop iteration.
total_protocol_records = sum(counts)

print("\nProtocol Distribution:")
for proto, count in protocol_dist.items():
    # Guard the division so an all-zero distribution reports 0% rather than raising.
    share = count / total_protocol_records * 100 if total_protocol_records else 0.0
    print(f"{proto}: {count} ({share:.2f}%)")

# Visualize protocol distribution
plt.figure(figsize=(10, 6))
plt.bar(protocols, counts)
plt.title('Protocol Distribution')
plt.xlabel('Protocol')
plt.ylabel('Count')
plt.show()

## 4. Traffic Pattern Analysis

In [None]:
# Time series analysis (skipped when there is no 'ts' column).
# NOTE(review): resample() needs a datetime-like index, so this assumes 'ts'
# holds timestamps after preprocessing — confirm.
if 'ts' in processed_data.columns:
    # Traffic volume over time: number of rows per hourly bucket.
    # Lowercase 'h' is the current hourly alias (uppercase 'H' is deprecated
    # in recent pandas) and matches the lowercase '1s' used earlier in this file.
    hourly_traffic = processed_data.set_index('ts').resample('h').size()

    plt.figure(figsize=(15, 6))
    plt.plot(hourly_traffic.index, hourly_traffic.values)
    plt.title('Hourly Traffic Volume')
    plt.xlabel('Time')
    plt.ylabel('Number of Connections')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Attack patterns over time: keep rows whose label is not 'normal', then
    # count the labels per hourly bucket.
    attack_rows = processed_data[processed_data['label'] != 'normal']
    attack_timeline = attack_rows.set_index('ts').resample('h')['label'].count()

    plt.figure(figsize=(15, 6))
    plt.plot(attack_timeline.index, attack_timeline.values, color='red')
    plt.title('Attack Activity Over Time')
    plt.xlabel('Time')
    plt.ylabel('Number of Attacks')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 5. Feature Correlation Analysis

In [None]:
# Select numeric features for correlation analysis, keeping only the ones
# actually present in the processed frame.
numeric_features = ['duration', 'src_bytes', 'dst_bytes', 'src_pkts', 'dst_pkts']
numeric_features = [col for col in numeric_features if col in processed_data.columns]

if numeric_features:
    # Pairwise correlation matrix of the selected features.
    correlation_matrix = processed_data[numeric_features].corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Feature distributions by attack type: one subplot per feature, with one
    # overlaid histogram per label value.
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.ravel()

    # Never plot more features than there are subplots in the grid.
    plotted_features = numeric_features[:len(axes)]
    # The set of labels does not change between features; compute it once.
    attack_types = processed_data['label'].unique()

    for ax, feature in zip(axes, plotted_features):
        for attack_type in attack_types:
            data_subset = processed_data.loc[processed_data['label'] == attack_type, feature]
            ax.hist(data_subset, alpha=0.6, label=attack_type, bins=30)

        ax.set_title(f'{feature} Distribution by Attack Type')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')
        ax.legend()
        # Log scale on the count axis so small bars stay visible next to large ones.
        ax.set_yscale('log')

    # Hide grid cells that received no feature (e.g. the 6th cell when only
    # five features are plotted) instead of showing empty axes.
    for unused_ax in axes[len(plotted_features):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 6. Port and Service Analysis

In [None]:
# Most common destination ports (skipped when there is no 'dst_port' column).
if 'dst_port' in processed_data.columns:
    # value_counts() sorts by frequency, so head(10) is the ten busiest ports.
    top_ports = processed_data['dst_port'].value_counts().head(10)

    plt.figure(figsize=(12, 6))
    plt.bar(range(len(top_ports)), top_ports.values)
    plt.title('Top 10 Destination Ports')
    plt.xlabel('Port')
    plt.ylabel('Count')
    plt.xticks(range(len(top_ports)), top_ports.index, rotation=45)
    plt.tight_layout()
    plt.show()

    # Port usage by attack type: rows are destination ports, columns are labels.
    port_attack_crosstab = pd.crosstab(processed_data['dst_port'], processed_data['label'])

    # The crosstab index is ordered by port number, so head(10) would select
    # the ten lowest-numbered ports. Select the ten busiest ports instead,
    # in the same order as the bar chart above. reindex() with fill_value=0
    # tolerates a port that is missing from the crosstab.
    top_port_breakdown = port_attack_crosstab.reindex(top_ports.index, fill_value=0)

    # DataFrame.plot creates its own figure, so the size is passed to it
    # directly; a preceding plt.figure() would only leave an empty figure behind.
    ax = top_port_breakdown.plot(kind='bar', stacked=True, figsize=(15, 8))
    ax.set_title('Port Usage by Attack Type (Top 10 Ports)')
    ax.set_xlabel('Destination Port')
    ax.set_ylabel('Count')
    ax.legend(title='Attack Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

## 7. Summary Statistics and Insights

In [None]:
# Generate summary statistics.
# This cell reuses attack_dist, protocol_dist and top_ports computed by the
# earlier analysis cells, so those cells must have been run first.
total_samples = len(processed_data)

print("=== TON-IoT Dataset Analysis Summary ===")
print(f"Total samples: {total_samples:,}")

if 'ts' in processed_data.columns:
    print(f"Time range: {processed_data['ts'].min()} to {processed_data['ts'].max()}")
else:
    print("Time range: N/A")

# Only report IP cardinalities when the columns exist; print nothing otherwise
# (rather than emitting an empty line).
if 'src_ip' in processed_data.columns:
    print(f"Unique source IPs: {processed_data['src_ip'].nunique()}")
if 'dst_ip' in processed_data.columns:
    print(f"Unique destination IPs: {processed_data['dst_ip'].nunique()}")

# Share of rows whose label is anything other than 'normal'.
print(f"Attack ratio: {(processed_data['label'] != 'normal').mean()*100:.2f}%")

print("\n=== Key Insights ===")
print("1. Attack Distribution:")
for attack, count in attack_dist.items():
    if attack != 'normal':
        print(f"   - {attack}: {count:,} samples ({count/total_samples*100:.2f}%)")

print("\n2. Protocol Usage:")
# Takes the first three entries in the mapping's own order.
# NOTE(review): this presumes get_protocol_distribution() returns the most
# frequent protocols first — confirm against the processor.
for proto, count in list(protocol_dist.items())[:3]:
    print(f"   - {proto}: {count:,} samples ({count/total_samples*100:.2f}%)")

if 'dst_port' in processed_data.columns:
    print("\n3. Most Targeted Ports:")
    for port, count in top_ports.head(5).items():
        print(f"   - Port {port}: {count:,} connections")

print("\n=== Recommendations ===")
print("1. Focus anomaly detection on the most common protocols (TCP, UDP)")
print("2. Monitor high-traffic ports for unusual patterns")
print("3. Implement time-based analysis for attack pattern detection")
print("4. Consider feature engineering based on traffic flow characteristics")