In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
benign_df = pd.read_csv('merging/benign_cesnet.csv')
fuzzing_df = pd.read_csv('fuzzing.csv')
flooding_df = pd.read_csv('flooding.csv')

benign_df.dropna(inplace=True)
fuzzing_df.dropna(inplace=True)
flooding_df.dropna(inplace=True)

benign_df['classification'] = 'benign'
fuzzing_df['classification'] = 'fuzzing'
fuzzing_df['label'] = 1
flooding_df['classification'] = 'flooding'

In [None]:
benign_df

In [None]:
fuzzing_df

In [None]:
flooding_df

In [None]:
# fig, axes = plt.subplots(1, 3, figsize=(30, 8), sharex=True)

# # Benign boxplot
# sns.boxplot(data=benign_df.drop(columns=['label', 'classification']), orient='h', ax=axes[0])
# axes[0].set_title('Benign')
# axes[0].set_xscale('log')

# # Fuzzing boxplot
# sns.boxplot(data=fuzzing_df.drop(columns=['label', 'classification']), orient='h', ax=axes[1])
# axes[1].set_title('Fuzzing')
# axes[1].set_xscale('log')

# # Flooding boxplot
# sns.boxplot(data=flooding_df.drop(columns=['label', 'classification']), orient='h', ax=axes[2])
# axes[2].set_title('Flooding')
# axes[2].set_xscale('log')

# plt.suptitle('Comparison of Feature Distributions Across Datasets')
# plt.show()

In [None]:
# combined_df = pd.concat([benign_df, fuzzing_df, flooding_df])

# # Melt to long-form
# long_df = combined_df.melt(id_vars=['classification'], var_name='feature', value_name='value')

# # Plot overlapping boxplots
# plt.figure(figsize=(20, 8))
# sns.boxplot(data=long_df, x='value', y='feature', hue='classification')

# plt.xscale('log')
# plt.title('Overlapping Boxplot of Feature Distributions')
# plt.show()

In [3]:
# Combine fuzzing and flooding into a single attack dataset
attack_df = pd.concat([fuzzing_df, flooding_df])

# Set window size
window_size = 10000
halfway_point = 10

# Alternate windows of benign and attack
combined_df = []
i = 0

while len(benign_df) >= window_size:
    if i % 2 == 0:
        # Randomly sample a benign window instead of taking the first window
        benign_sample = benign_df.sample(n=window_size, random_state=42)
        combined_df.append(benign_sample)
        benign_df = benign_df.drop(benign_sample.index)
    else:
        if (len(flooding_df) < window_size) and (len(fuzzing_df) < window_size):
            break
        # Before halfway, use flooding; after halfway, switch to fuzzing
        if i // 2 < halfway_point:
            if len(flooding_df) >= window_size:
                combined_df.append(flooding_df.iloc[:window_size])
                flooding_df = flooding_df.iloc[window_size:]
        else:
            if len(fuzzing_df) >= window_size:
                combined_df.append(fuzzing_df.iloc[:window_size])
                fuzzing_df = fuzzing_df.iloc[window_size:]
    i += 1

# Concatenate all windows
combined_df = pd.concat(combined_df).reset_index(drop=True)

In [4]:
combined_df['classification'].value_counts()

benign      200000
flooding    100000
fuzzing      90000
Name: classification, dtype: int64

In [10]:
# Define color mapping for classification
color_map = {
    'benign': 'blue',
    'fuzzing': 'red',
    'flooding': 'orange'
}

plt.figure(figsize=(12, 6))
plt.scatter(combined_df.index, combined_df['label'], 
            c=combined_df['classification'].map(color_map),
            alpha=0.6, s=1)

# Add legend manually
import matplotlib.patches as mpatches
legend_handles = [
    mpatches.Patch(color='blue', label='benign'),
    mpatches.Patch(color='red', label='fuzzing'),
    mpatches.Patch(color='orange', label='flooding')
]
plt.legend(handles=legend_handles, title="Classification")

plt.title('QUIC Traffic Classification Over Sample Index')
plt.xlabel('Sample Index')
plt.ylabel('Label (1 = Attack, 0 = Benign)')
plt.show()

KeyError: 'classification'

<Figure size 1200x600 with 0 Axes>

In [5]:
combined_df.drop(columns=['classification'], inplace=True)
combined_df

Unnamed: 0,dst_port,dst_asn,quic_ver,dur,ratio,flow_pkt_rate,flow_byte_rate,total_pkts,total_bytes,max_bytes,...,ave_fwd_iat,std_fwd_iat,var_fwd_iat,rev_dur,max_rev_iat,min_rev_iat,ave_rev_iat,std_rev_iat,var_rev_iat,label
0,443,13335.0,1,0.257149,1,77.775920,3.655857e+04,20,9401.0,1250.0,...,11.625000,14.576844,2.124844e+02,0.255000,155.000000,0.000000e+00,14.909091,44.313869,1.963719e+03,0
1,443,9821.0,1,0.001191,1,25191.476213,3.003328e+07,30,35766.0,1232.0,...,0.001011,0.000000,0.000000e+00,0.001191,0.001191,9.499999e-08,0.000525,0.000545,2.972463e-07,0
2,443,15169.0,1,0.033477,0,268.841294,1.182006e+05,9,3957.0,1250.0,...,1.750000,1.785357,3.187500e+00,0.015000,20.000000,0.000000e+00,6.500000,8.170067,6.675000e+01,0
3,443,9821.0,1,0.003283,1,9137.062331,1.052711e+07,30,34564.0,1232.0,...,0.001492,0.000611,3.737128e-07,0.003283,0.003283,9.000001e-08,0.001711,0.001073,1.151368e-06,0
4,443,9821.0,1,0.002773,1,10818.346618,1.246418e+07,30,34564.0,1232.0,...,0.001698,0.000903,8.149512e-07,0.002773,0.002773,7.500000e-08,0.001202,0.000964,9.283899e-07,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389995,443,15169.0,1,0.148511,0,127.936651,6.647319e+04,19,9872.0,1250.0,...,13.333333,14.772347,2.182222e+02,0.063000,6.000000,0.000000e+00,3.111111,2.514157,6.320988e+00,0
389996,443,13335.0,1,1.223215,1,17.167873,9.242038e+03,21,11305.0,1357.0,...,98.375000,250.837067,6.291923e+04,1.023000,218.000000,0.000000e+00,36.333333,80.577430,6.492722e+03,0
389997,443,13335.0,1,0.212495,0,141.179792,6.294736e+04,30,13376.0,1250.0,...,5.437500,10.210098,1.042461e+02,0.255000,109.000000,0.000000e+00,9.615385,28.925736,8.366982e+02,0
389998,443,9821.0,1,0.451929,1,66.382081,4.238053e+04,30,19153.0,1232.0,...,0.086768,0.132854,1.765016e-02,0.445262,0.445262,4.340000e-07,0.079235,0.089501,8.010425e-03,0


In [6]:
combined_df.dropna(inplace=True)
combined_df

Unnamed: 0,dst_port,dst_asn,quic_ver,dur,ratio,flow_pkt_rate,flow_byte_rate,total_pkts,total_bytes,max_bytes,...,ave_fwd_iat,std_fwd_iat,var_fwd_iat,rev_dur,max_rev_iat,min_rev_iat,ave_rev_iat,std_rev_iat,var_rev_iat,label
0,443,13335.0,1,0.257149,1,77.775920,3.655857e+04,20,9401.0,1250.0,...,11.625000,14.576844,2.124844e+02,0.255000,155.000000,0.000000e+00,14.909091,44.313869,1.963719e+03,0
1,443,9821.0,1,0.001191,1,25191.476213,3.003328e+07,30,35766.0,1232.0,...,0.001011,0.000000,0.000000e+00,0.001191,0.001191,9.499999e-08,0.000525,0.000545,2.972463e-07,0
2,443,15169.0,1,0.033477,0,268.841294,1.182006e+05,9,3957.0,1250.0,...,1.750000,1.785357,3.187500e+00,0.015000,20.000000,0.000000e+00,6.500000,8.170067,6.675000e+01,0
3,443,9821.0,1,0.003283,1,9137.062331,1.052711e+07,30,34564.0,1232.0,...,0.001492,0.000611,3.737128e-07,0.003283,0.003283,9.000001e-08,0.001711,0.001073,1.151368e-06,0
4,443,9821.0,1,0.002773,1,10818.346618,1.246418e+07,30,34564.0,1232.0,...,0.001698,0.000903,8.149512e-07,0.002773,0.002773,7.500000e-08,0.001202,0.000964,9.283899e-07,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389995,443,15169.0,1,0.148511,0,127.936651,6.647319e+04,19,9872.0,1250.0,...,13.333333,14.772347,2.182222e+02,0.063000,6.000000,0.000000e+00,3.111111,2.514157,6.320988e+00,0
389996,443,13335.0,1,1.223215,1,17.167873,9.242038e+03,21,11305.0,1357.0,...,98.375000,250.837067,6.291923e+04,1.023000,218.000000,0.000000e+00,36.333333,80.577430,6.492722e+03,0
389997,443,13335.0,1,0.212495,0,141.179792,6.294736e+04,30,13376.0,1250.0,...,5.437500,10.210098,1.042461e+02,0.255000,109.000000,0.000000e+00,9.615385,28.925736,8.366982e+02,0
389998,443,9821.0,1,0.451929,1,66.382081,4.238053e+04,30,19153.0,1232.0,...,0.086768,0.132854,1.765016e-02,0.445262,0.445262,4.340000e-07,0.079235,0.089501,8.010425e-03,0


In [7]:
combined_df.to_csv('merged_cesnet.csv', index=False)

In [8]:
import matplotlib.pyplot as plt

from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.data import DataStream
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Create a data stream
stream = DataStream(combined_df, target_idx=-1)

# Initialize the ARF classifier
clf = AdaptiveRandomForestClassifier(random_state=42)

# Store metrics over time
n_samples = 0
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
sample_counts = []

# Online learning loop
while n_samples < 250000:
    X_batch, y_batch = stream.next_sample()
    
    if n_samples > 0:
        # Predict and update metrics
        y_pred = clf.predict(X_batch)
        accuracy = accuracy_score(y_batch, y_pred)
        precision = precision_score(y_batch, y_pred, zero_division=0)
        recall = recall_score(y_batch, y_pred, zero_division=0)
        f1 = f1_score(y_batch, y_pred, zero_division=0)
        
        # Store metrics
        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
        sample_counts.append(n_samples)
    
    # Train the classifier
    clf.partial_fit(X_batch, y_batch, classes=[0, 1])
    n_samples += 1
    
    # Stop after processing all samples
    if n_samples == len(combined_df):
        break

# Plot the results
plt.figure(figsize=(12, 8))
plt.plot(sample_counts, accuracy_list, label='Accuracy', color='blue')
plt.plot(sample_counts, precision_list, label='Precision', color='green')
plt.plot(sample_counts, recall_list, label='Recall', color='orange')
plt.plot(sample_counts, f1_list, label='F1 Score', color='red')
plt.xlabel('Number of Samples')
plt.ylabel('Score')
plt.title('Online Learning Performance of ARF')
plt.legend()
plt.grid()
plt.show()