# 📝 1️⃣ Install Dependencies (Run this once)

In [2]:
!pip install pandas numpy matplotlib seaborn scikit-learn imbalanced-learn pyarrow


Defaulting to user installation because normal site-packages is not writeable


# 📌 2️⃣ Import Required Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer
from imblearn.over_sampling import SMOTE
import os
from sklearn.preprocessing import LabelEncoder

# 📂 3️⃣ Load & Inspect Dataset

In [3]:

# Build the raw and processed dataset directories under the current working
# directory, so the notebook does not depend on a hard-coded location.
BASE_DIR = os.getcwd()
DATA_DIR, PROCESSED_DATA_DIR = (
    os.path.join(BASE_DIR, "CICIDDOS", leaf) for leaf in ("data", "processed")
)


In [4]:
# List all Parquet files in DATA_DIR.
# os.listdir returns entries in arbitrary order, so sort the names to keep the
# load/concatenation order (and therefore the row order) reproducible.
parquet_files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith(".parquet"))
print(f" Found {len(parquet_files)} Parquet files in {DATA_DIR}")

 Found 17 Parquet files in c:\Users\allaeddine Bouchahma\Desktop\D\Dev\DDoSViT\CICIDDOS\data


In [6]:

# Split the Parquet file names into training and testing groups based on the
# keyword that appears in each name.
train_files = list(filter(lambda name: "training" in name, parquet_files))
test_files = list(filter(lambda name: "testing" in name, parquet_files))

print(f" Found {len(train_files)} Training Files")
print(f" Found {len(test_files)} Testing Files")


 Found 7 Training Files
 Found 10 Testing Files


In [7]:

def load_parquet_frames(file_names, split_name, announce=True):
    """Read every Parquet file in ``file_names`` from DATA_DIR.

    Parameters
    ----------
    file_names : list of str
        File names (not full paths) located inside DATA_DIR.
    split_name : str
        Name of the split ("Training" / "Testing"), used in messages.
    announce : bool, default True
        When True, print one progress line per file before reading it.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, in the same order as ``file_names``.

    Raises
    ------
    ValueError
        If ``file_names`` is empty. pd.concat would raise the same exception
        type later, but with a message that does not say which split is
        missing or where the files were expected.
    """
    if not file_names:
        raise ValueError(f"No {split_name} Parquet files found in {DATA_DIR}")

    frames = []
    for file_name in file_names:
        if announce:
            print(f" Loading {split_name} Data: {file_name}")
        frames.append(pd.read_parquet(os.path.join(DATA_DIR, file_name)))
    return frames


# Load the per-file frames; only the testing files are listed as they load.
train_list = load_parquet_frames(train_files, "Training", announce=False)
test_list = load_parquet_frames(test_files, "Testing")

# Merge all training & testing data into one frame per split
data_training = pd.concat(train_list, ignore_index=True)
data_testing = pd.concat(test_list, ignore_index=True)

# Preview the first rows of each merged frame
print("\n Training Data Head:")
display(data_training.head(10))

print("\n Testing Data Head:")
display(data_testing.head(10))


 Loading Testing Data: DNS-testing.parquet
 Loading Testing Data: LDAP-testing.parquet
 Loading Testing Data: MSSQL-testing.parquet
 Loading Testing Data: NetBIOS-testing.parquet
 Loading Testing Data: NTP-testing.parquet
 Loading Testing Data: SNMP-testing.parquet
 Loading Testing Data: Syn-testing.parquet
 Loading Testing Data: TFTP-testing.parquet
 Loading Testing Data: UDP-testing.parquet
 Loading Testing Data: UDPLag-testing.parquet

 Training Data Head:


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,17,49,2,0,458.0,0.0,229.0,229.0,229.0,0.0,...,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NetBIOS
1,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,1480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,LDAP
2,17,1,2,0,458.0,0.0,229.0,229.0,229.0,0.0,...,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NetBIOS
3,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,LDAP
4,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,LDAP
5,17,2,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,1396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,LDAP
6,17,49,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,1480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,LDAP
7,17,1,2,0,2848.0,0.0,1424.0,1424.0,1424.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,LDAP
8,17,1,2,0,458.0,0.0,229.0,229.0,229.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NetBIOS
9,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,LDAP



 Testing Data Head:


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,17,48,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
1,17,2,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
2,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
3,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,1480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
4,17,1,2,0,2896.0,0.0,1448.0,1448.0,1448.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
5,17,1,2,0,2736.0,0.0,1368.0,1368.0,1368.0,0.0,...,1472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
6,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
7,17,232,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
8,17,1,2,0,2896.0,0.0,1448.0,1448.0,1448.0,0.0,...,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
9,17,11,2,0,2896.0,0.0,1448.0,1448.0,1448.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS


In [None]:
# Show how many training rows carry each value of the "Label" column
label_counts = data_training["Label"].value_counts()
print("🔍 Unique Values in Label Column:")
print(label_counts)



🔍 Unique Values in Label Column:
Label
Syn        48840
Benign     46427
UDP        18090
MSSQL       8523
LDAP        1906
Portmap      685
NetBIOS      644
UDPLag        55
Name: count, dtype: int64

🔍 Attack Types in Benign Traffic:


KeyError: 'attack_type'