<a href="https://colab.research.google.com/github/Fidelisaboke/robust-nids/blob/feat%2Fbaseline-model/notebooks/03_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing the TII-SSRC-23 Dataset

## Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import required libraries

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib

## Constants and configuration

In [3]:
# Sample CSV
SAMPLE_CSV_PATH = "/content/drive/MyDrive/Datasets/TII-SSRC-23/sampled_200k.csv"

# Clean CSV
CLEAN_CSV_PATH = "/content/drive/MyDrive/Datasets/TII-SSRC-23/cleaned_data.csv"

## Loading the CSV dataset

In [6]:
sample_df = pd.read_csv(SAMPLE_CSV_PATH)
sample_df.head()

  sample_df = pd.read_csv(SAMPLE_CSV_PATH)


Unnamed: 0,Flow Duration,Flow Bytes/s,Flow Packets/s,Fwd Packet Length Max,Fwd Packet Length Min,Packet Length Std,Bwd Init Win Bytes,FWD Init Win Bytes,Fwd Seg Size Min,Idle Std,Idle Mean,SYN Flag Count,ACK Flag Count,Total Fwd Packet,Total Bwd packets,Label,Traffic Type,Traffic Subtype,_subtype,_cluster
0,52601173.0,5522.709541,66.424374,1318.0,0.0,93.64393,2068.0,65280.0,32.0,0.0,0.0,2.0,3493.0,1701.0,1793.0,Benign,Audio,Audio,Audio,_no_cluster
1,119106942.0,24.599742,0.780811,41.0,0.0,27.063135,502.0,502.0,32.0,1737.400069,5001511.0,0.0,93.0,36.0,57.0,Benign,Audio,Audio,Audio,_no_cluster
2,5589.0,22186.437645,357.845768,124.0,124.0,71.591433,502.0,501.0,32.0,0.0,0.0,0.0,2.0,1.0,1.0,Benign,Audio,Audio,Audio,_no_cluster
3,118166562.0,5969.1421,68.784264,1348.0,0.0,133.693257,4708.0,65280.0,32.0,0.0,0.0,2.0,8127.0,3932.0,4196.0,Benign,Audio,Audio,Audio,_no_cluster
4,119988385.0,4459.064934,56.838835,9.0,9.0,17.228724,0.0,0.0,8.0,0.0,0.0,0.0,0.0,25.0,6795.0,Benign,Audio,Audio,Audio,_no_cluster


## Data Cleaning

### Drop duplicates, if any

In [7]:
# Drop duplicates
sample_df.drop_duplicates(inplace=True)

## Drop irrelevant columns

In [8]:
# Columns to drop
cols_to_drop = ['_subtype', '_cluster']
sample_df.drop(columns=cols_to_drop, inplace=True)
print(sample_df)

        Flow Duration   Flow Bytes/s  Flow Packets/s  Fwd Packet Length Max  \
0          52601173.0    5522.709541       66.424374                 1318.0   
1         119106942.0      24.599742        0.780811                   41.0   
2              5589.0   22186.437645      357.845768                  124.0   
3         118166562.0    5969.142100       68.784264                 1348.0   
4         119988385.0    4459.064934       56.838835                    9.0   
...               ...            ...             ...                    ...   
198864      5005605.0      31.165064        0.799104                   39.0   
198865      5004653.0      31.170992        0.799256                   39.0   
198866      5007300.0      31.154514        0.798834                   39.0   
198867          339.0  230088.495575     5899.705015                   39.0   
198868          306.0  254901.960784     6535.947712                   39.0   

        Fwd Packet Length Min  Packet Length Std  B

### Check for missing values

In [9]:
# Check for missing values
sample_df.isna().sum()

Unnamed: 0,0
Flow Duration,0
Flow Bytes/s,0
Flow Packets/s,0
Fwd Packet Length Max,0
Fwd Packet Length Min,0
Packet Length Std,0
Bwd Init Win Bytes,0
FWD Init Win Bytes,0
Fwd Seg Size Min,0
Idle Std,0


## Feature Encoding

Encoding strategies:
| Column Name       | Encoding                                    |
|-------------------|---------------------------------------------|
| `Label`           | Binary Encoding (Benign = 0, Malicious = 1) |
| `Traffic Subtype` | Label Encoding (Will be done in training)   |


### Binary Encoding the Label column

In [10]:
sample_df['Label'] = sample_df['Label'].map({'Benign': 0, 'Malicious': 1})

### Confirming Binary Encoding

In [11]:
is_benign = sample_df['Label'] == 0
is_malicious = sample_df['Label'] == 1

In [12]:
# Benign
sample_df[is_benign].head()

Unnamed: 0,Flow Duration,Flow Bytes/s,Flow Packets/s,Fwd Packet Length Max,Fwd Packet Length Min,Packet Length Std,Bwd Init Win Bytes,FWD Init Win Bytes,Fwd Seg Size Min,Idle Std,Idle Mean,SYN Flag Count,ACK Flag Count,Total Fwd Packet,Total Bwd packets,Label,Traffic Type,Traffic Subtype
0,52601173.0,5522.709541,66.424374,1318.0,0.0,93.64393,2068.0,65280.0,32.0,0.0,0.0,2.0,3493.0,1701.0,1793.0,0,Audio,Audio
1,119106942.0,24.599742,0.780811,41.0,0.0,27.063135,502.0,502.0,32.0,1737.400069,5001511.0,0.0,93.0,36.0,57.0,0,Audio,Audio
2,5589.0,22186.437645,357.845768,124.0,124.0,71.591433,502.0,501.0,32.0,0.0,0.0,0.0,2.0,1.0,1.0,0,Audio,Audio
3,118166562.0,5969.1421,68.784264,1348.0,0.0,133.693257,4708.0,65280.0,32.0,0.0,0.0,2.0,8127.0,3932.0,4196.0,0,Audio,Audio
4,119988385.0,4459.064934,56.838835,9.0,9.0,17.228724,0.0,0.0,8.0,0.0,0.0,0.0,0.0,25.0,6795.0,0,Audio,Audio


In [13]:
# Malicious
sample_df[is_malicious].head()

Unnamed: 0,Flow Duration,Flow Bytes/s,Flow Packets/s,Fwd Packet Length Max,Fwd Packet Length Min,Packet Length Std,Bwd Init Win Bytes,FWD Init Win Bytes,Fwd Seg Size Min,Idle Std,Idle Mean,SYN Flag Count,ACK Flag Count,Total Fwd Packet,Total Bwd packets,Label,Traffic Type,Traffic Subtype
1301,3324822.0,300.767981,0.601536,500.0,500.0,0.0,0.0,512.0,20.0,0.0,0.0,0.0,0.0,2.0,0.0,1,DoS,DoS RST
1302,3047129.0,328.17777,0.656356,500.0,500.0,0.0,0.0,512.0,20.0,0.0,0.0,0.0,0.0,2.0,0.0,1,DoS,DoS RST
1303,3030602.0,329.967445,0.659935,500.0,500.0,0.0,0.0,512.0,20.0,0.0,0.0,0.0,0.0,2.0,0.0,1,DoS,DoS RST
1304,3057113.0,327.105998,0.654212,500.0,500.0,0.0,0.0,512.0,20.0,0.0,0.0,0.0,0.0,2.0,0.0,1,DoS,DoS RST
1305,3052675.0,327.581547,0.655163,500.0,500.0,0.0,0.0,512.0,20.0,0.0,0.0,0.0,0.0,2.0,0.0,1,DoS,DoS RST


## Final Dataframe

In [14]:
final_df = sample_df

## Save cleaned Dataframe

In [15]:
try:
    final_df.to_csv(CLEAN_CSV_PATH, index=False)
    print(f"Saved cleaned data to: {CLEAN_CSV_PATH}")
except Exception as e:
    print(f"Error saving cleaned data: {e}")

Saved cleaned data to: /content/drive/MyDrive/Datasets/TII-SSRC-23/cleaned_data.csv
