## data selection 


In [None]:
import pandas as pd
# Build a balanced two-class subset (BENIGN vs. SYN) of the original dataset,
# restricted to a fixed list of columns, and save it as a CSV file.

# Columns to keep (the last one, "Label", is the class column filtered on below)
selected_columns = [
    "SYN Flag Count", "Total Fwd Packets", "Total Backward Packets", "Flow Duration", "Flow Packets/s", "Flow Bytes/s", 
    "Fwd Packet Length Mean", "Bwd Packet Length Mean", "Bwd IAT Mean", "ACK Flag Count", "Active Mean", "Inbound",
    "Label"
]

# Load the original dataset, parsing only the columns that are kept instead of
# reading every column and discarding the rest afterwards.
df = pd.read_csv('../CIC-DDoS2019.csv', usecols=selected_columns)

# read_csv ignores the order of `usecols`, so re-select to get the order above
df = df[selected_columns]

# Filter by the label names BENIGN and SYN
label_BENIGN = df[df['Label'] == 'BENIGN']
label_SYN = df[df['Label'] == 'SYN']

# Number of rows drawn from each class
samples_per_label = 5000

# Fail early with a message that names the label if a class is too small
# (or empty, e.g. because the label is spelled differently in the file);
# sampling without replacement cannot draw more rows than exist.
for label_name, label_rows in (('BENIGN', label_BENIGN), ('SYN', label_SYN)):
    if len(label_rows) < samples_per_label:
        raise ValueError(
            f"Label '{label_name}' has only {len(label_rows)} rows; "
            f"{samples_per_label} are required for sampling"
        )

# Randomly sample the same number of rows from each class (fixed seed for reproducibility)
sample_BENIGN = label_BENIGN.sample(n=samples_per_label, random_state=42)
sample_SYN = label_SYN.sample(n=samples_per_label, random_state=42)

# Combine and shuffle, then renumber the rows from 0
balanced_df = pd.concat([sample_BENIGN, sample_SYN])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the filtered and balanced dataset
balanced_df.to_csv('balanced_filtered_dataset.csv', index=False)

print("Filtered and balanced dataset saved as 'balanced_filtered_dataset.csv'")


## pre-processing

The only pre-processing done in Python was to randomly split the dataset into 70%, 20% and 10% subsets for training, testing and evaluation, respectively.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Split the dataset into train / test / eval CSV files (70% / 20% / 10%).

# Load the dataset
# The path is built with os.path.join rather than a "Data\..." literal: "\S" is
# an invalid escape sequence (a warning today, an error in future Python
# versions), and a hard-coded backslash only resolves on Windows.
# NOTE(review): the data-selection step writes 'balanced_filtered_dataset.csv';
# presumably that file was moved/renamed to this path - confirm.
file_path = os.path.join("Data", "SYN.csv")
df = pd.read_csv(file_path)

# First, split into train (70%) and temp (30%)
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, shuffle=True)

# Then, split temp into test (20%) and eval (10%):
# one third of the 30% hold-out is 10% of the full dataset
test_df, eval_df = train_test_split(temp_df, test_size=1/3, random_state=42, shuffle=True)

# Create output directory (no error if it already exists)
output_dir = os.path.join("Data", "Splits")
os.makedirs(output_dir, exist_ok=True)

# Save each split to CSV
train_df.to_csv(os.path.join(output_dir, "train.csv"), index=False)
test_df.to_csv(os.path.join(output_dir, "test.csv"), index=False)
eval_df.to_csv(os.path.join(output_dir, "eval.csv"), index=False)

print("Dataset split and saved successfully:")
print(f"Training set: {len(train_df)} samples")
print(f"Testing set: {len(test_df)} samples")
print(f"Evaluation set: {len(eval_df)} samples")
