In [1]:
import pandas as pd
import numpy as np
import os
import glob
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [2]:
# Folder that holds every CSV file to be loaded.
data_folder = "./data/"  # Change this to your actual path


def _fixed_three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Pandas display options: show every column (no truncation) and print
# floating-point values through the three-decimal formatter above.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', _fixed_three_decimals)

In [3]:
# Read and merge all CSV files found directly inside `data_folder`.
#
# glob returns paths in arbitrary, filesystem-dependent order; sorting the list
# keeps the row order of the merged frame identical from run to run.
all_files = sorted(glob.glob(os.path.join(data_folder, "*.csv")))
df_list = []
for file in all_files:
    try:
        temp_df = pd.read_csv(file, encoding='latin1', low_memory=False)
        if len(temp_df) == 0:
            print(f"Skipping empty file: {file}")
            continue

        # Normalise headers (trim whitespace, lowercase) so that columns whose
        # names differ only in padding or case line up when concatenated.
        temp_df.columns = temp_df.columns.str.strip().str.lower()

        # Make sure the target column is called exactly 'label'. When such a
        # column already exists nothing is renamed; renaming another
        # "...label..." column onto it would create duplicate column names.
        if 'label' not in temp_df.columns:
            label_col = [col for col in temp_df.columns if 'label' in col]
            if label_col:
                temp_df = temp_df.rename(columns={label_col[0]: 'label'})

        df_list.append(temp_df)
        print(f"Loaded: {file} ({temp_df.shape[0]} rows)")
    except Exception as e:
        # Best-effort load: report the failing file and carry on with the rest.
        print(f"Error reading {file}: {e}")

# Fail loudly when nothing could be loaded instead of concatenating an empty list.
if not df_list:
    raise ValueError("No valid data found in CSV files.")

# Stack all per-file frames into one frame with a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)

Loaded: ./data\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv (225745 rows)
Loaded: ./data\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv (286467 rows)
Loaded: ./data\Friday-WorkingHours-Morning.pcap_ISCX.csv (191033 rows)
Loaded: ./data\Monday-WorkingHours.pcap_ISCX.csv (529918 rows)
Loaded: ./data\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv (288602 rows)
Loaded: ./data\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv (170366 rows)
Loaded: ./data\Tuesday-WorkingHours.pcap_ISCX.csv (445909 rows)
Loaded: ./data\Wednesday-workingHours.pcap_ISCX.csv (692703 rows)


In [4]:
# Enable GPU Acceleration
# Ask TensorFlow for the visible GPUs and switch each one to on-demand memory
# growth instead of the default allocation behaviour.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    try:
        for device in gpu_devices:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as e:
        # Memory growth can only be configured before the GPUs are initialised;
        # re-running this cell in a live kernel raises RuntimeError. Report it
        # rather than aborting the notebook.
        print(f"⚠️ Could not enable GPU memory growth: {e}")
    else:
        print("✅ GPU is enabled for TensorFlow.")
else:
    print("⚠️ No GPU found. Running on CPU.")

✅ GPU is enabled for TensorFlow.


In [5]:
# Exploratory Data Analysis (EDA)

# Preview: the leading rows of the merged frame.
print(df.head())

# Descriptive statistics as computed by DataFrame.describe().
print(df.describe())

# Count null entries per column and report only the columns that have any.
missing_values = df.isna().sum()
print(f"Missing Values per Column:\n{missing_values.loc[missing_values.gt(0)]}")


   destination port  flow duration  total fwd packets  total backward packets  \
0             54865              3                  2                       0   
1             55054            109                  1                       1   
2             55055             52                  1                       1   
3             46236             34                  1                       1   
4             54863              3                  2                       0   

   total length of fwd packets  total length of bwd packets  \
0                           12                            0   
1                            6                            6   
2                            6                            6   
3                            6                            6   
4                           12                            0   

   fwd packet length max  fwd packet length min  fwd packet length mean  \
0                      6                      6            