In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn import preprocessing
import random
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier


In [2]:
multiplier = 5
samples = 5000  # Variable to control the number of rows to read

def loadingfile(path):
    # Count total lines in the file (excluding header)
    nlines = sum(1 for _ in open(path)) - 1  

    # Ensure at most `samples` rows are read randomly
    if nlines > samples:
        skip_rows = sorted(random.sample(range(1, nlines + 1), nlines - samples))
    else:
        skip_rows = None  # Read entire file if it's smaller than `samples`

    data = pd.read_csv(path, skiprows=skip_rows, sep=',', low_memory=False)

    is_benign = data[' Label'] == 'BENIGN'
    flows_ok = data[is_benign]
    flows_ddos_full = data[~is_benign]
    
    sizeDownSample = len(flows_ok) * multiplier  # Target size for anomalous data
    
    # Downsample majority class
    if sizeDownSample < len(flows_ddos_full): 
        flows_ddos_reduced = resample(
            flows_ddos_full,
            replace=False,
            n_samples=sizeDownSample,
            random_state=27
        )
    else:
        flows_ddos_reduced = flows_ddos_full

    final_df = pd.concat([flows_ok, flows_ddos_reduced])

    return final_df

In [3]:
def loadinghugefile(path):
    lines = sum(1 for _ in open(path)) - 1  

    if lines > samples:
        skip_rows = sorted(random.sample(range(1, lines + 1), lines - samples))
    else:
        skip_rows = None  

    df_chunk = pd.read_csv(path, skiprows=skip_rows, chunksize=500000, low_memory=False)
    
    chunk_list_ok = []  
    chunk_list_ddos = [] 

    for chunk in df_chunk:  
        is_benign = chunk[' Label'] == 'BENIGN'
        flows_ok = chunk[is_benign]
        flows_ddos_full = chunk[~is_benign]
        
        if (len(flows_ok) * multiplier) < len(flows_ddos_full): 
            sizeDownSample = len(flows_ok) * multiplier  
            
            flows_ddos_reduced = resample(
                flows_ddos_full,
                replace=False,
                n_samples=sizeDownSample,
                random_state=27
            )
        else:
            flows_ddos_reduced = flows_ddos_full
            
        chunk_list_ok.append(flows_ok)
        chunk_list_ddos.append(flows_ddos_reduced)
        
    flows_ok = pd.concat(chunk_list_ok)
    flows_ddos = pd.concat(chunk_list_ddos)

    final_df = pd.concat([flows_ok, flows_ddos])

    return final_df

In [4]:
# Load first file
flows = loadinghugefile('/kaggle/input/cic-ddos2019-30gb-full-dataset-csv-files/01-12/TFTP.csv')
print('file 1 loaded')

# List of remaining files
files = [
    "DrDoS_LDAP.csv", "DrDoS_MSSQL.csv", "DrDoS_NetBIOS.csv",
    "DrDoS_NTP.csv", "DrDoS_SNMP.csv", "DrDoS_SSDP.csv",
    "DrDoS_UDP.csv", "Syn.csv", "DrDoS_DNS.csv", "UDPLag.csv"
]

# Process each file
for i, file in enumerate(files, start=2):
    df = loadingfile(f'/kaggle/input/cic-ddos2019-30gb-full-dataset-csv-files/01-12/{file}')
    
    # Concatenate new file data
    flows = pd.concat([flows, df], ignore_index=True)
    
    print(f'file {i} loaded')

# Save to CSV
flows.to_csv('/kaggle/working/export_dataframe.csv', index=False, header=True)

# Delete large variable
del flows

file 1 loaded
file 2 loaded
file 3 loaded
file 4 loaded
file 5 loaded
file 6 loaded
file 7 loaded
file 8 loaded
file 9 loaded
file 10 loaded
file 11 loaded


In [5]:
# Load dataset
samples = pd.read_csv('/kaggle/working/export_dataframe.csv', sep=',')

In [6]:
samples.shape

(774, 88)

In [7]:
samples.head()

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,1,192.168.50.7-23.194.142.15-52420-443-6,192.168.50.7,52420,23.194.142.15,443,6,2018-12-01 13:40:52.680613,20842,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN
1,91001,192.168.50.6-72.21.91.29-58254-80-6,192.168.50.6,58254,72.21.91.29,80,6,2018-12-01 13:46:26.164492,114878,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ocsp.digicert.com/,0,BENIGN
2,19803,192.168.50.8-8.8.8.8-62077-53-17,192.168.50.8,62077,8.8.8.8,53,17,2018-12-01 14:42:03.700254,54185,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN
3,21948,192.168.50.8-4.2.2.4-56697-53-17,192.168.50.8,56697,4.2.2.4,53,17,2018-12-01 16:01:40.682107,20860,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN
4,17584,192.168.50.6-8.8.8.8-56988-53-17,192.168.50.6,56988,8.8.8.8,53,17,2018-12-01 16:35:39.201263,45253,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN


In [8]:
samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 774 entries, 0 to 773
Data columns (total 88 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    774 non-null    int64  
 1   Flow ID                       774 non-null    object 
 2    Source IP                    774 non-null    object 
 3    Source Port                  774 non-null    int64  
 4    Destination IP               774 non-null    object 
 5    Destination Port             774 non-null    int64  
 6    Protocol                     774 non-null    int64  
 7    Timestamp                    774 non-null    object 
 8    Flow Duration                774 non-null    int64  
 9    Total Fwd Packets            774 non-null    int64  
 10   Total Backward Packets       774 non-null    int64  
 11  Total Length of Fwd Packets   774 non-null    float64
 12   Total Length of Bwd Packets  774 non-null    float64
 13   Fwd 

In [9]:
samples.describe()

Unnamed: 0.1,Unnamed: 0,Source Port,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Inbound
count,774.0,774.0,774.0,774.0,774.0,774.0,774.0,774.0,774.0,774.0,...,774.0,774.0,774.0,774.0,774.0,774.0,774.0,774.0,774.0,774.0
mean,84212.295866,25661.179587,28763.151163,12.72093,3079923.0,21.635659,0.872093,8941.365633,308.102067,319.087855,...,-23341340.0,5252.26,5001.91,9310.304,1818.073643,1362429.0,210368.3,1554928.0,1204542.0,0.860465
std,114451.445618,25097.503231,21136.585518,5.390774,12603020.0,43.725405,4.815125,19371.530566,4661.076112,385.152862,...,155858500.0,72289.02,89224.9,132303.1,34533.250362,5876098.0,1374674.0,6654613.0,5485428.0,0.346728
min,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1062719000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3587.75,773.0,8100.5,6.0,1.0,2.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,17362.5,21859.0,28715.0,17.0,251.5,2.0,0.0,860.0,0.0,384.5,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,142996.5,51871.5,48380.75,17.0,20671.0,14.0,0.0,5014.5,0.0,440.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,711196.0,65498.0,65528.0,17.0,116772200.0,200.0,103.0,88000.0,122666.0,2920.0,...,1480.0,1632323.0,2308452.0,3264645.0,943533.0,58227630.0,25348650.0,66019170.0,58031990.0,1.0


In [10]:
samples.columns = samples.columns.str.strip()
samples["Label"].value_counts()

Label
DrDoS_NTP        255
UDP-lag          250
BENIGN           129
TFTP              35
DrDoS_LDAP        20
DrDoS_UDP         20
DrDoS_SNMP        15
DrDoS_MSSQL       10
DrDoS_NetBIOS     10
DrDoS_SSDP        10
Syn               10
DrDoS_DNS         10
Name: count, dtype: int64

In [11]:
samples.columns

Index(['Unnamed: 0', 'Flow ID', 'Source IP', 'Source Port', 'Destination IP',
       'Destination Port', 'Protocol', 'Timestamp', 'Flow Duration',
       'Total Fwd Packets', 'Total Backward Packets',
       'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Le

In [12]:
import pandas as pd
import numpy as np
import hashlib

# Load dataset
samples = pd.read_csv('/kaggle/working/export_dataframe.csv', sep=',')

# Function to convert string to numeric hash
def string2numeric_hash(text):
    return int(hashlib.md5(text.encode()).hexdigest()[:8], 16)

# Replace infinite values
samples = samples.replace(['Infinity', np.inf], 0)

# Convert numerical columns safely
samples[' Flow Packets/s'] = pd.to_numeric(samples[' Flow Packets/s'], errors='coerce').fillna(0)
samples['Flow Bytes/s'] = pd.to_numeric(samples['Flow Bytes/s'], errors='coerce').fillna(0)

# Convert labels to numeric
samples[' Label'] = samples[' Label'].replace({
    'BENIGN': 0, 'DrDoS_DNS': 1, 'DrDoS_LDAP': 1, 'DrDoS_MSSQL': 1,
    'DrDoS_NTP': 1, 'DrDoS_NetBIOS': 1, 'DrDoS_SNMP': 1, 'DrDoS_SSDP': 1,
    'DrDoS_UDP': 1, 'Syn': 1, 'TFTP': 1, 'UDP-lag': 1, 'WebDDoS': 1
}).astype(int).infer_objects(copy=False)

# Ensure no NaN timestamps before splitting
samples[' Timestamp'] = samples[' Timestamp'].fillna('1970-01-01 00:00:00.000000')

# Process timestamps
columnTime = samples[' Timestamp'].str.split(' ', n=1, expand=True)
columnTime.columns = ['date', 'hours']
columnTime = columnTime['hours'].str.split('.', n=1, expand=True)
columnTime.columns = ['hours', 'milisec']
samples[' Timestamp'] = columnTime['hours'].apply(string2numeric_hash)

# Drop unnecessary columns
samples.drop(columns=[' Source IP', ' Destination IP', 'Flow ID', 'SimillarHTTP', 'Unnamed: 0'], inplace=True)

# Save processed dataset
samples.to_csv('/kaggle/working/export_dataframe_proc.csv', index=False, header=True)

print('Training data processed successfully!')

Training data processed successfully!


  samples[' Label'] = samples[' Label'].replace({


In [13]:
df = pd.read_csv('/kaggle/input/lr-hr-ddos-2024-dataset-for-sdn-based-networks/LR-HR DDoS 2024 Dataset for SDN-Based Networks.csv')

In [14]:
df.shape

(113407, 25)

In [15]:
df.head()

Unnamed: 0,flow_duration,protocol,srcport,dstport,byte_count,packet_count,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Std,...,Fwd Header Len,Bwd Header Len,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,Pkt Size Avg,Init Fwd Win Byts,Init Bwd Win Byts,Label
0,274,1,0,0,26656,272,2,31,46,21.92031,...,40,40,541.8,0,0,0,30.75,-1,64240,0
1,274,1,0,0,26656,272,2,31,46,21.92031,...,40,40,541.8,0,0,0,30.75,-1,64240,0
2,29,1,0,0,2842,29,1,0,0,0.0,...,40,20,0.0,0,0,0,0.0,-1,64240,0
3,29,1,0,0,2842,29,1,0,0,0.0,...,40,20,0.0,0,0,0,0.0,-1,64240,0
4,218,6,46004,80,246238678,29823,8,2556,5188,700.40217,...,100,160,441000.7473,0,0,0,693.692308,-1,64240,0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113407 entries, 0 to 113406
Data columns (total 25 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   flow_duration      113407 non-null  int64  
 1   protocol           113407 non-null  int64  
 2   srcport            113407 non-null  int64  
 3   dstport            113407 non-null  int64  
 4   byte_count         113407 non-null  int64  
 5   packet_count       113407 non-null  int64  
 6   Tot Bwd Pkts       113407 non-null  int64  
 7   TotLen Fwd Pkts    113407 non-null  int64  
 8   TotLen Bwd Pkts    113407 non-null  int64  
 9   Fwd Pkt Len Std    113407 non-null  float64
 10  Flow Pkts/s        113407 non-null  float64
 11  Fwd PSH Flags      113407 non-null  int64  
 12  Bwd PSH Flags      113407 non-null  int64  
 13  Fwd URG Flags      113407 non-null  int64  
 14  Bwd URG Flags      113407 non-null  int64  
 15  Fwd Header Len     113407 non-null  int64  
 16  Bw

In [17]:
df.describe()

Unnamed: 0,flow_duration,protocol,srcport,dstport,byte_count,packet_count,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Std,...,Fwd Header Len,Bwd Header Len,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,Pkt Size Avg,Init Fwd Win Byts,Init Bwd Win Byts,Label
count,113407.0,113407.0,113407.0,113407.0,113407.0,113407.0,113407.0,113407.0,113407.0,113407.0,...,113407.0,113407.0,113407.0,113407.0,113407.0,113407.0,113407.0,113407.0,113407.0,113407.0
mean,5675157.0,4.205825,13404.874496,3942.364598,314952.2,138.744257,8.995829,1607.594,17103.89,54.413657,...,96.797658,152.417223,419236.9,0.010555,0.07332,0.000265,103.779678,-1.0,11279.70083,0.621725
std,22909230.0,6.446244,21656.132359,13247.903299,8064643.0,1847.267425,156.940094,121142.7,490540.1,424.427145,...,1842.412823,3271.544343,7155128.0,0.102194,0.260662,0.016262,336.6038,0.0,24353.898653,0.484959
min,-154.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0
25%,17.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0
50%,36.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,1.0
75%,3325.0,6.0,36313.0,53.0,33.0,1.0,3.0,33.0,178.0,0.0,...,8.0,24.0,9618.8,0.0,0.0,0.0,123.75,-1.0,-1.0,1.0
max,120000000.0,17.0,65513.0,60994.0,358199700.0,63196.0,34094.0,31600000.0,107000000.0,45423.83252,...,360448.0,681904.0,983000000.0,1.0,1.0,1.0,21413.0,-1.0,65535.0,1.0


In [18]:
df.isnull().sum()

flow_duration        0
protocol             0
srcport              0
dstport              0
byte_count           0
packet_count         0
Tot Bwd Pkts         0
TotLen Fwd Pkts      0
TotLen Bwd Pkts      0
Fwd Pkt Len Std      0
Flow Pkts/s          0
Fwd PSH Flags        0
Bwd PSH Flags        0
Fwd URG Flags        0
Bwd URG Flags        0
Fwd Header Len       0
Bwd Header Len       0
Pkt Len Var          0
FIN Flag Cnt         0
SYN Flag Cnt         0
RST Flag Cnt         0
Pkt Size Avg         0
Init Fwd Win Byts    0
Init Bwd Win Byts    0
Label                0
dtype: int64

In [19]:
print(df['Label'].value_counts())

Label
1    70508
0    42899
Name: count, dtype: int64


In [20]:
# This maps features from CIC-DDoS to equivalent features in LR-HR dataset
FEATURE_MAPPING = {
    # CIC column: LR-HR column
    ' Flow Duration': 'flow_duration',
    ' Protocol': 'protocol',
    ' Source Port': 'srcport',
    ' Destination Port': 'dstport', 
    ' Total Length of Fwd Packets': 'TotLen Fwd Pkts',
    ' Total Length of Bwd Packets': 'TotLen Bwd Pkts',
    ' Total Backward Packets': 'Tot Bwd Pkts',
    ' Packet Length Variance': 'Pkt Len Var',
    ' Fwd Header Length': 'Fwd Header Len',
    ' Bwd Header Length': 'Bwd Header Len',
    ' Average Packet Size': 'Pkt Size Avg',
    ' Fwd PSH Flags': 'Fwd PSH Flags',
    ' Bwd PSH Flags': 'Bwd PSH Flags',
    ' Fwd URG Flags': 'Fwd URG Flags',
    ' Bwd URG Flags': 'Bwd URG Flags',
    ' FIN Flag Count': 'FIN Flag Cnt',
    ' SYN Flag Count': 'SYN Flag Cnt',
    ' RST Flag Count': 'RST Flag Cnt',
    ' Flow Packets/s': 'Flow Pkts/s',
    ' Fwd Packet Length Std': 'Fwd Pkt Len Std',
    'Init_Win_bytes_forward': 'Init Fwd Win Byts',
    'Init_Win_bytes_backward': 'Init Bwd Win Byts'
}

In [21]:
COMMON_FEATURES = list(FEATURE_MAPPING.keys())

In [22]:
processed_df = df.copy()
    
# Rename columns to match CIC feature names
inverse_mapping = {v: k for k, v in FEATURE_MAPPING.items()}
renamed_df = processed_df.rename(columns=inverse_mapping)
    
# Only keep common columns and label
common_cols = [col for col in COMMON_FEATURES if col in renamed_df.columns]
renamed_df = renamed_df[common_cols + ['Label']]
    
# Rename Label column to match CIC format
renamed_df = renamed_df.rename(columns={'Label': ' Label'})

In [23]:
train_df = pd.read_csv('/kaggle/working/export_dataframe_proc.csv')

In [24]:
def extract_common_features(cic_df, lrhr_df):
    """
    Extract only the common features from both datasets
    """
    # Find features that exist in both dataframes
    cic_features = set(cic_df.columns)
    lrhr_features = set(lrhr_df.columns)
    
    # Use the mapping to find common columns
    common_cols = []
    for cic_col, lrhr_col in FEATURE_MAPPING.items():
        if cic_col in cic_features and lrhr_col in lrhr_features:
            common_cols.append(cic_col)
    
    # Extract only common features from CIC (plus label)
    cic_common = cic_df[common_cols + [' Label']]
    
    # Map LR-HR features to CIC names and extract
    inverse_mapping = {v: k for k, v in FEATURE_MAPPING.items() if k in common_cols}
    lrhr_common = lrhr_df[list(inverse_mapping.keys()) + ['Label']].rename(
        columns={**inverse_mapping, 'Label': ' Label'})
    
    return cic_common, lrhr_common

In [25]:
cic_common, lrhr_common = extract_common_features(train_df, df)
    
print(f"Common features CIC data shape: {cic_common.shape}")
print(f"Common features LR-HR data shape: {lrhr_common.shape}")
print(f"Common features: {cic_common.columns.tolist()[:-1]}")

Common features CIC data shape: (774, 19)
Common features LR-HR data shape: (113407, 19)
Common features: [' Flow Duration', ' Protocol', ' Source Port', ' Destination Port', ' Total Length of Bwd Packets', ' Total Backward Packets', ' Packet Length Variance', ' Fwd Header Length', ' Bwd Header Length', ' Average Packet Size', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' SYN Flag Count', ' RST Flag Count', ' Flow Packets/s', ' Fwd Packet Length Std', 'Init_Win_bytes_forward']


In [26]:
is_benign = cic_common[' Label']==0 

normal = cic_common[is_benign]
ddos = cic_common[~is_benign]

# upsample minority
normal_upsampled = resample(normal,
                          replace=True, # sample with replacement
                          n_samples=len(ddos), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([normal_upsampled, ddos])

In [27]:
# Split training data
X_train = upsampled.drop(' Label', axis=1)
y_train = upsampled[' Label']
    
X_test = lrhr_common.drop(' Label', axis=1)
y_test = lrhr_common[' Label']
print(f"X_train columns: {X_train.columns.tolist()}")
print(f"X_test columns: {X_test.columns.tolist()}")

X_train columns: [' Flow Duration', ' Protocol', ' Source Port', ' Destination Port', ' Total Length of Bwd Packets', ' Total Backward Packets', ' Packet Length Variance', ' Fwd Header Length', ' Bwd Header Length', ' Average Packet Size', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' SYN Flag Count', ' RST Flag Count', ' Flow Packets/s', ' Fwd Packet Length Std', 'Init_Win_bytes_forward']
X_test columns: [' Flow Duration', ' Protocol', ' Source Port', ' Destination Port', ' Total Length of Bwd Packets', ' Total Backward Packets', ' Packet Length Variance', ' Fwd Header Length', ' Bwd Header Length', ' Average Packet Size', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' SYN Flag Count', ' RST Flag Count', ' Flow Packets/s', ' Fwd Packet Length Std', 'Init_Win_bytes_forward']


In [28]:
def train_evaluate_model(X_train, y_train, X_test, y_test, model_name="Random Forest"):
    """
    Train and evaluate a model on the provided data
    """
    if model_name == "Random Forest":
        model = RandomForestClassifier(n_estimators=100, random_state=42)
    elif model_name == "Gradient Boosting":
        model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
    elif model_name == "NB":
        model = GaussianNB()
    elif model_name == "SVM":
        model = SVC(probability=True, random_state=42)
    elif model_name == "KNN":
        model = KNeighborsClassifier(n_neighbors=5)
    elif model_name == "MLP":
        model = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', max_iter=300, random_state=42)
    else:
        raise ValueError(f"Unsupported model: {model_name}")
    
    # Train the model
    print(f"Training {model_name} model...")
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    print("Classes learned by model:", model.classes_)
    print(np.unique(y_pred, return_counts=True))
    
    try:
        # ROC AUC might fail if only one class is present
        roc_auc = roc_auc_score(y_test, y_prob)
    except:
        roc_auc = float('nan')
    
    # Print evaluation results
    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    
    return {
        'model': model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    }


In [29]:
from sklearn.preprocessing import StandardScaler
# Step 5: Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
    
# Step 6: Train and evaluate models
results = {}
print(np.unique(y_test, return_counts=True))

for model_name in ["Random Forest", "Gradient Boosting","NB","SVM","KNN","MLP"]:
    results[model_name] = train_evaluate_model(X_train_scaled, y_train, X_test_scaled, y_test, model_name)

(array([0, 1]), array([42899, 70508]))
Training Random Forest model...
Classes learned by model: [0 1]
(array([0, 1]), array([107550,   5857]))

Random Forest Performance:
Accuracy: 0.3266
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC AUC: 0.6437
Training Gradient Boosting model...
Classes learned by model: [0 1]
(array([0, 1]), array([107884,   5523]))

Gradient Boosting Performance:
Accuracy: 0.3296
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC AUC: 0.8381
Training NB model...
Classes learned by model: [0 1]
(array([0, 1]), array([17433, 95974]))

NB Performance:
Accuracy: 0.7754
Precision: 0.7346
Recall: 1.0000
F1 Score: 0.8470
ROC AUC: 0.7707
Training SVM model...
Classes learned by model: [0 1]
(array([0, 1]), array([109389,   4018]))

SVM Performance:
Accuracy: 0.3429
Precision: 0.0007
Recall: 0.0000
F1 Score: 0.0001
ROC AUC: 0.0936
Training KNN model...
Classes learned by model: [0 1]
(array([0, 1]), array([108677,   4730]))

KNN Performance:
Accuracy: 0.3366
Pre

In [30]:
X_test = upsampled.drop(' Label', axis=1)
y_test = upsampled[' Label']
    
X_train = lrhr_common.drop(' Label', axis=1)
y_train = lrhr_common[' Label']

In [31]:
# Step 5: Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
    
# Step 6: Train and evaluate models
results = {}
print(np.unique(y_test, return_counts=True))

for model_name in ["Random Forest", "Gradient Boosting","NB","SVM","KNN","MLP"]:
    results[model_name] = train_evaluate_model(X_train_scaled, y_train, X_test_scaled, y_test, model_name)

(array([0, 1]), array([645, 645]))
Training Random Forest model...
Classes learned by model: [0 1]
(array([0, 1]), array([1286,    4]))

Random Forest Performance:
Accuracy: 0.4969
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC AUC: 0.8596
Training Gradient Boosting model...
Classes learned by model: [0 1]
(array([0, 1]), array([1286,    4]))

Gradient Boosting Performance:
Accuracy: 0.4969
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC AUC: 0.7718
Training NB model...
Classes learned by model: [0 1]
(array([0]), array([1290]))

NB Performance:
Accuracy: 0.5000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC AUC: 0.5000
Training SVM model...
Classes learned by model: [0 1]
(array([0, 1]), array([1286,    4]))

SVM Performance:
Accuracy: 0.4969
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC AUC: 0.4232
Training KNN model...
Classes learned by model: [0 1]
(array([0, 1]), array([1278,   12]))

KNN Performance:
Accuracy: 0.5031
Precision: 0.6667
Recall: 0.0124
F