In [1]:
import os
import glob
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from h2o.automl import H2OAutoML

## Loading the dataset ##

In [2]:
# Folder containing CSV files
DATA_DIR = "MachineLearningCSV"

# Search for all CSV files in the folder
files = sorted(glob.glob(os.path.join(DATA_DIR, "*.csv")))
if len(files) == 0:
    raise FileNotFoundError(f"No CSV files found in {DATA_DIR}. Check the path or files.")

print(f"CSV files found ({len(files)}):\n", "\n".join(files))

# Reading and concatenation
start = time.time()
df_list = []
for f in files:
    print("Reading:", f)
    # Read using ';' separator
    df_part = pd.read_csv(f, sep=';', low_memory=False)
    df_list.append(df_part)

# Concatenate all files into a single DataFrame
df = pd.concat(df_list, ignore_index=True)
print(f"Concatenation completed. Shape: {df.shape} (time: {time.time()-start:.2f}s)")

# DataFrame preview
display(df.head())

CSV files found (8):
 MachineLearningCSV\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
MachineLearningCSV\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
MachineLearningCSV\Friday-WorkingHours-Morning.pcap_ISCX.csv
MachineLearningCSV\Monday-WorkingHours.pcap_ISCX.csv
MachineLearningCSV\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
MachineLearningCSV\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
MachineLearningCSV\Tuesday-WorkingHours.pcap_ISCX.csv
MachineLearningCSV\Wednesday-workingHours.pcap_ISCX.csv
Reading: MachineLearningCSV\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Reading: MachineLearningCSV\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Reading: MachineLearningCSV\Friday-WorkingHours-Morning.pcap_ISCX.csv
Reading: MachineLearningCSV\Monday-WorkingHours.pcap_ISCX.csv
Reading: MachineLearningCSV\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Reading: MachineLearningCSV\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


## DataFrame info ##

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0    Destination Port             int64  
 1    Flow Duration                int64  
 2    Total Fwd Packets            int64  
 3    Total Backward Packets       int64  
 4   Total Length of Fwd Packets   int64  
 5    Total Length of Bwd Packets  int64  
 6    Fwd Packet Length Max        int64  
 7    Fwd Packet Length Min        int64  
 8    Fwd Packet Length Mean       float64
 9    Fwd Packet Length Std        float64
 10  Bwd Packet Length Max         int64  
 11   Bwd Packet Length Min        int64  
 12   Bwd Packet Length Mean       float64
 13   Bwd Packet Length Std        float64
 14  Flow Bytes/s                  float64
 15   Flow Packets/s               float64
 16   Flow IAT Mean                float64
 17   Flow IAT Std                 float64
 18   Flow IAT Max         

## The dataset is too large for H2O on 8GB RAM; consider moving to the cloud or skipping this step ##

In [4]:
# Frequencies in the original dataset
print("Original distribution:")
print(df[' Label'].value_counts(normalize=True))

Original distribution:
 Label
BENIGN                        0.803004
DoS Hulk                      0.081630
PortScan                      0.056144
DDoS                          0.045227
DoS GoldenEye                 0.003636
FTP-Patator                   0.002804
SSH-Patator                   0.002083
DoS slowloris                 0.002048
DoS Slowhttptest              0.001943
Bot                           0.000695
Web Attack � Brute Force      0.000532
Web Attack � XSS              0.000230
Infiltration                  0.000013
Web Attack � Sql Injection    0.000007
Heartbleed                    0.000004
Name: proportion, dtype: float64


## The dataset is highly imbalanced; apply the following step to balance it ##

In [5]:
import pandas as pd

# Name of your target column (note the leading space!)
target = ' Label'

df_sample = (
    df
    .groupby(target, group_keys=False)
    .apply(
        lambda x: x.sample(
            frac=0.20 if x.name == "BENIGN" else 0.80,
            random_state=1234
        )
    )
)

  .apply(


## Verification of changes ##

In [6]:
print("Original dataset size:", len(df)) # verify original dataset
print("Sample size:", len(df_sample)) # verify reduced dataset

Original dataset size: 2830743
Sample size: 900737


In [7]:
# Frequencies in the original dataset
print("Original distribution:")
print(df[' Label'].value_counts(normalize=True))

# Frequencies in the sample
print("\nDistribution in the sample:")
print(df_sample[' Label'].value_counts(normalize=True))

Original distribution:
 Label
BENIGN                        0.803004
DoS Hulk                      0.081630
PortScan                      0.056144
DDoS                          0.045227
DoS GoldenEye                 0.003636
FTP-Patator                   0.002804
SSH-Patator                   0.002083
DoS slowloris                 0.002048
DoS Slowhttptest              0.001943
Bot                           0.000695
Web Attack � Brute Force      0.000532
Web Attack � XSS              0.000230
Infiltration                  0.000013
Web Attack � Sql Injection    0.000007
Heartbleed                    0.000004
Name: proportion, dtype: float64

Distribution in the sample:
 Label
BENIGN                        0.504719
DoS Hulk                      0.205230
PortScan                      0.141156
DDoS                          0.113709
DoS GoldenEye                 0.009141
FTP-Patator                   0.007050
SSH-Patator                   0.005238
DoS slowloris                 0.005148
DoS 

## Non-null values of the reduced dataframe ##

In [8]:
df_sample.notna().sum()

 Destination Port              900737
 Flow Duration                 900737
 Total Fwd Packets             900737
 Total Backward Packets        900737
Total Length of Fwd Packets    900737
                                ...  
Idle Mean                      900737
 Idle Std                      900737
 Idle Max                      900737
 Idle Min                      900737
 Label                         900737
Length: 79, dtype: int64

## Handling missing values ##

In [9]:
df_sample.isna().sum()[df_sample.isna().sum() > 0] # Shows only columns with missing values

Flow Bytes/s    845
dtype: int64

In [10]:
(df_sample.isna().sum() / len(df_sample) * 100).round(4) # See percentage of missing values

 Destination Port              0.0
 Flow Duration                 0.0
 Total Fwd Packets             0.0
 Total Backward Packets        0.0
Total Length of Fwd Packets    0.0
                              ... 
Idle Mean                      0.0
 Idle Std                      0.0
 Idle Max                      0.0
 Idle Min                      0.0
 Label                         0.0
Length: 79, dtype: float64

In [11]:
df_sample = df_sample.dropna(subset=['Flow Bytes/s']) # Drop rows with missing Flow Bytes/s (recommended here)

In [12]:
df_sample.notna().sum() # verification once more to see if everything is OK

 Destination Port              899892
 Flow Duration                 899892
 Total Fwd Packets             899892
 Total Backward Packets        899892
Total Length of Fwd Packets    899892
                                ...  
Idle Mean                      899892
 Idle Std                      899892
 Idle Max                      899892
 Idle Min                      899892
 Label                         899892
Length: 79, dtype: int64

In [13]:
df_sample.isna().sum()[df_sample.isna().sum() > 0] # Shows only columns with missing values

Series([], dtype: int64)

In [14]:
# Clean column names (recommended before export)
# Before saving, remove leading/trailing spaces from column names:
df_sample.columns = df_sample.columns.str.strip()

In [15]:
df_sample.to_csv("dataset_sample_10pct.csv", index=False) # save the reduced dataset

In [16]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 899892 entries, 1108263 to 1602567
Data columns (total 79 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Destination Port             899892 non-null  int64  
 1   Flow Duration                899892 non-null  int64  
 2   Total Fwd Packets            899892 non-null  int64  
 3   Total Backward Packets       899892 non-null  int64  
 4   Total Length of Fwd Packets  899892 non-null  int64  
 5   Total Length of Bwd Packets  899892 non-null  int64  
 6   Fwd Packet Length Max        899892 non-null  int64  
 7   Fwd Packet Length Min        899892 non-null  int64  
 8   Fwd Packet Length Mean       899892 non-null  float64
 9   Fwd Packet Length Std        899892 non-null  float64
 10  Bwd Packet Length Max        899892 non-null  int64  
 11  Bwd Packet Length Min        899892 non-null  int64  
 12  Bwd Packet Length Mean       899892 non-null  float64
 1

## Memory optimization when the dataframe is too large (optional) ##

In [17]:
# This code reduces the dataframe size for optional manipulation
df_optimized = df.copy()

for col in df_optimized.select_dtypes(include=['int64']).columns:
    df_optimized[col] = pd.to_numeric(df_optimized[col], downcast='integer')

for col in df_optimized.select_dtypes(include=['float64']).columns:
    df_optimized[col] = pd.to_numeric(df_optimized[col], downcast='float')

print("Memory before:", df.memory_usage(deep=True).sum() / 1e6, "MB")
print("Memory after:", df_optimized.memory_usage(deep=True).sum() / 1e6, "MB")

Memory before: 1923.001689 MB
Memory after: 1039.809873 MB


## Data preparation ##

In [18]:
# Initialize H2O
import h2o
h2o.init(max_mem_size="5G")

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 24.0.1+9-30, mixed mode, sharing)
  Starting server from C:\Users\HP\Downloads\test-H2O\.venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\HP\AppData\Local\Temp\tmp22ao24p0
  JVM stdout: C:\Users\HP\AppData\Local\Temp\tmp22ao24p0\h2o_HP_started_from_python.out
  JVM stderr: C:\Users\HP\AppData\Local\Temp\tmp22ao24p0\h2o_HP_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,13 secs
H2O_cluster_timezone:,Africa/Lagos
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.8
H2O_cluster_version_age:,2 months and 19 days
H2O_cluster_name:,H2O_from_python_HP_nnrqgz
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4.983 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [19]:
# Convert the pandas DataFrame to an H2OFrame object
# Clean the column names before conversion
df_sample.columns = df_sample.columns.str.strip()
hf = h2o.H2OFrame(df_sample)
# Ensure the H2O object has proper column names
hf.columns = [col.strip() for col in hf.columns]

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [20]:
# Just like in pandas, we can view the statistical summary of an H2O dataframe
hf.describe()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
type,int,int,int,int,int,int,int,int,real,real,int,int,real,real,real,real,real,real,int,int,int,real,real,int,int,int,real,real,int,int,int,int,int,int,int,int,real,real,int,int,real,real,real,int,int,int,int,int,int,int,int,int,real,real,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,real,real,int,int,real,real,int,int,enum
mins,0.0,-2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-193000000.0,-2000000.0,-2.0,0.0,-2.0,-14.0,0.0,0.0,0.0,0.0,-12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1073741320.0,-1073741320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1073741320.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,-536870660.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
mean,6028.101061016215,20206669.814518858,6.99743191405191,7.2124866095042535,419.8752161370477,10926.199628399856,173.2515901908229,12.579129495539455,45.81470620880529,60.40928620839372,1591.320725153686,26.544655358643038,526.482707397285,657.6044390367675,1049046.9541624442,79830.5397009855,1888283.7088046682,4954502.659336586,16332319.633001514,184938.66665777718,19931965.20822388,3683285.0448970567,6331382.343829346,16212311.63883445,838341.3385828522,9488572.196684716,1839667.4891442065,2233079.4357172083,6104480.896855398,735180.2346881632,0.033638481062171904,0.0,6.556342316633552e-05,0.0,-2225.8471372120202,-1107.1843454547911,71917.44729200967,7918.453114523618,10.72128210940868,1643.2004996155083,267.1236778521225,518.4665336919038,1048221.4619783087,0.06168295751045681,0.033638481062171904,0.00015668546892293742,0.3591364297048979,0.35901974903655104,0.0632164748658728,6.556342316633552e-05,0.00015668546892293742,0.6577944908944626,295.13445418826285,45.81470620880529,526.4827073972928,-2225.8471372120202,0.0,0.0,0.0,0.0,0.0,0.0,6.99743191405191,419.8752161370477,7.2124866095042535,10926.199916212168,7149.802191818576,1367.112673520821,3.426521182541903,-756.4286658843455,89176.07069689197,33165.37724733788,139268.8954118937,69285.00732532355,15253719.729694283,871525.9477473663,15916900.14413286,14608160.588173915,
maxs,65534.0,119999997.0,219759.0,291922.0,2866110.0,655453030.0,24820.0,2065.0,5940.857143,7049.469004,17376.0,2042.0,5800.5,8194.660487,2071000000.0,4000000.0,120000000.0,84700000.0,120000000.0,120000000.0,120000000.0,120000000.0,84602929.2769822,120000000.0,120000000.0,120000000.0,120000000.0,84418013.7826341,120000000.0,120000000.0,1.0,0.0,1.0,0.0,4480016.0,5838440.0,3000000.0,2000000.0,1448.0,24820.0,2920.0,4731.522394,22400000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,90.0,3337.142857,5940.857143,5800.5,4480016.0,0.0,0.0,0.0,0.0,0.0,0.0,219759.0,2866110.0,291922.0,655453030.0,65535.0,65535.0,213557.0,93.0,100000000.0,63900000.0,100000000.0,100000000.0,120000000.0,75600000.0,120000000.0,120000000.0,
sigma,15565.611237682831,37766570.62165454,500.7261218002552,663.914040617698,7042.391444272528,1538503.373677846,585.514648775856,55.35851174051728,153.51341232972808,228.1355800896373,2770.4963645965704,58.937385028501005,848.3777835466678,1219.8545613779788,21802809.422684293,275186.8155223127,4981956.62057387,10189474.627692802,33206532.10383592,3306606.832136574,37742948.78095082,9772327.450395955,13614726.043267217,33270409.39542617,7948527.386059847,28065407.95556532,8391951.728306577,8487300.151854446,21207123.187968343,7306058.713762341,0.18029689342080163,0.0,0.008096863433217411,0.0,1554064.6202058636,1135417.7274575364,269057.83100607834,38185.70752635933,22.263458905874856,2794.8627916945748,408.4971362127938,882.8821161114164,2407676.153196114,0.24057895706002125,0.18029689342080163,0.012516432905406284,0.47974754853484913,0.47971327328230084,0.24335204535722407,0.008096863433217411,0.012516432905406284,0.6440511636760476,448.10517650265643,153.51341232972808,848.3777835466764,1554064.6202058636,0.0,0.0,0.0,0.0,0.0,0.0,500.7261218002552,7042.391444272528,663.914040617698,1538484.4083361973,13423.714726401518,6925.64376060586,386.6965818474037,579597.2735111554,652272.494227078,373182.5933017072,939313.1537584467,590539.1170420541,32406331.14086435,6288590.643529647,33261784.71069214,32189737.372904073,
zeros,327,378,0,162865,184669,225965,184669,515756,184669,546239,225965,525289,225965,585835,117781,0,378,332673,378,28232,239810,239810,496614,239810,258727,426327,426327,584217,426327,428449,869621,899892,899833,899892,342,162895,396,163134,527083,117781,117781,217353,217353,844384,869621,899751,576708,576813,843004,899833,899751,356790,117781,184669,225965,342,899892,899892,899892,899892,899892,899892,0,184669,162865,225965,55438,152180,337587,354,667353,857486,667353,667353,661308,841966,661308,661308,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,378,378,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,53.0,50039.0,2.0,2.0,58.0,154.0,29.0,29.0,29.0,0.0,77.0,77.0,77.0,0.0,4236.6953776055,79.9376486341,16679.6666666667,28883.1019167494,50031.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,80.0,64.0,39.968824317,39.968824317,29.0,77.0,48.2,26.2906827602,691.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,60.25,29.0,77.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,58.0,2.0,154.0,-1.0,-1.0,1.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,53383.0,3256347.0,1.0,5.0,6.0,30.0,6.0,6.0,6.0,0.0,6.0,6.0,6.0,0.0,11.05533286,1.842555477,651269.4,1447656.588,3240907.0,1.0,0.0,0.0,0.0,0.0,0.0,3248940.0,812235.0,1619119.092,3240907.0,1.0,0.0,0.0,0.0,0.0,20.0,100.0,0.30709258,1.535462898,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,5.0,7.0,6.0,6.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,5.0,30.0,229.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,53.0,70328.0,2.0,2.0,68.0,230.0,34.0,34.0,34.0,0.0,115.0,115.0,115.0,0.0,4237.288136,56.87635081,23442.66667,40557.99663,70275.0,4.0,49.0,49.0,0.0,49.0,49.0,4.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,40.0,40.0,28.43817541,28.43817541,34.0,115.0,66.4,44.36552716,1968.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,83.0,34.0,115.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,68.0,2.0,230.0,-1.0,-1.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [21]:
# Number of rows in the dataframe
hf.nrows

899892

In [22]:
# Remove leading/trailing spaces since they can prevent execution
hf.columns = [col.strip() for col in hf.columns]

# Verify
print(hf.columns)

['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count

In [23]:
# Independent variables (features) and target variable
# Split train/test with hf
train, test = hf.split_frame(ratios=[0.8], seed=1234)
y = 'Label'       # the column you want to predict
x = list(train.columns)  # all columns as a list
# Safe removal of the target column (won't raise if absent)
if y not in x:
    print(f"Error: the target column {y!r} is not in the train columns.")
    print('Available columns :', x[:50])
    x = [c for c in x if c != y]  # safely remove if present
else:
    x.remove(y)

In [24]:
print(train.columns) # shows the column names

['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count

## Running H2O ##

In [25]:
# Strip leading/trailing spaces from column names
df.columns = df.columns.str.strip()
df_sample.columns = df_sample.columns.str.strip()

# Verify
print(df_sample.columns[-5:])  # 'Label' should appear without spaces

# Then, build the frequency table
y = 'Label'
print(df[y].value_counts(normalize=True))

Index(['Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min', 'Label'], dtype='object')
Label
BENIGN                        0.803004
DoS Hulk                      0.081630
PortScan                      0.056144
DDoS                          0.045227
DoS GoldenEye                 0.003636
FTP-Patator                   0.002804
SSH-Patator                   0.002083
DoS slowloris                 0.002048
DoS Slowhttptest              0.001943
Bot                           0.000695
Web Attack � Brute Force      0.000532
Web Attack � XSS              0.000230
Infiltration                  0.000013
Web Attack � Sql Injection    0.000007
Heartbleed                    0.000004
Name: proportion, dtype: float64


In [26]:
# Check your x and y before training
print("Target column:", y)
print("Number of features:", len(x))
# This ensures your target column is correct and everything is configured properly.

Target column: Label
Number of features: 78


In [27]:
# Create an H2O AutoML model
aml = H2OAutoML(
    max_runtime_secs = 1800,      # AutoML stops after 30 minutes
    balance_classes = True,      # automatic class rebalancing for minority/majority classes
    stopping_metric = 'logloss', # stopping criterion based on logloss (suitable for multiclass)
    project_name = 'Final',      # AutoML project name
    seed = 123                   # for reproducibility
)
# Train the model
%time aml.train(x=x, y=y, training_frame= train)

AutoML progress: |
22:07:30.843: AutoML: XGBoost is not available; skipping it.
22:07:32.208: _train param, Dropping bad and constant columns: [Bwd Avg Packets/Bulk, Fwd Avg Bulk Rate, Bwd PSH Flags, Bwd URG Flags, Bwd Avg Bytes/Bulk, Fwd Avg Bytes/Bulk, Fwd Avg Packets/Bulk, Bwd Avg Bulk Rate]

████████████████████████████
22:20:17.407: _train param, Dropping bad and constant columns: [Bwd Avg Packets/Bulk, Fwd Avg Bulk Rate, Bwd PSH Flags, Bwd URG Flags, Bwd Avg Bytes/Bulk, Fwd Avg Bytes/Bulk, Fwd Avg Packets/Bulk, Bwd Avg Bulk Rate]

███████████████████████████
22:33:55.840: GBM_1_AutoML_1_20251227_220728 [GBM def_5] failed: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_1_AutoML_1_20251227_220728.  Details: ERRR on field: _ntrees: The tree model will not fit in the driver node's memory (122,9 KB per tree x 10000 > 977,9 MB) - try decreasing ntrees and/or max_depth or increasing min_rows!

22:33:55.940: _train param, Dropping bad and

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,1.0,15.0,27576.0,3.0,20.0,17.533333,4.0,490.0,140.33333

BENIGN,Bot,DDoS,DoS GoldenEye,DoS Hulk,DoS Slowhttptest,DoS slowloris,FTP-Patator,Heartbleed,Infiltration,PortScan,SSH-Patator,Web Attack <0xEFBFBD> Brute Force,Web Attack <0xEFBFBD> Sql Injection,Web Attack <0xEFBFBD> XSS,Error,Rate
61757.0,26.0,9.0,4.0,146.0,1.0,6.0,1.0,0.0,0.0,22.0,0.0,10.0,0.0,2.0,0.0036622,227 / 61 984
35140.0,26751.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5677724,35 140 / 61 891
30.0,0.0,61607.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,2.0,0.0008433,52 / 61 659
56.0,0.0,0.0,61054.0,478.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0088315,544 / 61 598
26.0,0.0,0.0,0.0,62040.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000435,27 / 62 067
55.0,0.0,0.0,26.0,0.0,61421.0,244.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0052635,325 / 61 746
90.0,0.0,0.0,58.0,0.0,52.0,61941.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0032185,200 / 62 141
19.0,0.0,0.0,0.0,0.0,0.0,0.0,61481.0,0.0,0.0,0.0,0.0,43.0,0.0,15.0,0.0012509,77 / 61 558
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61619.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 / 61 619
18295.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43572.0,0.0,0.0,0.0,0.0,0.0,0.295715,18 295 / 61 867

k,hit_ratio
1,0.7731247
2,0.929336
3,0.999643
4,0.9997843
5,0.9998932
6,0.9999449
7,0.9999449
8,0.9999449
9,0.9999449
10,0.9999449

BENIGN,Bot,DDoS,DoS GoldenEye,DoS Hulk,DoS Slowhttptest,DoS slowloris,FTP-Patator,Heartbleed,Infiltration,PortScan,SSH-Patator,Web Attack <0xEFBFBD> Brute Force,Web Attack <0xEFBFBD> Sql Injection,Web Attack <0xEFBFBD> XSS,Error,Rate
36188.0,19.0,6.0,1.0,63.0,2.0,4.0,0.0,0.0,0.0,23.0,0.0,2.0,0.0,0.0,0.0033051,120 / 36 308
71.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5590551,71 / 127
9.0,0.0,8238.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0020594,17 / 8 255
2.0,0.0,1.0,666.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0118694,8 / 674
5.0,0.0,0.0,0.0,14611.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0006156,9 / 14 620
3.0,0.0,0.0,0.0,0.0,339.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0087719,3 / 342
1.0,0.0,0.0,0.0,0.0,0.0,377.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0026455,1 / 378
0.0,0.0,0.0,0.0,0.0,0.0,0.0,508.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 / 508
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 / 1
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1 / 1

k,hit_ratio
1,0.9951326
2,0.9990404
3,0.9994854
4,0.9994993
5,0.9995272
6,0.999541
7,0.999541
8,0.999541
9,0.999541
10,0.999541

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,training_auc,training_pr_auc,validation_rmse,validation_logloss,validation_classification_error,validation_auc,validation_pr_auc
,2025-12-27 22:33:56,0.037 sec,0.0,,,,,,,,,,
,2025-12-27 22:36:01,2 min 5.394 sec,1.0,0.4025317,0.5565703,0.2268753,,,0.071496,0.0443963,0.0048674,,

variable,relative_importance,scaled_importance,percentage
Flow IAT Std,184328.0312500,1.0,0.1192268
Subflow Bwd Packets,126554.6406250,0.6865730,0.0818579
Packet Length Std,98084.375,0.5321186,0.0634428
Fwd Packet Length Min,93574.75,0.5076534,0.0605259
Init_Win_bytes_backward,91912.6796875,0.4986365,0.0594508
Flow Duration,81999.3593750,0.4448556,0.0530387
Fwd Packet Length Std,71106.6484375,0.3857614,0.0459931
ACK Flag Count,67615.5156250,0.3668217,0.0437350
Active Min,45030.4921875,0.2442954,0.0291266
act_data_pkt_fwd,44232.8945312,0.2399684,0.0286107


In [28]:
# 1. Show the AutoML leaderboard of top models
lb = aml.leaderboard
print("=== Top 10 AutoML models ===")
print(lb.head(rows=10))

# 2. Show the best model (leader)
leader_model = aml.leader
print("\n=== Best model found ===")
print(f"Model ID: {leader_model.model_id}")

# 3. Evaluate the model on the test set
perf = leader_model.model_performance(test)
print("\n=== Evaluation on the test set ===")
print(perf)

# 4. Print useful metrics (multinomial)
print("\nLogloss :", perf.logloss())
print("MSE :", perf.mse())
print("RMSE :", perf.rmse())
print("Hit Ratio @1 (Accuracy) :", perf.hit_ratio_table()[0][1])
print("\nConfusion matrix :")
print(perf.confusion_matrix())

# 5. Save the best model locally
model_path = h2o.save_model(model=leader_model, path="models", force=True)
print(f"\nModel saved at: {model_path}")

=== Top 10 AutoML models ===
model_id                          mean_per_class_error    logloss      rmse         mse
DRF_1_AutoML_1_20251227_220728                0.222402  0.0443963  0.071496  0.00511167
GBM_2_AutoML_1_20251227_220728                0.705868  0.682838   0.472792  0.223532
GBM_3_AutoML_1_20251227_220728                0.742374  0.725026   0.490297  0.240391
GLM_1_AutoML_1_20251227_220728                0.805516  0.981896   0.569387  0.324202
[4 rows x 5 columns]


=== Best model found ===
Model ID: DRF_1_AutoML_1_20251227_220728

=== Evaluation on the test set ===
ModelMetricsMultinomial: drf
** Reported on test data. **

MSE: 0.0054411091832452544
RMSE: 0.07376387451351274
LogLoss: 0.05280133690848892
Mean Per-Class Error: 0.2737306970750483
AUC table was not computed: it is either disabled (model parameter 'auc_type' was set to AUTO or NONE) or the domain size exceeds the limit (maximum is 50 domains).
AUCPR table was not computed: it is either disabled (model parame