In [1]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import os

folder_path = 'PcapsToTest/Features'

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated frame reproducible from run to run.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
if not csv_files:
    # pd.concat would raise an opaque "No objects to concatenate" ValueError;
    # raise the same exception type with a message that names the folder.
    raise ValueError(f"No .csv files found in '{folder_path}'")

# Load each CSV file into a pandas DataFrame
dataframes = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

# Concatenate all dataframes into one
full_df = pd.concat(dataframes, ignore_index=True)

# Convert all columns except 'Label' to floats with 3 decimal places
non_label_columns = [column for column in full_df.columns if column != 'Label']
full_df = full_df.astype({column: float for column in non_label_columns})
full_df = full_df.round({column: 3 for column in non_label_columns})

# Convert 'Label' to int (raises if a label is missing or non-numeric)
full_df['Label'] = full_df['Label'].astype(int)

# Calculate standard deviation for each column
std_dev = full_df.std()

# Identify columns with standard deviation of 0.
# 'Label' is never a removal candidate: if this data set happens to contain a
# single class, dropping it would break the evaluation that reads full_df['Label'].
zero_std_dev_columns = std_dev[std_dev == 0].index.drop('Label', errors='ignore')

print("Columns with standard deviation of 0 are being removed:")
for column in zero_std_dev_columns:
    print(column)

# Remove columns with standard deviation of 0
full_df = full_df.drop(columns=zero_std_dev_columns)

# Get rid of all rows with infs or nans
full_df = full_df.replace([np.inf, -np.inf], np.nan).dropna()

# Get rid of negatives: vectorized clamp of every value to a minimum of 0
full_df = full_df.clip(lower=0)

# Ensure all values are of type float (this includes 'Label')
full_df = full_df.astype(float)
full_df.head(10)


Columns with standard deviation of 0 are being removed:
Avg_urg_flag


Unnamed: 0,Avg_syn_flag,Avg_fin_flag,Avg_ack_flag,Avg_psh_flag,Avg_rst_flag,Avg_DNS_pkt,Avg_TCP_pkt,Avg_UDP_pkt,Avg_ICMP_pkt,Duration_window_flow,...,Min_pkts_length,Max_pkts_length,StDev_pkts_length,Avg_small_payload_pkt,Avg_payload,Min_payload,Max_payload,StDev_payload,Avg_DNS_over_TCP,Label
0,0.0,0.0,0.7,0.3,0.0,0.0,1.0,0.0,0.0,0.2,...,54.0,1514.0,678.569,0.4,730.1,0.0,1460.0,678.569,0.0,1.0
1,0.0,0.0,0.5,0.5,0.0,0.0,1.0,0.0,0.0,8.767,...,54.0,1111.0,372.127,0.5,324.1,0.0,1057.0,372.127,0.0,1.0
2,0.0,0.0,0.3,0.7,0.0,0.0,1.0,0.0,0.0,1.114,...,60.0,1392.0,473.876,0.3,564.2,6.0,1338.0,473.876,0.0,1.0
3,0.0,0.0,0.4,0.6,0.0,0.0,1.0,0.0,0.0,0.264,...,54.0,1474.0,501.63,0.3,607.3,0.0,1420.0,501.63,0.0,1.0
4,0.0,0.0,0.5,0.5,0.0,0.0,1.0,0.0,0.0,0.991,...,54.0,1140.0,440.883,0.5,378.4,0.0,1086.0,440.883,0.0,1.0
5,0.0,0.0,0.4,0.6,0.0,0.0,1.0,0.0,0.0,0.726,...,54.0,1392.0,611.156,0.4,657.7,0.0,1338.0,611.156,0.0,1.0
6,0.0,0.0,0.7,0.3,0.0,0.0,1.0,0.0,0.0,0.13,...,54.0,1474.0,700.58,0.4,811.0,0.0,1420.0,700.58,0.0,1.0
7,0.0,0.0,0.7,0.3,0.0,0.0,1.0,0.0,0.0,0.04,...,54.0,1474.0,699.114,0.5,660.8,0.0,1420.0,699.114,0.0,1.0
8,0.0,0.0,0.5,0.5,0.0,0.0,1.0,0.0,0.0,0.017,...,54.0,1474.0,645.082,0.4,695.8,0.0,1420.0,645.082,0.0,1.0
9,0.0,0.0,0.6,0.4,0.0,0.0,1.0,0.0,0.0,0.113,...,54.0,1474.0,692.329,0.4,803.4,0.0,1420.0,692.329,0.0,1.0


In [3]:
# Load the trained model and scaler.
# NOTE(security): joblib.load unpickles arbitrary Python objects; only load
# artifacts that come from a trusted source.
rf_classifier = joblib.load('rf_model.joblib')
scaler = joblib.load('rf_scaler.joblib')  # Ensure you've saved and are now loading the scaler

# Columns fed to the scaler and model.
# NOTE(review): presumably these must match the columns (and order) used when
# the scaler/model were fitted; confirm against the training notebook.
model_feature_columns = ['StDev_pkts_length', 'Max_payload', 'Avg_pkts_length', 'Min_pkts_length']

# Fail early with a readable message if preprocessing removed a required column
# (e.g. a feature dropped by a zero-variance filter), instead of a bare KeyError.
missing_columns = [
    column for column in model_feature_columns + ['Label']
    if column not in full_df.columns
]
if missing_columns:
    raise KeyError(f"full_df is missing columns required for evaluation: {missing_columns}")

# Separate the features and the actual labels
X_new = full_df[model_feature_columns]
y_actual = full_df['Label']

# Scale the features using the loaded scaler
X_new_scaled = scaler.transform(X_new)

# Predict the labels with the loaded model
y_pred = rf_classifier.predict(X_new_scaled)

# Calculate and print the accuracy
accuracy = accuracy_score(y_actual, y_pred)
print(f"Accuracy: {accuracy}")

# Optionally, print a detailed classification report
print(classification_report(y_actual, y_pred))

Accuracy: 0.9704648014440433
              precision    recall  f1-score   support

         0.0       0.97      0.96      0.97     39186
         1.0       0.97      0.97      0.97     49454

    accuracy                           0.97     88640
   macro avg       0.97      0.97      0.97     88640
weighted avg       0.97      0.97      0.97     88640

