In [1]:
import pandas as pd

# Load the full feature table from the CSV file into a DataFrame.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which usually
# means the CSV was written with its row index included; if so, passing
# index_col=0 to read_csv would avoid carrying it as a feature column — confirm.
full_df = pd.read_csv("fullDataFrame.csv")
# Show the first rows as the cell output to sanity-check the load.
full_df.head()

Unnamed: 0,Avg_syn_flag,Avg_fin_flag,Avg_ack_flag,Avg_psh_flag,Avg_rst_flag,Avg_DNS_pkt,Avg_TCP_pkt,Avg_UDP_pkt,Avg_ICMP_pkt,Duration_window_flow,...,Min_pkts_length,Max_pkts_length,StDev_pkts_length,Avg_small_payload_pkt,Avg_payload,Min_payload,Max_payload,StDev_payload,Avg_DNS_over_TCP,Label
0,0.0,0.0,0.5,0.5,0.0,0.0,1.0,0.0,0.0,0.759,...,54.0,1404.0,523.807,0.5,424.0,0.0,1350.0,523.807,0.0,1.0
1,0.0,0.0,0.5,0.5,0.0,0.0,1.0,0.0,0.0,4.337,...,60.0,597.0,283.024,0.5,274.5,6.0,543.0,283.024,0.0,1.0
2,0.0,0.0,0.4,0.6,0.0,0.0,1.0,0.0,0.0,0.526,...,54.0,597.0,278.086,0.4,327.6,0.0,543.0,278.086,0.0,1.0
3,0.0,0.0,0.6,0.4,0.0,0.0,1.0,0.0,0.0,0.452,...,54.0,1474.0,635.703,0.5,552.6,0.0,1420.0,635.703,0.0,1.0
4,0.0,0.0,0.5,0.5,0.0,0.0,1.0,0.0,0.0,0.046,...,54.0,1404.0,711.512,0.5,675.0,0.0,1350.0,711.512,0.0,1.0


In [4]:
# Create an ML model using Random Forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Train and evaluate one Random Forest on the feature subset chosen by CFS.
# Reads `full_df` (loaded in the first cell); prints the test accuracy and a
# per-class classification report.

# Select features from the best subset in CFS output
features = ['Max_payload', 'Max_pkts_length', 'Min_payload', 'Avg_UDP_pkt', 'StDev_pkts_length', 'StDev_payload']
target = 'Label'

# Split the DataFrame into features and target
X = full_df[features]
y = full_df[target]

# Split BEFORE scaling so that the held-out rows never influence the scaler.
# (Fitting the scaler on the full data first leaks test-set statistics into
# the preprocessing step.) The row partition depends only on the number of
# rows and random_state, so it is the same partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. Scaling is optional for tree ensembles but kept here so the
# pipeline matches the feature-search cell.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Initialize the Random Forest classifier.
# n_jobs=-1 builds the trees on all available cores; with a fixed
# random_state the fitted forest does not depend on the number of jobs.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the model if needed (requires `import joblib`). The model expects
# inputs transformed by `scaler`, so persist the scaler alongside it.
# joblib.dump(rf_classifier, 'rf_model.joblib')


0.9645257007212635
              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97    126719
         1.0       0.96      0.96      0.96     92341

    accuracy                           0.96    219060
   macro avg       0.96      0.96      0.96    219060
weighted avg       0.96      0.96      0.96    219060



In [None]:
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import joblib
from datetime import datetime

# Exhaustive feature-subset search: train one Random Forest per combination
# of candidate features (sizes 4 .. all) and keep the most accurate one.
# Assumes `full_df` (loaded in the first cell) is already defined.
#
# NOTE(review): the winner is chosen by accuracy on the same held-out split
# that is then reported, so "best accuracy" is optimistically biased. For an
# unbiased estimate, select on a separate validation split (or with
# cross-validation) and score the final model once on untouched test data.

# List of all possible features based on your selection
all_features = ['StDev_pkts_length', 'StDev_payload', 'Max_payload', 'Max_pkts_length', 'Min_payload', 'Avg_UDP_pkt',
                'Avg_pkts_length', 'Avg_payload', 'Avg_TCP_pkt', 'Avg_fin_flag', 'Min_pkts_length', 'Avg_syn_flag']

# Target variable
target = 'Label'

# Split once, outside the loop. The row partition depends only on the number
# of rows and random_state, so repeating it per combination was redundant
# work that always produced the same train/test rows.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    full_df[all_features], full_df[target], test_size=0.2, random_state=42
)

# Initialize variables to keep track of the best model and its accuracy
best_accuracy = 0
best_model = None
best_scaler = None
best_features = []

# Iterate over all combinations of the features list of length 4 to len(all_features)
for r in range(4, len(all_features) + 1):
    for combo in combinations(all_features, r):
        columns = list(combo)  # keeps the combination's column order

        # Fit the scaler on the training rows only, then reuse it for the
        # test rows, so held-out statistics never leak into preprocessing.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_all[columns])
        X_test = scaler.transform(X_test_all[columns])

        # Train and evaluate the model.
        # n_jobs=-1 uses all cores; with a fixed random_state the fitted
        # forest does not depend on the number of jobs.
        rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
        rf_classifier.fit(X_train, y_train)
        y_pred = rf_classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Update the best model if current model is better
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = rf_classifier
            best_scaler = scaler
            best_features = combo

            # Checkpoint immediately: the full search is very long, and an
            # interrupted run would otherwise lose the best model found so far.
            joblib.dump(best_model, 'best_rf_model.joblib')
            # The model is only usable together with the feature list (in this
            # order) and the scaler it was trained with, so persist those too.
            joblib.dump(
                {'features': list(best_features), 'scaler': best_scaler},
                'best_rf_preprocessing.joblib',
            )

            print("Updated Best Model Accuracy:", best_accuracy)
            print("Updated Best Features:", best_features)
            current_time = datetime.now()
            formatted_time = current_time.strftime("%H:%M:%S")
            print("Current time (formatted):", formatted_time)

# The best model was already written to 'best_rf_model.joblib' by the last
# checkpoint above, so no separate save is needed here.
print("Final Best Model Accuracy:", best_accuracy)
print("Final Best Features:", best_features)

# Console output captured from an earlier, partial run of the search above:
#Updated Best Model Accuracy: 0.9566739706016616
#Updated Best Features: ('StDev_pkts_length', 'StDev_payload', 'Max_payload', 'Max_pkts_length')
#Current time (formatted): 16:33:40
#Updated Best Model Accuracy: 0.9644754861681731
#Updated Best Features: ('StDev_pkts_length', 'StDev_payload', 'Max_payload', 'Min_payload')
#Current time (formatted): 16:39:16
#Updated Best Model Accuracy: 0.9766000182598374
#Updated Best Features: ('StDev_pkts_length', 'StDev_payload', 'Max_payload', 'Avg_pkts_length')
#Current time (formatted): 16:52:14
#Updated Best Model Accuracy: 0.9766411028941843
#Updated Best Features: ('StDev_pkts_length', 'StDev_payload', 'Max_payload', 'Avg_payload')
#Current time (formatted): 17:00:11
#Updated Best Model Accuracy: 0.977074774034511
#Updated Best Features: ('StDev_pkts_length', 'Max_payload', 'Max_pkts_length', 'Avg_pkts_length')
#Current time (formatted): 21:33:45
#Updated Best Model Accuracy: 0.9809778142974528
#Updated Best Features: ('StDev_pkts_length', 'Max_payload', 'Min_payload', 'Avg_pkts_length')
#Current time (formatted): 22:07:55
#Updated Best Model Accuracy: 0.9810782434036337
#Updated Best Features: ('StDev_pkts_length', 'Max_payload', 'Avg_pkts_length', 'Min_pkts_length')
#Current time (formatted): 23:31:40


Updated Best Model Accuracy: 0.9566739706016616
Updated Best Features: ('StDev_pkts_length', 'StDev_payload', 'Max_payload', 'Max_pkts_length')
Current time (formatted): 16:33:40
Updated Best Model Accuracy: 0.9644754861681731
Updated Best Features: ('StDev_pkts_length', 'StDev_payload', 'Max_payload', 'Min_payload')
Current time (formatted): 16:39:16
Updated Best Model Accuracy: 0.9766000182598374
Updated Best Features: ('StDev_pkts_length', 'StDev_payload', 'Max_payload', 'Avg_pkts_length')
Current time (formatted): 16:52:14
Updated Best Model Accuracy: 0.9766411028941843
Updated Best Features: ('StDev_pkts_length', 'StDev_payload', 'Max_payload', 'Avg_payload')
Current time (formatted): 17:00:11
Updated Best Model Accuracy: 0.977074774034511
Updated Best Features: ('StDev_pkts_length', 'Max_payload', 'Max_pkts_length', 'Avg_pkts_length')
Current time (formatted): 21:33:45
Updated Best Model Accuracy: 0.9809778142974528
Updated Best Features: ('StDev_pkts_length', 'Max_payload', 'Min