In this notebook, we will be building our model for the closed-world experiments to classify 95 monitored websites

In [None]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import os
import psutil
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler

# Extracting Data

To begin, we will first import the dataframe into this notebook. Run only one of the following two blocks, depending on your environment.

If you are using Google Colab, run this block

In [None]:
# Option 1 (Google Colab): read the pickled feature DataFrame from the datasets/ folder
print("Loading datafile...")
with open('datasets/extracted_features.pkl', 'rb') as pickle_file:
    # NOTE: pickle.load can execute arbitrary code -- only load files you trust
    extracted_df = pickle.load(pickle_file)
print("Data loaded")

If you are running the notebook on your local machine, run this block instead

In [None]:
# Option 2 (local machine): read extracted_features.pkl from a local path.
# The path is taken from the EXTRACTED_FEATURES_PATH environment variable when it
# is set; otherwise it falls back to the original author's location below, which
# you should change to wherever extracted_features.pkl is stored on your machine.
print("Loading datafile...")
file_path = os.environ.get(
    'EXTRACTED_FEATURES_PATH',
    r'C:\EWHA\Term 2\Machine Learning\pro\neurotic_networkers\extracted_features.pkl',  # Jordan's local path
)
with open(file_path, 'rb') as f:
    # NOTE: pickle.load can execute arbitrary code -- only load files you trust
    extracted_df = pickle.load(f)
print("Data loaded")

In [None]:
# Display the loaded frame as the cell output to sanity-check its contents
extracted_df

In this notebook, since we will only be dealing with the closed-world experiments, we will first extract the relevant data from the dataframe by keeping only the rows whose label is not -1

In [None]:
# Keep every row except those labelled -1
keep_mask = extracted_df['label'] != -1
closed_world_df = extracted_df.loc[keep_mask]

In [None]:
# Display the filtered frame (rows labelled -1 removed) as the cell output
closed_world_df

Next, we separate the features and the target. Target will be label which represents the label of the monitored websites. Features will be the remaining columns.

In [None]:
# Target: the 'label' column; features: every remaining column
y_initial = closed_world_df['label']
X_initial = closed_world_df.drop(columns='label')

# Constructing Model with all features

Here, we construct a model using all of the features.

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
initial_split = train_test_split(X_initial, y_initial, test_size=0.2, random_state=42)
X_initial_train, X_initial_test, y_initial_train, y_initial_test = initial_split

In [None]:
# Hyperparameters of the (untuned) random forest trained on every feature
all_features_params = dict(
    n_estimators=100,
    criterion="entropy",
    max_depth=100,
    min_samples_split=2,
    max_features="sqrt",
    random_state=42,
)
clf_all_features = RandomForestClassifier(**all_features_params)

# Fit on the training split; the fitted estimator is shown as the cell output
clf_all_features.fit(X_initial_train, y_initial_train)

In [None]:
# Measure how long the all-features model takes to predict the training set and
# how much the process memory changes while it does so.

# Handle on the current Python process, used to read its memory usage
process = psutil.Process(os.getpid())
# Resident set size before predicting, converted from bytes to MB
memory_before = process.memory_info().rss / (1024 * 1024)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted
start_time = time.perf_counter()
# Make predictions on the train set
y_train_pred = clf_all_features.predict(X_initial_train)
end_time = time.perf_counter()
# Resident set size after predicting, in MB
memory_after = process.memory_info().rss / (1024 * 1024)

time_taken_all_features_train = end_time - start_time
# NOTE: this is the change in RSS across the call, not the peak usage during it
memory_used_all_features_train = memory_after - memory_before

In [None]:
# Training-set report for the all-features model: resource usage first, then quality
print("Memory used:", memory_used_all_features_train, "MB")
print("Time taken to predict:", time_taken_all_features_train, "seconds")
accuracy_all_features_train = accuracy_score(y_initial_train, y_train_pred)
print("Model Accuracy:", accuracy_all_features_train)
report_all_features_train = classification_report(y_initial_train, y_train_pred)
print(report_all_features_train)

In [None]:
# Measure how long the all-features model takes to predict the test set and
# how much the process memory changes while it does so.

# Handle on the current Python process, used to read its memory usage
process = psutil.Process(os.getpid())
# Resident set size before predicting, converted from bytes to MB
memory_before = process.memory_info().rss / (1024 * 1024)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted
start_time = time.perf_counter()
# Make predictions on the test set
y_initial_pred = clf_all_features.predict(X_initial_test)
end_time = time.perf_counter()
# Resident set size after predicting, in MB
memory_after = process.memory_info().rss / (1024 * 1024)

time_taken_all_features_test = end_time - start_time
# NOTE: this is the change in RSS across the call, not the peak usage during it
memory_used_all_features_test = memory_after - memory_before

In [None]:
# Test-set report for the all-features model: resource usage first, then quality
print("Memory used:", memory_used_all_features_test, "MB")
print("Time taken to predict:", time_taken_all_features_test, "seconds")
accuracy_all_features_test = accuracy_score(y_initial_test, y_initial_pred)
print("Model Accuracy:", accuracy_all_features_test)
report_all_features_test = classification_report(y_initial_test, y_initial_pred)
print(report_all_features_test)

# Feature Importance

In [None]:
# Wrap the training features in a DataFrame so that its column names can be
# reused as the index of the feature-importance series built below
df_X = pd.DataFrame(X_initial_train)

In [None]:
# Preview the first few rows of the training features
df_X.head()

In [None]:
# Forest whose splits are chosen with the entropy criterion
entropy_forest_params = dict(
    n_estimators=100,
    criterion="entropy",
    max_depth=100,
    min_samples_split=2,
    max_features="sqrt",
    random_state=42,
)
model_1 = RandomForestClassifier(**entropy_forest_params)
model_1.fit(X_initial_train, y_initial_train)
# Importance score per feature, indexed by the feature's column name
feature_imp_1 = pd.Series(model_1.feature_importances_, index=df_X.columns)

In [None]:
# Forest whose splits are chosen with the gini criterion (all other settings
# match the entropy forest so the two importance rankings are comparable)
gini_forest_params = dict(
    n_estimators=100,
    criterion="gini",
    max_depth=100,
    min_samples_split=2,
    max_features="sqrt",
    random_state=42,
)
model_2 = RandomForestClassifier(**gini_forest_params)
model_2.fit(X_initial_train, y_initial_train)
# Importance score per feature, indexed by the feature's column name
feature_imp_2 = pd.Series(model_2.feature_importances_, index=df_X.columns)

In [None]:
# Grouped horizontal bar chart comparing entropy-based and gini-based importances
feature_names = df_X.columns
entropy_values = feature_imp_1.values
gini_values = feature_imp_2.values

bar_width = 0.4
y_positions = range(len(feature_names))
half_width = bar_width / 2

fig, ax = plt.subplots(figsize=(10, 6))

# One (offset, values, legend label, colour) entry per criterion; the offsets put
# the two bars of a feature on either side of that feature's tick position
bar_series = [
    (-half_width, entropy_values, 'Entropy', 'blue'),
    (+half_width, gini_values, 'Gini', 'orange'),
]
for bar_offset, bar_values, bar_label, bar_color in bar_series:
    ax.barh(
        [pos + bar_offset for pos in y_positions],
        bar_values,
        bar_width,
        label=bar_label,
        color=bar_color,
    )

ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_yticks(y_positions)
ax.set_yticklabels(feature_names)
ax.set_title('Visualisation of Feature Importance: Entropy vs Gini')
ax.legend()
fig.tight_layout()
plt.show()

Recall the extracted features:
> **Feature Group 1: Traffic Volume (Absolute)**  
> - Feature 1: Number of incoming packets  
> - Feature 2: Number of outgoing packets  
> - Feature 3: Total number of packets  
> 
> **Feature Group 2: Traffic Volume (Fraction)**
> - Feature 4: Number of incoming packets as a fraction of the total number of packets  
> - Feature 5: Number of outgoing packets as a fraction of the total number of packets 
> 
> **Feature Group 3: Traffic Ordering List**
> - Feature 6: Standard deviation of the outgoing packets ordering list  
> - Feature 7: Average of the outgoing packets ordering list  
> 
> **Feature Group 4: Traffic concentration** 
> - Feature 8: Sum of all items in the alternative concentration feature list  
> - Feature 9: Average of all items in the alternative concentration feature list  

We noted that within each of the 4 feature groups, the features are likely to be highly correlated because they measure closely related quantities. Furthermore, feature groups 1 and 2 are closely related to each other, with group 1 being an absolute measurement of traffic volume and group 2 expressing the same volume as a fraction of the total. Hence we will be selecting 2 features from groups 1 and 2 combined, and 1 feature each from groups 3 and 4.

According to our feature importance analysis, we have selected the features to be
1. Feature 2: Number of outgoing packets   
2. Feature 3: Total number of packets
3. Feature 7: Average of the outgoing packets ordering list 
4. Feature 8: Sum of all items in the alternative concentration feature list  

In [None]:
# Columns chosen from the feature-importance analysis above
selected_features = [
    'outgoing_packet_counts',
    'total_packet_counts',
    'avg_outgoing_order',
    'sum_concentration',
]
X = closed_world_df[selected_features]
y = closed_world_df['label']

# Constructing Model with selected features

In this section, we construct a baseline random forest classification model using arbitrarily chosen (untuned) parameters. This section aims to explore the implementation of the model. These parameters will be tuned in the next section. 

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
selected_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = selected_split

In [None]:
# DataFrame of the selected training features, previewed in the next cell
df = pd.DataFrame(X_train)

In [None]:
# Preview the first few rows of the selected training features
df.head()

In [None]:
# Hyperparameters of the (untuned) random forest trained on the selected features
selected_features_params = dict(
    n_estimators=100,
    criterion="entropy",
    max_depth=100,
    min_samples_split=2,
    max_features="sqrt",
    random_state=42,
)
clf_selected_features = RandomForestClassifier(**selected_features_params)

# Fit on the training split; the fitted estimator is shown as the cell output
clf_selected_features.fit(X_train, y_train)

In [None]:
# Measure how long the selected-features model takes to predict the training set
# and how much the process memory changes while it does so.

# Handle on the current Python process, used to read its memory usage
process = psutil.Process(os.getpid())
# Resident set size before predicting, converted from bytes to MB
memory_before = process.memory_info().rss / (1024 * 1024)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted
start_time = time.perf_counter()
# Make predictions on the train set
y_train_pred = clf_selected_features.predict(X_train)
end_time = time.perf_counter()
# Resident set size after predicting, in MB
memory_after = process.memory_info().rss / (1024 * 1024)

time_taken_selected_features_train = end_time - start_time
# NOTE: this is the change in RSS across the call, not the peak usage during it
memory_used_selected_features_train = memory_after - memory_before

In [None]:
# Training-set report for the selected-features model: resource usage, then quality
print("Memory used:", memory_used_selected_features_train, "MB")
print("Time taken to predict:", time_taken_selected_features_train, "seconds")
accuracy_selected_features_train = accuracy_score(y_train, y_train_pred)
print("Model Accuracy:", accuracy_selected_features_train)
report_selected_features_train = classification_report(y_train, y_train_pred)
print(report_selected_features_train)

In [None]:
# Measure how long the selected-features model takes to predict the test set
# and how much the process memory changes while it does so.

# Handle on the current Python process, used to read its memory usage
process = psutil.Process(os.getpid())
# Resident set size before predicting, converted from bytes to MB
memory_before = process.memory_info().rss / (1024 * 1024)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted
start_time = time.perf_counter()
# Make predictions on the test set
y_test_pred = clf_selected_features.predict(X_test)
end_time = time.perf_counter()
# Resident set size after predicting, in MB
memory_after = process.memory_info().rss / (1024 * 1024)

time_taken_selected_features_test = end_time - start_time
# NOTE: this is the change in RSS across the call, not the peak usage during it
memory_used_selected_features_test = memory_after - memory_before

In [None]:
# Test-set report for the selected-features model: resource usage, then quality
print("Memory used:", memory_used_selected_features_test, "MB")
print("Time taken to predict:", time_taken_selected_features_test, "seconds")
accuracy_selected_features_test = accuracy_score(y_test, y_test_pred)
print("Model Accuracy:", accuracy_selected_features_test)
report_selected_features_test = classification_report(y_test, y_test_pred)
print(report_selected_features_test)

# Model Tuning

In this section, we will be using Grid Search to tune our model parameters for our Random Forest Classifier

In [None]:
# Candidate values per hyperparameter. The grid has 3*2*2*2*2*1*2 = 96
# combinations, so with 5-fold cross-validation the search runs 480 fits
# (plus one final refit of the best combination).
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [10, 20],
    'min_samples_split': [10, 15],
    'min_samples_leaf': [5, 10],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True],
    'criterion': ['gini', 'entropy'],
}

# Seeded base estimator so every candidate is trained reproducibly
base_forest = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid using all CPU cores; refit=True retrains the
# best combination so `grid` can be used directly for prediction
grid = GridSearchCV(
    base_forest,
    param_grid,
    cv=5,
    n_jobs=-1,
    refit=True,
    verbose=3,
)

In [None]:
# Fit the grid search on the training split and record how long it takes.
# perf_counter() is a monotonic clock intended for measuring intervals, whereas
# time.time() follows the wall clock and can jump during a long-running search.
start_time = time.perf_counter()
grid.fit(X_train, y_train)
end_time = time.perf_counter()
time_taken_grid_search = end_time - start_time

In [None]:
# Print the elapsed time (in seconds) measured around grid.fit in the previous cell
print("Time taken for grid search:", time_taken_grid_search, "seconds")

In [None]:
# Print the hyperparameter combination selected by the grid search
print("Best parameters found:", grid.best_params_)

In [None]:
# Measure how long the tuned model takes to predict the training set and how
# much the process memory changes while it does so.

# Handle on the current Python process, used to read its memory usage
process = psutil.Process(os.getpid())
# Resident set size before predicting, converted from bytes to MB
memory_before = process.memory_info().rss / (1024 * 1024)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted
start_time = time.perf_counter()
# Make predictions on the train set (grid was built with refit=True, so it
# predicts with the refitted best estimator)
y_train_pred = grid.predict(X_train)
end_time = time.perf_counter()
# Resident set size after predicting, in MB
memory_after = process.memory_info().rss / (1024 * 1024)

time_taken_selected_features_tuned_train = end_time - start_time
# NOTE: this is the change in RSS across the call, not the peak usage during it
memory_used_selected_features_tuned_train = memory_after - memory_before

In [None]:
# Training-set report for the tuned model: resource usage first, then quality
print("Memory used:", memory_used_selected_features_tuned_train, "MB")
print("Time taken to predict:", time_taken_selected_features_tuned_train, "seconds")
accuracy_tuned_train = accuracy_score(y_train, y_train_pred)
print("Model Accuracy:", accuracy_tuned_train)
report_tuned_train = classification_report(y_train, y_train_pred)
print(report_tuned_train)

In [None]:
# Measure how long the tuned model takes to predict the test set and how much
# the process memory changes while it does so.

# Handle on the current Python process, used to read its memory usage
process = psutil.Process(os.getpid())
# Resident set size before predicting, converted from bytes to MB
memory_before = process.memory_info().rss / (1024 * 1024)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted
start_time = time.perf_counter()
# Make predictions on the test set (grid was built with refit=True, so it
# predicts with the refitted best estimator)
y_test_pred = grid.predict(X_test)
end_time = time.perf_counter()
# Resident set size after predicting, in MB
memory_after = process.memory_info().rss / (1024 * 1024)

time_taken_selected_features_tuned_test = end_time - start_time
# NOTE: this is the change in RSS across the call, not the peak usage during it
memory_used_selected_features_tuned_test = memory_after - memory_before

In [None]:
# Test-set report for the tuned model: resource usage first, then quality
print("Memory used:", memory_used_selected_features_tuned_test, "MB")
print("Time taken to predict:", time_taken_selected_features_tuned_test, "seconds")
accuracy_tuned_test = accuracy_score(y_test, y_test_pred)
print("Model Accuracy:", accuracy_tuned_test)
report_tuned_test = classification_report(y_test, y_test_pred)
print(report_tuned_test)