# Importing Libraries

In [1]:
import pandas as pd # data processing, read CSV file
import matplotlib.pyplot as plt # graph plotting
import numpy as np # linear algebra
import os # access directory structure
import ast # Abstract Syntax Tree
import math
import pickle

# ML Models (SciKit Learn)
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Importing k-fold cross validation
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, confusion_matrix, precision_score, recall_score, auc, accuracy_score, f1_score, classification_report

# Dataset

## Reading

In [3]:
df = pd.read_csv('dataset.csv')

## Initial Preprocessing

In [5]:
df.head()

Unnamed: 0,cachestat_HITS,cachestat_BUFFERS(MB),cachestat_CACHED(MB),pidpersec_PID/s,biopattern_RND(%),biopattern_SEQ(%),biopattern_COUNT,biopattern_KBYTES,cpuunclaimed_CPU(%),ramusage_USED(%),tcpstates_NEWSTATE,miner
0,-0.131878,-1.479101,-1.706736,-0.114224,0.0,0.0,-0.065106,-0.05447,0.0,19.8,1,0
1,-0.131878,-1.479101,-1.706736,-0.114224,0.0,0.0,-0.065106,-0.05447,0.0,25.1,1,0
2,-0.131878,-1.479101,-1.706736,-0.114224,98.0,2.0,0.167004,-0.015849,0.0,28.0,1,0
3,0.580189,-0.137269,0.865642,0.002309,95.0,5.0,-0.018834,-0.043874,0.0,29.7,1,0
4,0.567372,-0.137269,0.866377,0.002309,42.0,58.0,-0.05615,-0.048415,0.0,31.7,1,0


In [None]:
df.drop('bindsnoop_PROT_TCP', axis = 1, inplace = True)
df.drop('bindsnoop_PROT_UDP', axis = 1, inplace = True)

### Splitting Train and Test Data

In [None]:
# Defining Input and Output of the Model using .iloc[rows, columns] function from pandas library.
x = df.iloc[ : , 0:11] # [ : , 0:6] ==> [all rows, column index 0 to 5] (excludes the '6th' index)
y = df.iloc[ : , 11] # ... ==> [all rows, column index 6]; [ : , 6:7] will also perform the same function

In [None]:
x.head()

In [None]:
y.head()

In [None]:
# Splitting Train and Test Data
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size = 0.2, random_state = 1)
# Test Size = 20% of Dataset,
# Random State --> corresponds a random data split with the number, each time this random state no. (1) is used, same data split will be used.
# Using random_state is optional but helps in understanding/explaining what (random) data set was used and enables recreatability of same results.

## Initialising and Training the Models

In [None]:
# Instantiating the Models With Default Parameters

# modelLR = LogisticRegression()
modelNN = MLPClassifier()
modelRF = RandomForestClassifier()
modelDT = DecisionTreeClassifier()
modelSV = SVC()

In [None]:
# Training Models
# modelLR.fit(Xtrain, Ytrain)
modelNN.fit(Xtrain, Ytrain)
modelRF.fit(Xtrain, Ytrain)
modelDT.fit(Xtrain, Ytrain)
modelSV.fit(Xtrain, Ytrain)
# This will take time depending on the size of the data set.

# Printing rsquared-score --> Measures the error in predictions.
# NOTE: To improve predictions --> 1. Improve data set, 2. Play around with hyperparameters of each model.
print("R^2 Scores:")
# print("Linear Regression: ", modelLR.score(Xtest, Ytest))
print("MLP Regressor (Neural Network): ", modelNN.score(Xtest, Ytest))
print("Random Forest: ", modelRF.score(Xtest, Ytest))
print("Decision Tree: ", modelDT.score(Xtest, Ytest))
print("Support Vector Machine: ", modelSV.score(Xtest, Ytest))

### Testing Initial Model

In [None]:
# Testing Models
# This time model will not see the Y data, Ytest will be used to compare the results.
RFpred = modelRF.predict(Xtest)
DTpred = modelDT.predict(Xtest)
SVpred = modelSV.predict(Xtest)
NNpred = modelNN.predict(Xtest)

print("Random Forest Results:")
print("Mean Squared Error: ", mean_squared_error(RFpred, Ytest))
print("Mean Absolute Error: ", mean_absolute_error(RFpred, Ytest))
print("Confusion Matrix:\n", confusion_matrix(RFpred, Ytest))
print("Random Forest Classification Report:\n", classification_report(RFpred, Ytest))

print("")

print("Decision Tree Results:")
print("Mean Squared Error: ", mean_squared_error(DTpred, Ytest))
print("Mean Absolute Error: ", mean_absolute_error(DTpred, Ytest))
print("Confusion Matrix:\n", confusion_matrix(DTpred, Ytest))
print("Decision Tree Classification Report:\n", classification_report(DTpred, Ytest))

print("")

print("Support Vector Machine Results:")
print("Mean Squared Error: ", mean_squared_error(SVpred, Ytest))
print("Mean Absolute Error: ", mean_absolute_error(SVpred, Ytest))
print("Confusion Matrix:\n", confusion_matrix(SVpred, Ytest))
print("Decision Tree Classification Report:\n", classification_report(SVpred, Ytest))

print("")

print("Multi-Layer Perceptron Results:")
print("Mean Squared Error: ", mean_squared_error(SVpred, Ytest))
print("Mean Absolute Error: ", mean_absolute_error(SVpred, Ytest))
print("Confusion Matrix:\n", confusion_matrix(SVpred, Ytest))
print("Decision Tree Classification Report:\n", classification_report(SVpred, Ytest))

### Feature Importance

In [None]:
print("RF: ", modelRF.feature_importances_)
print("DT: ", modelDT.feature_importances_)
# print("SVM: ", modelSV.feature_importances_)
# print("NN:", modelNN.feature_importances_)

## Further Processing Data

In [7]:
df.drop('pidpersec_PID/s', axis = 1, inplace = True)
df.drop('biopattern_RND(%)', axis = 1, inplace = True)
df.drop('biopattern_SEQ(%)', axis = 1, inplace = True)
df.drop('cachestat_HITS', axis = 1, inplace = True)
df.drop('biopattern_COUNT', axis = 1, inplace = True)
df.drop('tcpstates_NEWSTATE', axis = 1, inplace = True)
df.drop('biopattern_KBYTES', axis = 1, inplace = True)

In [9]:
df.head()

Unnamed: 0,cachestat_BUFFERS(MB),cachestat_CACHED(MB),cpuunclaimed_CPU(%),ramusage_USED(%),miner
0,-1.479101,-1.706736,0.0,19.8,0
1,-1.479101,-1.706736,0.0,25.1,0
2,-1.479101,-1.706736,0.0,28.0,0
3,-0.137269,0.865642,0.0,29.7,0
4,-0.137269,0.866377,0.0,31.7,0


## Splitting for training and testing, training, and prediction

In [15]:
xnew = df.iloc[ : , 0:4] # [ : , 0:6] ==> [all rows, column index 0 to 5] (excludes the '6th' index)
ynew = df.iloc[ : , 4] # ... ==> [all rows, column index 6]; [ : , 6:7] will also perform the same function

Xtrainnew, Xtestnew, Ytrainnew, Ytestnew = train_test_split(xnew, ynew, test_size = 0.2, random_state = 1)

modelRFnew = RandomForestClassifier()
modelDTnew = DecisionTreeClassifier()

modelRFnew.fit(Xtrainnew, Ytrainnew)
modelDTnew.fit(Xtrainnew, Ytrainnew)

RFprednew = modelRFnew.predict(Xtestnew)
DTprednew = modelDTnew.predict(Xtestnew)

In [17]:
print("Random Forest Results:")
print("Mean Squared Error: ", mean_squared_error(RFprednew, Ytestnew))
print("Mean Absolute Error: ", mean_absolute_error(RFprednew, Ytestnew))
print("Confusion Matrix:\n", confusion_matrix(RFprednew, Ytestnew))
print("Random Forest Classification Report:\n", classification_report(RFprednew, Ytestnew))
print("Feature Importance: ", modelRFnew.feature_importances_)
print("")

print("Decision Tree Results:")
print("Mean Squared Error: ", mean_squared_error(DTprednew, Ytestnew))
print("Mean Absolute Error: ", mean_absolute_error(DTprednew, Ytestnew))
print("Confusion Matrix:\n", confusion_matrix(DTprednew, Ytestnew))
print("Decision Tree Classification Report:\n", classification_report(DTprednew, Ytestnew))
print("Feature Importance: ", modelDTnew.feature_importances_)

Random Forest Results:
Mean Squared Error:  0.005697081982399259
Mean Absolute Error:  0.005697081982399259
Confusion Matrix:
 [[43908   154]
 [  215 20493]]
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     44062
           1       0.99      0.99      0.99     20708

    accuracy                           0.99     64770
   macro avg       0.99      0.99      0.99     64770
weighted avg       0.99      0.99      0.99     64770

Feature Importance:  [0.32059159 0.38133617 0.11655016 0.18152207]

Decision Tree Results:
Mean Squared Error:  0.008059286706808708
Mean Absolute Error:  0.008059286706808708
Confusion Matrix:
 [[43864   263]
 [  259 20384]]
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     44127
           1       0.99      0.99      0.99     20643

    accuracy                           0.99     6

## Defining and Running Parameter Grids

In [19]:
param_grid = {
    'max_depth': [None, 3, 9, 15, 18, 25],
    'min_samples_split': [2, 5, 7, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Initialize the model
dtc = DecisionTreeClassifier(random_state=1)

# Perform GridSearchCV
grid_search = GridSearchCV(dtc, param_grid, cv=5, scoring='accuracy')
grid_search.fit(Xtrainnew, Ytrainnew)

# Display the best parameters and corresponding score
print(f"Best Parameters for Decision Tree: {grid_search.best_params_}")
print(f"Best Cross-Validation Score for Decision Tree: {grid_search.best_score_:.2f}")

# Get the best model
best_dtc = grid_search.best_estimator_ #Initialising a new model with the best determined hyperparameters.

param_grid_rf = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None],
    'criterion': ['gini', 'entropy']  # Optional
}

rfc = RandomForestClassifier(random_state = 1)

grid_search_rf = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy')
grid_search_rf.fit(Xtrainnew, Ytrainnew)

# Display the best parameters and corresponding score
print(f"Best Parameters for Random Forest: {grid_search_rf.best_params_}")
print(f"Best Cross-Validation Score for Random Forest: {grid_search_rf.best_score_:.2f}")

# Get the best model
best_rfc = grid_search_rf.best_estimator_ #Initialising a new model with the best determined hyperparameters.

Best Parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Cross-Validation Score for Decision Tree: 0.99


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters for Random Forest: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Cross-Validation Score for Random Forest: 0.99


# Final Test

In [22]:
Xtrainnew, Xtestnew, Ytrainnew, Ytestnew = train_test_split(xnew, ynew, test_size = 0.99, random_state = 2)

DTCpred = best_dtc.predict(Xtestnew)
RFCpred = best_rfc.predict(Xtestnew)

In [26]:
print("Random Forest Results:")
print("Mean Squared Error: ", mean_squared_error(RFCpred, Ytestnew))
print("Mean Absolute Error: ", mean_absolute_error(RFCpred, Ytestnew))
print("Confusion Matrix:\n", confusion_matrix(RFCpred, Ytestnew))
print("Random Forest Classification Report:\n", classification_report(RFCpred, Ytestnew))
print("")

print("Decision Tree Results:")
print("Mean Squared Error: ", mean_squared_error(DTCpred, Ytestnew))
print("Mean Absolute Error: ", mean_absolute_error(DTCpred, Ytestnew))
print("Confusion Matrix:\n", confusion_matrix(DTCpred, Ytestnew))
print("Decision Tree Classification Report:\n", classification_report(DTCpred, Ytestnew))

Random Forest Results:
Mean Squared Error:  0.001072961373390558
Mean Absolute Error:  0.001072961373390558
Confusion Matrix:
 [[218021    152]
 [   192 102243]]
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    218173
           1       1.00      1.00      1.00    102435

    accuracy                           1.00    320608
   macro avg       1.00      1.00      1.00    320608
weighted avg       1.00      1.00      1.00    320608


Decision Tree Results:
Mean Squared Error:  0.001621918355125262
Mean Absolute Error:  0.001621918355125262
Confusion Matrix:
 [[217948    255]
 [   265 102140]]
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    218203
           1       1.00      1.00      1.00    102405

    accuracy                           1.00    320608
   macro avg       1.00      1.00      1.00    320608
