Hyperparameter tuning of Random Forest Classifier (RISS DATASET)

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

KeyboardInterrupt: 

GridSearchCV (without pre, rfc, svm(Scroll a bit))

In [2]:
dataset_path = r"E:\OneDrive\Desktop\CLICK\Amrita\SEM 5\PROJECTS\Project_comp_sec\RISS_RansomwareDataset.csv"   
df = pd.read_csv(dataset_path)

In [3]:
df.head(2)

Unnamed: 0,10001,1,2,0,0.1,0.2,0.3,0.4,0.5,0.6,...,0.30925,0.30926,0.30927,0.30928,0.30929,0.30930,0.30931,0.30932,0.30933,0.30934
0,10002,1,3,1,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,10003,1,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# All columns except the (1st, 2nd, 3rd) as features
X = df.drop(df.columns[[0,1,2]], axis=1)

#2nd column (index 1) as the target
y = df.iloc[:, 1]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
rfc = RandomForestClassifier(random_state=42)


In [7]:
# Define the parameter grid for rfc
param_grid = {
    'n_estimators': [50,100, 200], 
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)  # 5-fold cross-validation

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Output is best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best Score: 0.9597719759832692


In [8]:
# Define the Random Forest Classifier again as best score is not same as accuracy
rfc = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, random_state=42)    
# Fit the model to the training data
rfc.fit(X_train, y_train)

#predictions on the test set
y_pred = rfc.predict(X_test)

In [9]:
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
#Distinguishing between ransomware and benign samples is a binary classification problem, hence the high accuracy.


Test Accuracy: 0.9770491803278688
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       188
           1       0.97      0.97      0.97       117

    accuracy                           0.98       305
   macro avg       0.98      0.98      0.98       305
weighted avg       0.98      0.98      0.98       305



RandomisedSearchCV (RFC and SVM without dimensionality reduct)

In [10]:
#parameter distribution for RandomizedSearchCV
param_dist = {
    'n_estimators': np.arange(50, 201, 10),  # Testing from 50 to 200 trees
    'max_depth': [None] + list(np.arange(10, 31, 5)),  # Testing various max depths
    'min_samples_split': np.arange(2, 11)  # Testing values from 2 to 10
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rfc, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Output the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Best Parameters: {'n_estimators': 190, 'min_samples_split': 3, 'max_depth': None}
Best Score: 0.9622276192403696


In [11]:
rfc = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, random_state=42)    
# Fitting the model to the training data
rfc.fit(X_train, y_train)

# redictions on the test set
y_pred = rfc.predict(X_test)

#Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
#Distinguishing between ransomware and benign samples is a binary classification problem, hence the high accuracy.

Test Accuracy: 0.9770491803278688
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       188
           1       0.97      0.97      0.97       117

    accuracy                           0.98       305
   macro avg       0.98      0.98      0.98       305
weighted avg       0.98      0.98      0.98       305



In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#  SVM model
svm = SVC(random_state=42)

# Defining the parameter grid for RandomizedSearchCV
param_dist = {
    'C': np.logspace(-3, 3, 10),            # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],    # Kernel type
    'gamma': ['scale', 'auto'],             # Kernel coefficient for 'rbf' and 'poly'
    'degree': [2, 3, 4],                    # Degree for 'poly' kernel
}

# Setting up the RandomizedSearchCV with 5-fold cross-validation
random_search = RandomizedSearchCV(estimator=svm, param_distributions=param_dist, 
                                   n_iter=20, cv=5, scoring='accuracy', n_jobs=-1, 
                                   verbose=3, random_state=42)

# Fitting the RandomizedSearchCV on the scaled training data
random_search.fit(X_train_scaled, y_train)

# best parameters from RandomizedSearchCV
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Use the best estimator to predict on the test set
best_svm = random_search.best_estimator_
y_pred_svm = best_svm.predict(X_test_scaled)

# Evaluating the tuned model
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy (Tuned): {:.2f}%".format(svm_accuracy * 100))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'C': 215.44346900318823}
SVM Accuracy (Tuned): 89.51%

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.84      0.91       188
           1       0.79      0.98      0.88       117

    accuracy                           0.90       305
   macro avg       0.89      0.91      0.89       305
weighted avg       0.91      0.90      0.90       305



GriSearchCV (SVM and RFC, without pre)

In [13]:
# Define the parameter grid for svm
param_grid = {
    'C': [0.1, 1, 10,100],       # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'gamma': ['scale', 'auto'],    # Kernel coefficient for 'rbf' or 'poly'
    'degree': [2, 3, 4]            # Degree for 'poly' kernel
}

In [14]:
from sklearn.svm import SVC

svc = SVC()

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")

# Test set accuracy with best parameters
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
Best Cross-Validation Score: 0.9753659852931256
Test Accuracy: 0.9836065573770492


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Feature scaling (optional, depending on your dataset)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the Random Forest Classifier
rfc = RandomForestClassifier(random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 300, 500],      # Number of trees in the forest
    'max_depth': [10, 20, None],          # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],      # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],        # Minimum samples required to be at a leaf node
}

# Set up the GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)

# Fit the GridSearchCV on the scaled training data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters from the GridSearch
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best estimator to predict on the test set
best_rfc = grid_search.best_estimator_
y_pred_rfc = best_rfc.predict(X_test_scaled)

# Evaluate the tuned model
rfc_accuracy = accuracy_score(y_test, y_pred_rfc)
print("Random Forest Classifier Accuracy (Tuned): {:.2f}%".format(rfc_accuracy * 100))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rfc))


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Random Forest Classifier Accuracy (Tuned): 97.38%

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98       188
           1       0.96      0.97      0.97       117

    accuracy                           0.97       305
   macro avg       0.97      0.97      0.97       305
weighted avg       0.97      0.97      0.97       305



Dimensionlaity reduct_(umap)

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import umap

dataset_path = r"E:\OneDrive\Desktop\CLICK\Amrita\SEM 5\PROJECTS\Project_comp_sec\RISS_RansomwareDataset.csv"
df = pd.read_csv(dataset_path)

# Preprocessing - remove irrelevant columns
# first column is sample ID and second column is the label
X = df.iloc[:, 3:]  # Features starting from column 3
y = df.iloc[:, 1]   # Labels (benign or ransomware)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply UMAP for dimensionality reduction with higher dimensions
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=500, random_state=42, verbose=3)

# Fit and transform the training data
X_train_umap = umap_model.fit_transform(X_train_scaled)

# Optional: Transform the test set
X_test_umap = umap_model.transform(X_test_scaled)

# The reduced datasets (X_train_umap, X_test_umap) can now be used for further processing or classification


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


UMAP(n_components=500, n_jobs=1, random_state=42, verbose=3)
Sun Sep 29 17:12:29 2024 Construct fuzzy simplicial set
Sun Sep 29 17:12:41 2024 Finding Nearest Neighbors
Sun Sep 29 17:12:41 2024 Finished Nearest Neighbor Search
Sun Sep 29 17:12:41 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sun Sep 29 17:13:06 2024 Finished embedding


Epochs completed:   0%|            0/100 [00:00]

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs


RFC after applying umap 

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the UMAP-reduced training data
rfc.fit(X_train_umap, y_train)

# Predict on the test set
y_pred_rfc = rfc.predict(X_test_umap)

# Evaluate the model
rfc_accuracy = accuracy_score(y_test, y_pred_rfc)
print("Random Forest Classifier Accuracy: {:.2f}%".format(rfc_accuracy * 100))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rfc))


Random Forest Classifier Accuracy: 59.02%

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.49      0.60       188
           1       0.48      0.74      0.58       117

    accuracy                           0.59       305
   macro avg       0.62      0.62      0.59       305
weighted avg       0.65      0.59      0.59       305



SVM after applying umap (dimensionality reduct)

In [24]:
from sklearn.svm import SVC

# Initialize the Support Vector Classifier with an RBF kernel
svm = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)

# Fit the model on the UMAP-reduced training data
svm.fit(X_train_umap, y_train)

# Predict on the test set
y_pred_svm = svm.predict(X_test_umap)

# Evaluate the model
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Classifier Accuracy: {:.2f}%".format(svm_accuracy * 100))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))


SVM Classifier Accuracy: 79.34%

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.97      0.85       188
           1       0.92      0.50      0.65       117

    accuracy                           0.79       305
   macro avg       0.84      0.74      0.75       305
weighted avg       0.82      0.79      0.78       305



Applying MCA ()

In [26]:
import mca

# Apply MCA to the binary features
mca_ben = mca.MCA(X, ncols=500)

# Get the transformed dataset (with reduced dimensions)
X_mca = pd.DataFrame(mca_ben.fs_r(N=200))  # Keeping 2 dimensions for simplicity

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_mca, y, test_size=0.2, random_state=42)

# Use a Random Forest classifier (you can use any model really!!!)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict the target on the test set
y_pred = clf.predict(X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")          
 
# Optionally, print the explained variance of the dimensions
print("Explained Variance (inertia) per dimension:", mca_ben.L)


MemoryError: Unable to allocate 7.14 GiB for an array with shape (30967, 30967) and data type float64