In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv(r"E:\OneDrive\Desktop\CLICK\Amrita\SEM 5\PROJECTS\Project_comp_sec\RISS_RansomwareDataset.csv")

# Select all columns except the 1st, 2nd, 3rd as features
X = df.drop(df.columns[[0, 1, 2]], axis=1)

# Select the 2nd column (index 1) as the target
y = df.iloc[:, 1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
print("KNN Accuracy:", knn_accuracy)
print("KNN Classification Report:")
print(classification_report(y_test, knn_pred))

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)    
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
print("\nDecision Tree Accuracy:", dt_accuracy)
print("Decision Tree Classification Report:")
print(classification_report(y_test, dt_pred))

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)
print("\nNaive Bayes Accuracy:", nb_accuracy)
print("Naive Bayes Classification Report:")
print(classification_report(y_test, nb_pred))

KNN Accuracy: 0.9114754098360656
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.89      0.93       188
           1       0.85      0.94      0.89       117

    accuracy                           0.91       305
   macro avg       0.90      0.92      0.91       305
weighted avg       0.92      0.91      0.91       305


Decision Tree Accuracy: 0.980327868852459
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       188
           1       0.97      0.97      0.97       117

    accuracy                           0.98       305
   macro avg       0.98      0.98      0.98       305
weighted avg       0.98      0.98      0.98       305


Naive Bayes Accuracy: 0.7967213114754098
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.72      0.81       188
           1       0

PCA APPLICATION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

data = pd.read_csv(r"E:\OneDrive\Desktop\CLICK\Amrita\SEM 5\PROJECTS\Project_comp_sec\RISS_RansomwareDataset.csv")

X = data.drop(data.columns[[0, 1, 2]], axis=1)  # non- fearure columns
y = data['2']  #target column name

# Split the data into training and testing sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Standardize the data (important for PCA)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize PCA
pca = PCA(n_components=0.95)  # Adjust the number of components as necessary
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Explained variance ratio
print(f'Explained variance ratio by components: {pca.explained_variance_ratio_}')

Explained variance ratio by components: [0.04168362 0.04147722 0.03788267 0.03098686 0.02611446 0.02323072
 0.02250129 0.02093016 0.01347111 0.0121865  0.01181374 0.00987653
 0.00985248 0.0093934  0.00890252 0.00833568 0.00798654 0.00793207
 0.00724382 0.00719336 0.00695422 0.00679921 0.00640145 0.00624284
 0.005993   0.0059388  0.00556401 0.00541    0.005315   0.00506815
 0.00494143 0.00468611 0.00457532 0.004544   0.00443668 0.00434655
 0.00434092 0.00416415 0.00411682 0.00389633 0.00381886 0.00376963
 0.00375722 0.0037026  0.00368386 0.00362187 0.00359995 0.00357969
 0.00353479 0.00347801 0.0033896  0.00324849 0.00322762 0.00319816
 0.00317276 0.00311164 0.00309819 0.00305757 0.00303019 0.00300363
 0.00297987 0.0029595  0.00292967 0.00288802 0.00282618 0.00280619
 0.0027909  0.00276925 0.00265096 0.00264922 0.00262962 0.00259682
 0.00253708 0.00251232 0.0025045  0.002479   0.00244037 0.00243231
 0.00232959 0.00231195 0.00228932 0.00227925 0.00226809 0.00224347
 0.00222527 0.00221158

Evaluation of PCA with models

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_pca, y_train)
y_pred_rf = rf_model.predict(X_test_pca)
print("\nRandom Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

# Support Vector Classifier
svc_model = SVC(kernel='linear', random_state=42)
svc_model.fit(X_train_pca, y_train)
y_pred_svc = svc_model.predict(X_test_pca)
print("\nSVC Accuracy:", accuracy_score(y_test, y_pred_svc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svc))
print("Classification Report:\n", classification_report(y_test, y_pred_svc))

# K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_pca, y_train)
y_pred_knn = knn_model.predict(X_test_pca)
print("\nKNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))

# Logistic Regression
logreg_model = LogisticRegression(max_iter=200, random_state=42)
logreg_model.fit(X_train_pca, y_train)
y_pred_logreg = logreg_model.predict(X_test_pca)
print("\nLogistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))
print("Classification Report:\n", classification_report(y_test, y_pred_logreg))


NameError: name 'y_train' is not defined