In [None]:
import pandas
import os
import numpy
import plotly.express
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, balanced_accuracy_score, matthews_corrcoef, average_precision_score
from sklearn.tree import plot_tree
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
import xgboost
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC



In [None]:
# Read the first attack capture into memory. skipinitialspace drops the blank
# that follows each comma so values and headers carry no leading space.
df = pandas.read_csv(
    "data/Network datatset/csv/attack_1.csv",
    sep=',',
    encoding="us-ascii",
    skipinitialspace=True,
)

In [None]:
# Count the rows of each traffic class. value_counts() orders the classes from
# most to least frequent, so the first entry is the majority class.
nb_attaque = df['label'].value_counts()
print(nb_attaque)

# Share of rows that fall outside the majority class (assumed to be normal
# traffic -- confirm against the counts printed above). Summing the tail rather
# than indexing positions 1..3 keeps the ratio correct for any number of attack
# classes and avoids an IndexError when fewer than four labels are present.
ratio = nb_attaque.iloc[1:].sum() / nb_attaque.sum()
print(ratio*100)

In [None]:
# XGBoost learning curve: for every target sample size, draw a subset that keeps
# the attack ratio of the full capture, train on 80 % of it and record recall
# and accuracy on the remaining 20 %.
train_sizes = [100, 500,1000,5000,10000,100000]
reduced_datasets = {}

recall_scores = []
accuracy_scores = []

for size in train_sizes:

    # Seeded draws so that Restart & Run All reproduces the same subsets.
    # The +1 guarantees at least one attack row even when size * ratio rounds to 0.
    fraudulent_entries = df[df['label_n'] == 1].sample(n=int(size * ratio) +1, random_state=42)
    non_fraudulent_entries = df[df['label_n'] == 0].sample(n=int(size * (1-ratio)), random_state=42)

    df_reduced = pandas.concat([fraudulent_entries, non_fraudulent_entries])

    # "[12]" -> "12"; anything that still is not a number becomes NaN and is
    # then replaced by the sentinel -1.
    df_reduced['modbus_response'] = df_reduced['modbus_response'].str.replace(r'\[(\d+)\]', r'\1', regex=True)
    df_reduced['modbus_response'] = pandas.to_numeric(df_reduced['modbus_response'], errors='coerce')
    df_reduced['modbus_response'] = df_reduced['modbus_response'].fillna(-1)
    # Rows with a missing value in any other column are discarded.
    df_reduced = df_reduced.dropna()

    reduced_datasets[size] = df_reduced
    # Drop both label columns and the timestamp, then one-hot encode the
    # remaining non-numeric columns.
    X_red=df_reduced.drop(['label_n', 'Time','label'], axis=1)
    X_red = pandas.get_dummies(X_red)
    Y_red=df_reduced['label_n'].copy()

    X_train, X_test, Y_train, Y_test = train_test_split(X_red, Y_red, test_size=0.2, random_state=42)

    # Fit a fresh XGBoost model per size. The original loop re-bound clf to a
    # DecisionTreeClassifier here, so XGBoost was never actually evaluated.
    # NOTE(review): XGBoost rejects feature names containing '[', ']' or '<';
    # rename the dummy columns first if any category value contains them.
    clf = xgboost.XGBClassifier()
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    Y_pred_proba = clf.predict_proba(X_test)

    recall_scores.append(recall_score(Y_test, Y_pred))
    accuracy_scores.append(accuracy_score(Y_test, Y_pred))

In [None]:
# Line chart of the recall and accuracy lists currently in the kernel, plotted
# against the sample sizes.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(train_sizes, recall_scores, label='Recall')
ax.plot(train_sizes, accuracy_scores, label='Accuracy')

ax.set_title('Xgboost for Different Training Set Sizes')
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Score')
ax.legend()
plt.show()

In [None]:
# Decision-tree learning curve: for every target sample size, draw a subset that
# keeps the attack ratio of the full capture, train on 80 % of it and record
# recall and accuracy on the remaining 20 %.
train_sizes = [100, 500,1000,5000,10000,100000]
reduced_datasets = {}

recall_scores = []
accuracy_scores = []

for size in train_sizes:

    # Seeded draws so that Restart & Run All reproduces the same subsets.
    # The +1 guarantees at least one attack row even when size * ratio rounds to 0.
    fraudulent_entries = df[df['label_n'] == 1].sample(n=int(size * ratio) +1, random_state=42)
    non_fraudulent_entries = df[df['label_n'] == 0].sample(n=int(size * (1-ratio)), random_state=42)

    df_reduced = pandas.concat([fraudulent_entries, non_fraudulent_entries])

    # "[12]" -> "12"; anything that still is not a number becomes NaN and is
    # then replaced by the sentinel -1.
    df_reduced['modbus_response'] = df_reduced['modbus_response'].str.replace(r'\[(\d+)\]', r'\1', regex=True)
    df_reduced['modbus_response'] = pandas.to_numeric(df_reduced['modbus_response'], errors='coerce')
    df_reduced['modbus_response'] = df_reduced['modbus_response'].fillna(-1)
    # Rows with a missing value in any other column are discarded.
    df_reduced = df_reduced.dropna()

    reduced_datasets[size] = df_reduced
    # Drop both label columns and the timestamp, then one-hot encode the
    # remaining non-numeric columns.
    X_red=df_reduced.drop(['label_n', 'Time','label'], axis=1)
    X_red = pandas.get_dummies(X_red)
    Y_red=df_reduced['label_n'].copy()

    X_train, X_test, Y_train, Y_test = train_test_split(X_red, Y_red, test_size=0.2, random_state=42)

    # A fresh, seeded tree per size: tie-breaking between equally good splits
    # is random, so the seed is needed for repeatable scores.
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    Y_pred_proba = clf.predict_proba(X_test)

    recall_scores.append(recall_score(Y_test, Y_pred))
    accuracy_scores.append(accuracy_score(Y_test, Y_pred))

In [None]:
# Line chart of the recall and accuracy lists currently in the kernel, plotted
# against the sample sizes.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(train_sizes, recall_scores, label='Recall')
ax.plot(train_sizes, accuracy_scores, label='Accuracy')

ax.set_title('Decision Tree for Different Training Set Sizes')
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Score')
ax.legend()
plt.show()

In [None]:
# 1-nearest-neighbour learning curve: for every target sample size, draw a
# subset that keeps the attack ratio of the full capture, train on 80 % of it
# and record recall and accuracy on the remaining 20 %.
train_sizes = [100, 500,1000,5000,10000,100000]
reduced_datasets = {}

recall_scores = []
accuracy_scores = []

for size in train_sizes:

    # Seeded draws so that Restart & Run All reproduces the same subsets.
    # The +1 guarantees at least one attack row even when size * ratio rounds to 0.
    fraudulent_entries = df[df['label_n'] == 1].sample(n=int(size * ratio) +1, random_state=42)
    non_fraudulent_entries = df[df['label_n'] == 0].sample(n=int(size * (1-ratio)), random_state=42)

    df_reduced = pandas.concat([fraudulent_entries, non_fraudulent_entries])

    # "[12]" -> "12"; anything that still is not a number becomes NaN and is
    # then replaced by the sentinel -1.
    df_reduced['modbus_response'] = df_reduced['modbus_response'].str.replace(r'\[(\d+)\]', r'\1', regex=True)
    df_reduced['modbus_response'] = pandas.to_numeric(df_reduced['modbus_response'], errors='coerce')
    df_reduced['modbus_response'] = df_reduced['modbus_response'].fillna(-1)
    # Rows with a missing value in any other column are discarded.
    df_reduced = df_reduced.dropna()

    reduced_datasets[size] = df_reduced
    # Drop both label columns and the timestamp, then one-hot encode the
    # remaining non-numeric columns.
    X_red=df_reduced.drop(['label_n', 'Time','label'], axis=1)
    X_red = pandas.get_dummies(X_red)
    Y_red=df_reduced['label_n'].copy()

    X_train, X_test, Y_train, Y_test = train_test_split(X_red, Y_red, test_size=0.2, random_state=42)

    # Fit a fresh KNN model per size. The original loop re-bound knn to a
    # DecisionTreeClassifier here, so KNN was never actually evaluated.
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)
    Y_pred_proba = knn.predict_proba(X_test)

    recall_scores.append(recall_score(Y_test, Y_pred))
    accuracy_scores.append(accuracy_score(Y_test, Y_pred))

In [None]:
# Line chart of the recall and accuracy lists currently in the kernel, plotted
# against the sample sizes.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(train_sizes, recall_scores, label='Recall')
ax.plot(train_sizes, accuracy_scores, label='Accuracy')

ax.set_title('KNN for Different Training Set Sizes')
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Score')
ax.legend()
plt.show()

In [None]:
# RBF-kernel SVM learning curve: for every target sample size, draw a subset
# that keeps the attack ratio of the full capture, train on 80 % of it and
# record recall and accuracy on the remaining 20 %.
train_sizes = [100, 500,1000,5000,10000,100000]
reduced_datasets = {}

recall_scores = []
accuracy_scores = []

for size in train_sizes:

    # Seeded draws so that Restart & Run All reproduces the same subsets.
    # The +1 guarantees at least one attack row even when size * ratio rounds to 0.
    fraudulent_entries = df[df['label_n'] == 1].sample(n=int(size * ratio) +1, random_state=42)
    non_fraudulent_entries = df[df['label_n'] == 0].sample(n=int(size * (1-ratio)), random_state=42)

    # Concatenate both sets to create df_reduced
    df_reduced = pandas.concat([fraudulent_entries, non_fraudulent_entries])

    # "[12]" -> "12"; anything that still is not a number becomes NaN and is
    # then replaced by the sentinel -1.
    df_reduced['modbus_response'] = df_reduced['modbus_response'].str.replace(r'\[(\d+)\]', r'\1', regex=True)
    df_reduced['modbus_response'] = pandas.to_numeric(df_reduced['modbus_response'], errors='coerce')
    df_reduced['modbus_response'] = df_reduced['modbus_response'].fillna(-1)
    # Rows with a missing value in any other column are discarded.
    df_reduced = df_reduced.dropna()

    reduced_datasets[size] = df_reduced
    # Drop both label columns and the timestamp, then one-hot encode the
    # remaining non-numeric columns.
    X_red=df_reduced.drop(['label_n', 'Time','label'], axis=1)
    X_red = pandas.get_dummies(X_red)
    Y_red=df_reduced['label_n'].copy()

    X_train, X_test, Y_train, Y_test = train_test_split(X_red, Y_red, test_size=0.2, random_state=42)

    # Fit a fresh SVM per size. The original loop re-bound svm to a
    # DecisionTreeClassifier here, so the SVM was never actually evaluated.
    # NOTE(review): SVC fit time grows at least quadratically with the number
    # of samples, so the 100000 step can take a long time.
    svm = SVC(kernel='rbf')
    svm.fit(X_train, Y_train)
    Y_pred = svm.predict(X_test)
    # No predict_proba call: SVC only provides it when built with
    # probability=True, and the probabilities were not used by the metrics below.

    recall_scores.append(recall_score(Y_test, Y_pred))
    accuracy_scores.append(accuracy_score(Y_test, Y_pred))

In [None]:
# Line chart of the recall and accuracy lists currently in the kernel, plotted
# against the sample sizes.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(train_sizes, recall_scores, label='Recall')
ax.plot(train_sizes, accuracy_scores, label='Accuracy')

ax.set_title('SVM for Different Training Set Sizes')
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Score')
ax.legend()
plt.show()