In [None]:
import pandas
import os
import numpy
import plotly.express
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, balanced_accuracy_score, matthews_corrcoef, average_precision_score
from sklearn.tree import plot_tree
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
import xgboost
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# Load the first physical-attack capture; the file is read as UTF-16LE, tab-separated text.
df = pandas.read_csv(
    "data/Physical dataset/phy_att_1.csv",
    sep="\t",
    encoding="utf-16le",
)

In [None]:
# Quick visual check of the loaded frame (plain-text dump of df).
print(df)

In [None]:
# Number of rows per textual label; value_counts() sorts by decreasing frequency.
nb_attaque = df['Label'].value_counts()
print(nb_attaque)
# Share of rows that do NOT carry the most frequent label. Summing iloc[1:] instead of
# hard-coding positions 1 and 2 gives the same value when there are exactly three labels,
# and no longer fails (or silently ignores labels) when there are fewer or more.
# Assumes the most frequent label is the non-attack class — TODO confirm on the counts above.
ratio = nb_attaque.iloc[1:].sum() / nb_attaque.sum()
# Same ratio expressed as a percentage.
print(ratio*100)

In [None]:
# Names of the categorical columns (dtype 'category' or 'object')
colonnes_catégorielles = list(df.select_dtypes(include=['category', 'object']).columns)

# Names of the numeric columns
colonnes_numériques = list(df.select_dtypes(include=['number']).columns)

# Each heading is followed by its list on the next line.
print("Colonnes catégorielles :", colonnes_catégorielles, sep="\n")
print("\nColonnes numériques :", colonnes_numériques, sep="\n")

In [None]:
# Summary (count / unique / top / freq) of the categorical columns, followed by a blank line.
cat_stats = df.loc[:, colonnes_catégorielles].describe()
print(cat_stats, end="\n\n")

# Distinct values observed in each categorical column, keyed by column name.
valeur_catégorielles = {col: df[col].unique() for col in colonnes_catégorielles}
for col, valeurs in valeur_catégorielles.items():
    print(f"{col}: {valeurs!s}")

In [None]:
# Descriptive statistics (count, mean, std, quantiles, ...) of every numeric column
numeric_stats = df.loc[:, colonnes_numériques].describe()

# Show which columns are numeric, then their statistics; each heading sits on its own line.
print("Numeric Columns:", colonnes_numériques, sep="\n")
print("\nStatistics for Numeric Columns:", numeric_stats, sep="\n")

In [None]:
# Fraction of missing values in each column (mean of the boolean missing-value mask).
nan_ratios = df.isnull().mean()
print(nan_ratios)


In [None]:
# All distinct textual labels, in order of first appearance.
target_attack_categories = df['Label'].unique()
print(target_attack_categories)
# Keep only the attack labels. Filtering on the value "normal" (instead of slicing off the
# first element) does not depend on which label happens to appear first in the file.
target_attack_categories = target_attack_categories[target_attack_categories != "normal"]

In [None]:
# Show the label values kept as attack categories.
print(target_attack_categories)

In [None]:
# Consistency check between the two label columns, attack side:
# take every row whose numeric label is 1 ...
MM = df.loc[df['Label_n'] == 1]
# ... and flag, row by row, whether its textual label is one of the attack categories.
matching_rows = MM['Label'].isin(target_attack_categories)
# Prints True only if every such row carries an attack label.
print(matching_rows.all())

In [None]:
# Consistency check between the two label columns, normal side:
# take every row whose numeric label is 0 ...
MM = df.loc[df['Label_n'] == 0]
# ... and flag, row by row, whether its textual label is "normal".
matching_rows = MM['Label'].isin(["normal"])
# Prints True only if every such row is labelled "normal".
print(matching_rows.all())

In [None]:
# Build a reduced sample of roughly 1000 rows that keeps the attack/normal proportion
# given by `ratio`.
# NOTE(review): the following cell rebinds X to a full copy of df, so as the notebook
# stands this subsample is not what the models are trained on.
n_attack = int(1000 * ratio) + 1
n_normal = int(1000 * (1 - ratio))

# Random draw of rows labelled 1, then of rows labelled 0 (no seed: differs on each run).
fraudulent_entries = df.loc[df['Label_n'] == 1].sample(n=n_attack)
non_fraudulent_entries = df.loc[df['Label_n'] == 0].sample(n=n_normal)

# Stack both draws into X and report its size.
X = pandas.concat([fraudulent_entries, non_fraudulent_entries])
print(X.shape)


In [None]:
# Work on a full copy of df so that later column removals leave df itself untouched.
# NOTE(review): this rebinds X and discards any X built by an earlier cell.
X = df.copy()

In [None]:
# Target vector: a copy of the 'Label_n' column of X (a 1-D pandas Series).
Y = X['Label_n'].copy()

In [None]:
# Visual check of the target vector.
print(Y)

In [None]:
# Remove both label columns (the target is already stored in Y) and the 'Time' column so
# that X only holds feature columns. The non-mutating form with errors='ignore' makes the
# cell safe to re-run: the in-place version raised KeyError on a second execution because
# the columns were already gone.
X = X.drop(columns=['Label', 'Time', 'Label_n'], errors='ignore')

In [None]:
# Split X / Y into train and test subsets (default 75 % / 25 %).
# - random_state fixes the shuffle so that Restart & Run All reproduces the same metrics.
# - stratify=Y keeps the same class proportions in both subsets, which matters for the
#   skewed split between Label_n == 0 and Label_n == 1.
# train_test_split is already imported in the imports cell at the top of the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, stratify=Y
)

In [None]:
# Baseline model: a decision tree with default hyper-parameters.
# random_state is fixed so that the fitted tree is identical from one run to the next.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, Y_train)
# Hard class predictions and per-class probabilities on the held-out test set.
Y_pred = clf.predict(X_test)
Y_pred_proba = clf.predict_proba(X_test)
print(Y_pred)

In [None]:
# Test-set metrics for the model fitted in the previous cell.
# Rows of the confusion matrix are true classes, columns are predicted classes.
print('Confusion matrix:', confusion_matrix(Y_test, Y_pred))
print('Precision:', precision_score(Y_test, Y_pred))
print('Recall:', recall_score(Y_test, Y_pred))
print('Accuracy:', accuracy_score(Y_test, Y_pred))
print('Balanced accuracy:', balanced_accuracy_score(Y_test, Y_pred))
print('Matthews Correlation Coefficient:', matthews_corrcoef(Y_test, Y_pred))
# The area under the precision-recall curve needs a continuous score, not hard 0/1
# predictions: use the predicted probability of the positive class (column 1, label 1).
print('AUPRC:', average_precision_score(Y_test, Y_pred_proba[:, 1]))

In [None]:
# For the XGBClassifier interface in sklearn
# 1. Perform the same analysis
# 2. Plot the relative importance of fields through plot_importance, using importance_type ‘cover’, ‘gain’, ‘weight’
# 3. Show the classifier visually
# NOTE(review): only step 1 is implemented in this cell.

# Fit a gradient-boosted classifier with default hyper-parameters; `clf` now refers to it.
clf = xgboost.XGBClassifier()
clf.fit(X_train, Y_train)
# Hard class predictions and per-class probabilities on the held-out test set.
Y_pred = clf.predict(X_test)
Y_pred_proba = clf.predict_proba(X_test)

# Computed once and reused for both the printout and the figure.
conf_matrix = confusion_matrix(Y_test, Y_pred)

print('Confusion matrix:', conf_matrix)
print('Precision:', precision_score(Y_test, Y_pred))
print('Recall:', recall_score(Y_test, Y_pred))
print('Accuracy:', accuracy_score(Y_test, Y_pred))
print('Balanced accuracy:', balanced_accuracy_score(Y_test, Y_pred))
print('Matthews Correlation Coefficient:', matthews_corrcoef(Y_test, Y_pred))
# AUPRC needs a continuous score: use the predicted probability of the positive class
# (column 1) instead of the hard 0/1 predictions.
print('AUPRC:', average_precision_score(Y_test, Y_pred_proba[:, 1]))

# Heat-map of the confusion matrix (rows: true class, columns: predicted class).
fig = plotly.express.imshow(
    conf_matrix,
    title='Confusion matrix',
    labels=dict(x='Predicted label', y='True label'),
)
fig.show()

In [None]:
# Nearest-neighbour baseline: each test row takes the label of its single closest
# training row.
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train, Y_train)
# Hard class predictions and per-class probabilities on the held-out test set.
Y_pred = knn.predict(X_test)
Y_pred_proba = knn.predict_proba(X_test)

# Computed once and reused for both the printout and the figure.
conf_matrix = confusion_matrix(Y_test, Y_pred)

print("Confusion matrix:", conf_matrix)
print("Precision:", precision_score(Y_test, Y_pred))
print("Recall:", recall_score(Y_test, Y_pred))
print("Accuracy:", accuracy_score(Y_test, Y_pred))
print("Balanced accuracy:", balanced_accuracy_score(Y_test, Y_pred))
print("Matthews Correlation Coefficient:", matthews_corrcoef(Y_test, Y_pred))
# AUPRC is defined on scores, so pass the positive-class probability (column 1) as for the
# other models; this stays correct if n_neighbors is raised later.
print("AUPRC:", average_precision_score(Y_test, Y_pred_proba[:, 1]))

# Heat-map of the confusion matrix (rows: true class, columns: predicted class).
fig = plotly.express.imshow(
    conf_matrix,
    title="Confusion matrix",
    labels=dict(x="Predicted label", y="True label"),
)
fig.show()

In [None]:
# Evaluate the already-fitted `clf` on a different capture file (phy_att_4.csv), i.e. on
# data that was not used for training. When the notebook is run top to bottom, `clf` is the
# XGBoost model (the k-NN model is bound to `knn`).
# Unlike the first capture, this file is read as UTF-8, comma-separated text.
F = pandas.read_csv("data/Physical dataset/phy_att_4.csv", encoding="utf-8", delimiter=',')
# NOTE: this rebinds Y to the labels of the new file.
Y = F['Label_n'].copy()
# Keep the feature columns only; non-mutating drop instead of inplace=True.
F = F.drop(columns=['Label', 'Time', 'Label_n'])

# Hard class predictions and per-class probabilities on the new capture.
Y_pred = clf.predict(F)
Y_pred_proba = clf.predict_proba(F)

# Computed once and reused for both the printout and the figure.
conf_matrix = confusion_matrix(Y, Y_pred)

print('Confusion matrix:', conf_matrix)
print('Precision:', precision_score(Y, Y_pred))
print('Recall:', recall_score(Y, Y_pred))
print('Accuracy:', accuracy_score(Y, Y_pred))
print('Balanced accuracy:', balanced_accuracy_score(Y, Y_pred))
print('Matthews Correlation Coefficient:', matthews_corrcoef(Y, Y_pred))
# AUPRC needs a continuous score: use the predicted probability of the positive class
# (column 1) instead of the hard 0/1 predictions.
print('AUPRC:', average_precision_score(Y, Y_pred_proba[:, 1]))

# Heat-map of the confusion matrix (rows: true class, columns: predicted class).
fig = plotly.express.imshow(
    conf_matrix,
    title='Confusion matrix',
    labels=dict(x='Predicted label', y='True label'),
)
fig.show()

Les résultats ne sont pas « bons » : on n'obtient pas le score de 0.97 que l'on aurait en entraînant le modèle sur toutes les données (ce qui n'est pas beaucoup plus long).