In [1]:
import pandas
import os
import numpy
import plotly.express
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, balanced_accuracy_score, matthews_corrcoef, average_precision_score
from sklearn.tree import plot_tree
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
import xgboost
from sklearn.neighbors import KNeighborsClassifier


In [2]:
import glob

df = []

csv_files = glob.glob('data/Physical dataset/*.csv')
print(csv_files)
for csv_file in csv_files:
    delimiter = '\t'
    encoding="utf-16le"
    if csv_file.endswith("4.csv"):
        delimiter = ','
        encoding = "utf-8"
    df_temp = pandas.read_csv(csv_file, encoding=encoding, delimiter=delimiter)
    df.append(df_temp)

df  = pandas.concat(df, ignore_index=True)


['data/Physical dataset/phy_norm.csv', 'data/Physical dataset/phy_att_1.csv', 'data/Physical dataset/phy_att_4.csv', 'data/Physical dataset/phy_att_2.csv', 'data/Physical dataset/phy_att_3.csv']


In [3]:
print(df)

                      Time  Tank_1  Tank_2  Tank_3  Tank_4  Tank_5  Tank_6  \
0      09/04/2021 11:30:50       0       0       0       0       0       0   
1      09/04/2021 11:30:51       0       0       0       0       0       0   
2      09/04/2021 11:30:52       0       0       0       0       0       0   
3      09/04/2021 11:30:53       0       0       0       0       0       0   
4      09/04/2021 11:30:54       0       0       0       0       0       0   
...                    ...     ...     ...     ...     ...     ...     ...   
10918  09/04/2021 20:03:01       3     178       7     484     313     443   
10919  09/04/2021 20:03:02       3     230       8     471     308     443   
10920  09/04/2021 20:03:03       3     261       7     462     304     442   
10921  09/04/2021 20:03:04       3     288       7     449     299     442   
10922  09/04/2021 20:03:05       2     322       6     439     294     433   

       Tank_7  Tank_8  Pump_1  ...  Valv_16  Valv_17  Valv_18  

In [4]:
nb_attaque= df['Label'].value_counts()
print(nb_attaque)
ratio = (nb_attaque.iloc[1] + nb_attaque.iloc[2]+ nb_attaque.iloc[3] + nb_attaque.iloc[5])/(nb_attaque.iloc[0]+nb_attaque.iloc[1]+nb_attaque.iloc[2]+ nb_attaque.iloc[3] + nb_attaque.iloc[4] +nb_attaque.iloc[5])
print(ratio*100)

Label
normal            8657
MITM              1008
physical fault     685
DoS                310
nomal              249
scan                14
Name: count, dtype: int64
18.46562299734505


In [5]:
colonnes_catégorielles = df.select_dtypes(include=['category', 'object']).columns.tolist()

colonnes_numériques = df.select_dtypes(include=['number']).columns.tolist()

print("Colonnes catégorielles :")
print(colonnes_catégorielles)

print("\nColonnes numériques :")
print(colonnes_numériques)

Colonnes catégorielles :
['Time', 'Label']

Colonnes numériques :
['Tank_1', 'Tank_2', 'Tank_3', 'Tank_4', 'Tank_5', 'Tank_6', 'Tank_7', 'Tank_8', 'Flow_sensor_1', 'Flow_sensor_2', 'Flow_sensor_3', 'Flow_sensor_4', 'Label_n', 'Lable_n']


In [6]:
cat_stats = df[colonnes_catégorielles].describe()
print(cat_stats)
print()
valeur_catégorielles = {}
for col in colonnes_catégorielles:
    valeur_catégorielles[col] = df[col].unique()
    print("%s: %s" % (col, valeur_catégorielles[col]))

                       Time   Label
count                 10923   10923
unique                10923       6
top     09/04/2021 11:30:50  normal
freq                      1    8657

Time: ['09/04/2021 11:30:50' '09/04/2021 11:30:51' '09/04/2021 11:30:52' ...
 '09/04/2021 20:03:03' '09/04/2021 20:03:04' '09/04/2021 20:03:05']
Label: ['normal' 'MITM' 'physical fault' 'scan' 'DoS' 'nomal']


In [7]:
# Extract statistics for each numeric column
numeric_stats = df[colonnes_numériques].describe()

# Print the list of numeric columns
print("Numeric Columns:")
print(colonnes_numériques)

# Print statistics for each numeric column
print("\nStatistics for Numeric Columns:")
print(numeric_stats)

Numeric Columns:
['Tank_1', 'Tank_2', 'Tank_3', 'Tank_4', 'Tank_5', 'Tank_6', 'Tank_7', 'Tank_8', 'Flow_sensor_1', 'Flow_sensor_2', 'Flow_sensor_3', 'Flow_sensor_4', 'Label_n', 'Lable_n']

Statistics for Numeric Columns:
             Tank_1        Tank_2        Tank_3        Tank_4        Tank_5  \
count  10923.000000  10923.000000  10923.000000  10923.000000  10923.000000   
mean     723.644145    899.307791    590.142543    293.021148    367.792548   
std      675.581394    687.809757    616.156955    355.376570    280.270322   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        4.000000     62.500000     54.500000     25.000000     52.000000   
50%      621.000000    945.000000    429.000000     35.000000    382.000000   
75%     1317.000000   1553.000000    966.000000    562.000000    611.500000   
max     2021.000000   2012.000000   3427.000000   1261.000000    838.000000   

             Tank_6        Tank_7        Tank_8  Flow_sensor_1  Flo

In [8]:
nan_ratios = df.isna().mean()   
print(nan_ratios)


Time             0.000000
Tank_1           0.000000
Tank_2           0.000000
Tank_3           0.000000
Tank_4           0.000000
Tank_5           0.000000
Tank_6           0.000000
Tank_7           0.000000
Tank_8           0.000000
Pump_1           0.000000
Pump_2           0.000000
Pump_3           0.000000
Pump_4           0.000000
Pump_5           0.000000
Pump_6           0.000000
Flow_sensor_1    0.000000
Flow_sensor_2    0.000000
Flow_sensor_3    0.000000
Flow_sensor_4    0.000000
Valv_1           0.000000
Valv_2           0.000000
Valv_3           0.000000
Valv_4           0.000000
Valv_5           0.000000
Valv_6           0.000000
Valv_7           0.000000
Valv_8           0.000000
Valv_9           0.000000
Valv_10          0.000000
Valv_11          0.000000
Valv_12          0.000000
Valv_13          0.000000
Valv_14          0.000000
Valv_15          0.000000
Valv_16          0.000000
Valv_17          0.000000
Valv_18          0.000000
Valv_19          0.000000
Valv_20     

In [9]:
import pandas as pd
import numpy as np

# Remplacer les valeurs non nulles dans Label_n par celles de Lable_n
df['Label_n'] = df['Label_n'].combine_first(df['Lable_n'])

# Supprimer la colonne Lable_n
df.drop('Lable_n', axis=1, inplace=True)

# Afficher le DataFrame résultant
nan_ratios = df.isna().mean()   
print(nan_ratios)


Time             0.0
Tank_1           0.0
Tank_2           0.0
Tank_3           0.0
Tank_4           0.0
Tank_5           0.0
Tank_6           0.0
Tank_7           0.0
Tank_8           0.0
Pump_1           0.0
Pump_2           0.0
Pump_3           0.0
Pump_4           0.0
Pump_5           0.0
Pump_6           0.0
Flow_sensor_1    0.0
Flow_sensor_2    0.0
Flow_sensor_3    0.0
Flow_sensor_4    0.0
Valv_1           0.0
Valv_2           0.0
Valv_3           0.0
Valv_4           0.0
Valv_5           0.0
Valv_6           0.0
Valv_7           0.0
Valv_8           0.0
Valv_9           0.0
Valv_10          0.0
Valv_11          0.0
Valv_12          0.0
Valv_13          0.0
Valv_14          0.0
Valv_15          0.0
Valv_16          0.0
Valv_17          0.0
Valv_18          0.0
Valv_19          0.0
Valv_20          0.0
Valv_21          0.0
Valv_22          0.0
Label_n          0.0
Label            0.0
dtype: float64


In [10]:
target_attack_categories  = df['Label'].unique()
print(target_attack_categories)
target_normal = target_attack_categories[[0,5]]

['normal' 'MITM' 'physical fault' 'scan' 'DoS' 'nomal']


In [11]:
target_attack_categories = target_attack_categories[[1,2,3,4]]
print(target_attack_categories)
print(target_normal)

['MITM' 'physical fault' 'scan' 'DoS']
['normal' 'nomal']


In [12]:
# on récupère tous les labels 1
MM = df[(df['Label_n'] == 1)]
#on récupère toute les colonnes où il y a bien une attaque pour un label 1
matching_rows = (MM['Label'].isin(target_attack_categories))
# on vérifie que l'on a bien que la valeur True
print(matching_rows.all())

True


In [13]:
# on récupère tous les labels 0
MM = df[(df['Label_n'] == 0)]
#on récupère toute les colonnes où il y a bien une attaque pour un Label 0
matching_rows =(MM['Label'].isin(target_normal))
# on vérifie que l'on a bien que la valeur True
print(matching_rows.all())

True


In [14]:
# Create a new DataFrame X you will use for cleaning.
X = df.copy()

In [15]:
# Store the attack_cat labels in a DataFrame Y as a 1-D array
Y = X['Label_n'].copy()

In [16]:
X.drop(['Label', 'Time','Label_n'], axis=1, inplace=True)

In [17]:
# Extract train and test data through train_test_split on X and Y
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [18]:
clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
Y_pred_proba = clf.predict_proba(X_test)
print(Y_pred)

[0. 0. 0. ... 0. 0. 0.]


In [19]:
print('Confusion matrix:', confusion_matrix(Y_test, Y_pred))
print('Precision:', precision_score(Y_test, Y_pred))
print('Recall:', recall_score(Y_test, Y_pred))
print('Accuracy:', accuracy_score(Y_test, Y_pred))
print('Balanced accuracy:', balanced_accuracy_score(Y_test, Y_pred))
print('Matthews Correlation Coefficient:', matthews_corrcoef(Y_test, Y_pred))
print('AUPRC:', average_precision_score(Y_test, Y_pred))

Confusion matrix: [[2190   37]
 [  44  460]]
Precision: 0.9255533199195171
Recall: 0.9126984126984127
Accuracy: 0.9703405346027096
Balanced accuracy: 0.9480420666994533
Matthews Correlation Coefficient: 0.9009576671874633
AUPRC: 0.860862360495089


In [20]:
# For the XGBClassifier interface in sklearn
# 1. Perform the same analysis
# 2. Plot the relative importance of fields through plot_importance, using importance_type ‘cover’, ‘gain’, ‘weight’
# 3. Show the classifier visually


clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
Y_pred_proba = clf.predict_proba(X_test)

print('Confusion matrix:', confusion_matrix(Y_test, Y_pred))
print('Precision:', precision_score(Y_test, Y_pred))
print('Recall:', recall_score(Y_test, Y_pred))
print('Accuracy:', accuracy_score(Y_test, Y_pred))
print('Balanced accuracy:', balanced_accuracy_score(Y_test, Y_pred))
print('Matthews Correlation Coefficient:', matthews_corrcoef(Y_test, Y_pred))
print('AUPRC:', average_precision_score(Y_test, Y_pred))

fig = plotly.express.imshow(confusion_matrix(Y_test, Y_pred), title='Confusion matrix')
fig.show()

Confusion matrix: [[2190   37]
 [  43  461]]
Precision: 0.9257028112449799
Recall: 0.9146825396825397
Accuracy: 0.9707067008421824
Balanced accuracy: 0.9490341301915168
Matthews Correlation Coefficient: 0.9022475088117302
AUPRC: 0.8624693466781518


In [21]:
# For the XGBClassifier interface in sklearn
# 1. Perform the same analysis
# 2. Plot the relative importance of fields through plot_importance, using importance_type ‘cover’, ‘gain’, ‘weight’
# 3. Show the classifier visually


clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
Y_pred_proba = clf.predict_proba(X_test)

print('Confusion matrix:', confusion_matrix(Y_test, Y_pred))
print('Precision:', precision_score(Y_test, Y_pred))
print('Recall:', recall_score(Y_test, Y_pred))
print('Accuracy:', accuracy_score(Y_test, Y_pred))
print('Balanced accuracy:', balanced_accuracy_score(Y_test, Y_pred))
print('Matthews Correlation Coefficient:', matthews_corrcoef(Y_test, Y_pred))
print('AUPRC:', average_precision_score(Y_test, Y_pred))

fig = plotly.express.imshow(confusion_matrix(Y_test, Y_pred), title='Confusion matrix')
fig.show()

Confusion matrix: [[2206   21]
 [  17  487]]
Precision: 0.9586614173228346
Recall: 0.9662698412698413
Accuracy: 0.9860856829000366
Balanced accuracy: 0.9784200575904662
Matthews Correlation Coefficient: 0.9539226567875438
AUPRC: 0.9325504416190927
