In [None]:
# Importation des modules et librairies nécessaires
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Téléchargement de la dataset de Github à Google Colab et sa décompression
!wget https://github.com/fisher85/ml-cybersecurity/blob/master/python-web-attack-detection/datasets/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv.zip?raw=true -O dataset.zip
!unzip -u dataset.zip

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): no cell visible here reads from /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the extracted CSV into a DataFrame; the file is decoded as cp1252 (Windows-1252).
df = pd.read_csv('Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', encoding='cp1252')

In [None]:
# Strip surrounding whitespace from the column names, then drop the
# 'Fwd Header Length.1' column. No rows are removed in this cell.
# NOTE(review): the '.1' suffix looks like the name pandas gives to a repeated CSV header,
# i.e. a duplicate of 'Fwd Header Length' — confirm against the raw file.
df.columns = df.columns.str.strip()
df = df.drop(columns=['Fwd Header Length.1'])

In [None]:
# Distinct values found in the Label column.
df['Label'].unique()

In [None]:
# Number of records per Label value.
df['Label'].value_counts()

In [None]:
# Remove the rows that have no 'Flow ID' value
rows_without_flow_id = df.index[df['Flow ID'].isna().to_numpy()]
df = df.drop(index=rows_without_flow_id)

In [None]:
# Turn the textual 'Infinity' marker into -1, then make the two rate columns numeric
df = df.replace('Infinity', -1)
rate_columns = ["Flow Bytes/s", "Flow Packets/s"]
df[rate_columns] = df[rate_columns].apply(pd.to_numeric)

In [None]:
# Use -1 as the placeholder for every NaN and every positive/negative infinity
non_finite_values = [np.inf, -np.inf, np.nan]
df = df.replace(non_finite_values, -1)

In [None]:
# Names of the text (object-dtype) columns, leaving out the target column 'Label'
string_features = df.select_dtypes(include=['object']).columns.tolist()
string_features.remove('Label')
string_features

In [None]:
# Encode every text column as integer codes.
# The same encoder object is refit for each column, so once this cell has run
# it only remembers the classes of the last column it processed.
le = preprocessing.LabelEncoder()
encoded_columns = df[string_features].apply(le.fit_transform)
df[string_features] = encoded_columns

In [None]:
# Number of records labelled "BENIGN"
benign_total = int((df['Label'] == "BENIGN").sum())
benign_total

In [None]:
# Number of records carrying any label other than "BENIGN"
attack_total = int((df['Label'] != "BENIGN").sum())
attack_total

In [3]:
# Save the cleaned, still unbalanced dataset (without the index column),
# then show how many records each label has ("normal" versus attack records).
df.to_csv("web_attacks_unbalanced.csv", index=False)
df['Label'].value_counts()

NameError: name 'df' is not defined

In [None]:
# Sampling parameters for a balanced set made of 30% attack and 70% BENIGN records.
# benign_included_max: number of BENIGN records that gives that 70/30 ratio for the
# attack records present (the original note quotes 5087 records for this dataset).
# enlargement: raises the per-record inclusion probability 10% above the exact ratio.
# NOTE(review): presumably so that random selection reaches the target before the
# BENIGN records run out — confirm.
enlargement = 1.1
benign_included_max = attack_total / 30 * 70
benign_inc_probability = (benign_included_max / benign_total) * enlargement
print(benign_included_max, benign_inc_probability)

In [None]:
# Build df_balanced: keep every attack record plus a random subset of the BENIGN
# records, limited by benign_included_max (the 70% share, 5087 records per the original note).
import random

# Dedicated seeded generator: the selected sample is the same on every run and the
# global random state used by other code is left untouched.
rng = random.Random(42)
indexes = []
benign_included_count = 0
# Only the label is needed, so iterate over that column instead of building a
# full row object for every record with iterrows().
for index, label in df['Label'].items():
    if label != "BENIGN":
        indexes.append(index)
        continue
    # One random draw per BENIGN record decides whether it is a candidate.
    if rng.random() > benign_inc_probability:
        continue
    # Skip further BENIGN records once the limit has been passed.
    if benign_included_count > benign_included_max:
        continue
    benign_included_count += 1
    indexes.append(index)
df_balanced = df.loc[indexes]

In [None]:
# Display the balanced DataFrame.
df_balanced

In [None]:
# Save the balanced dataset to CSV without the index column.
df_balanced.to_csv("web_attacks_balanced.csv", index=False)

In [None]:
# Reload the balanced dataset; from this point on `df` refers to the balanced data.
df = pd.read_csv('web_attacks_balanced.csv')

In [None]:
# Binary target: 0 for normal (BENIGN) traffic, 1 for any attack
df['Label'] = (df['Label'] != 'BENIGN').astype('int64')

In [None]:
# Drop the columns that are not used as features; names that are absent are skipped silently
excluded = [
    'Flow ID',
    'Source IP',
    'Source Port',
    'Destination IP',
    'Destination Port',
    'Protocol',
    'Timestamp',
]
df = df.drop(excluded, axis=1, errors='ignore')
df.shape

In [None]:
# Separate the feature matrix (every column except 'Label') from the target vector
X = df.drop('Label', axis=1)
y = df['Label'].to_numpy()

In [None]:
# Hold out 20% of the records as a test set; random_state=42 makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Maps each model name to its mean cross-validation score (filled in by the training loop).
dict_accuracy={}

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Classifiers to compare, keyed by the name used in the reports.
# Every estimator that accepts a seed and draws random numbers during training
# gets random_state=0, so the reported scores are repeatable from run to run.
models = {"naive_bayes": GaussianNB(),
          "DecisionTree": DecisionTreeClassifier(max_leaf_nodes=5, random_state=0),
          "LinearSVC": LinearSVC(random_state=0),
          "KNN": KNeighborsClassifier(),
          "SVC": SVC(),
          "LogisticRegression": LogisticRegression(),
          "RandomForest": RandomForestClassifier(n_estimators=250, random_state=0),
          "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0)
          }

In [None]:
# For every model: train it, print its classification report and confusion matrix
# on the test set, and record its mean 10-fold cross-validation score.
import warnings
warnings.filterwarnings('ignore')  # silences every warning from here on
names = []   # model names, in evaluation order
values = []  # mean cross-validation score of the model at the same position in `names`
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Cross-validation is run on the training split only; the test split stays untouched.
    Mean = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    dict_accuracy[model_name] = Mean
    print("-"*20+"Classification report for "+model_name+" "+"-"*20)
    print(classification_report(y_test, y_pred))
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("-"*20+"Confusion Matrix for "+model_name+" "+"-"*20)
    print(conf_matrix)
    plt.figure(figsize = (10,7))
    # fmt='d' prints the counts as plain integers instead of abbreviated floats.
    sns.heatmap(conf_matrix, annot=True, fmt='d')
    plt.title("Confusion matrix - " + model_name)
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    # Render now so each figure appears right after the report of its model.
    plt.show()
    names.append(model_name)
    # The score belongs in `values`; appending it to `names` mixed names with numbers.
    values.append(Mean)

In [1]:
# Show the recorded accuracy of every algorithm as a table and as a bar chart
results_df = pd.DataFrame(
    {'Accuracy': list(dict_accuracy.values())},
    index=list(dict_accuracy.keys()),
)

results_df.plot.bar()
results_df

NameError: name 'pd' is not defined

In [None]:
# Display the training feature matrix.
X_train

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Create a 2-cluster KMeans model and fit it on the training features.
# random_state fixes the centroid initialisation so the clustering is repeatable.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X_train)

In [None]:
# Assign every test sample to a cluster.
# NOTE(review): fit_predict() fits the model again, this time on X_test, so the
# centroids learned from X_train are replaced. kmeans.predict(X_test) would reuse the
# trained centroids, but any code that reads kmeans.labels_ afterwards currently
# relies on this refit (labels_ then describes X_test) — change both together.
identified_clusters = kmeans.fit_predict(X_test)
identified_clusters

In [None]:
# Keep the cluster id assigned to every test sample and attach it to the test
# frame as a new 'predicted' column
klabel = identified_clusters
X_test['predicted'] = klabel

In [None]:
# Helper that maps every distinct value to the number of times it occurs
def count(arr):
    """Return a dict {value: occurrences} for the distinct values of `arr`.

    Keys appear in the sorted order produced by np.unique.
    """
    distinct_values, occurrences = np.unique(arr, return_counts=True)
    return {value: n for value, n in zip(distinct_values, occurrences)}

In [None]:
# Sizes of the clusters assigned by KMeans, followed by the true class sizes of the test set.
print(count(klabel))
print(count(y_test))

In [None]:
# Count TP, TN, FN and FP by comparing the cluster ids with the true labels,
# reading cluster 1 as "attack" and cluster 0 as "normal".
# NOTE(review): KMeans numbers its clusters arbitrarily, so cluster 1 is not guaranteed
# to be the attack cluster — compare count(klabel) with count(y_test) before trusting these figures.
TP = 0
TN = 0
FP = 0
FN = 0
# Walk over every test sample instead of a hard-coded 1454, so the counts stay
# complete (and no IndexError is raised) when the size of the test split changes.
for predicted, actual in zip(klabel, y_test):
    if predicted == 0 and actual == 0:
        TN += 1
    elif predicted == 0 and actual == 1:
        FN += 1
    elif predicted == 1 and actual == 0:
        FP += 1
    elif predicted == 1 and actual == 1:
        TP += 1
print(TP)
print(TN)
print(FN)
print(FP)