## Analyse network data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os

In [2]:
# Make sure the output directory for cleaned data exists.
# exist_ok=True makes the call idempotent (safe to re-run the cell) and avoids
# the window between "check" and "create" in the exists()/makedirs() pattern.
os.makedirs('data/cleaned', exist_ok=True)

In [3]:
# Load the raw network capture for one attack scenario (attack_4 here).
# Row counts noted for each raw file:
#   attack_1: 5527409 rows
#   attack_2: 5159469 rows
#   attack_3: 5862547 rows
#   attack_4: 5131109 rows
df_attack = pd.read_csv(
    'data/dataset/Network dataset/csv/attack_4.csv',
    sep=',',
    header=0,
)

In [4]:
# Remove rows that are exact duplicates across all columns, keeping the first occurrence.
df_attack = df_attack.drop_duplicates(subset=None, keep='first')

In [5]:
# Remove every space character from the column names so they can be
# referenced without stray whitespace. The pattern is a literal space, not a regex.
df_attack.columns = df_attack.columns.str.replace(' ', '', regex=False)

In [19]:
# Columns that must all be populated for a row to be kept.
column_NaN = ['Time','mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'flags', 'proto', 'size', 'n_pkt_src', 'n_pkt_dst', 'label']

# Report the row count on each side of the drop so the loss is visible.
rows_before = len(df_attack)
print("Number of rows before droping: ", rows_before)
df_attack = df_attack.dropna(axis=0, how='any', subset=column_NaN)
rows_after = len(df_attack)
print("Number of rows after droping: ", rows_after)

Number of rows before droping:  4350185
Number of rows after droping:  4350183


In [7]:
def get_columns_info(column_name, df=None):
    """Print a short profile of one column.

    The report contains the column name, the number of distinct values, the
    number of missing values, the number of non-missing values, the array of
    distinct values and the per-value counts.

    Parameters
    ----------
    column_name : str
        Label of the column to describe.
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df_attack`` is
        used (looked up at call time), which is the original behaviour.

    Returns
    -------
    None
        Everything is written to standard output.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of the inspected frame.
    """
    # Fall back to the notebook global so existing one-argument calls keep working.
    frame = df_attack if df is None else df
    # Look the column up once instead of once per printed line.
    column = frame[column_name]

    print('Column name: ', column_name)
    print('Number of unique values in attack: ', column.nunique())
    print('Number of missing values in attack: ', column.isnull().sum())
    print('Number of values in attack: ', column.count())
    print('Values in attack: ', column.unique())
    print('Values count: ', column.value_counts())

# Début du rapport d'analyse

Problème avec ces colonnes : 'ip_s', 'ip_d', 'sport', 'dport', 'flags', 'n_pkt_src', 'n_pkt_dst', 'modbus_fn', 'modbus_response'
Raison : certaines valeurs sont manquantes (missing values).

Problème avec modbus_fn : beaucoup de valeurs manquantes (153121 pour attack_1) — colonne inutile ? Même chose pour modbus_response : ces colonnes sont liées au protocole Modbus et ne sont donc pas renseignées pour toutes les lignes.

La colonne proto contient les protocoles ARP, ICMP, Modbus et TCP pour les fichiers attack, mais seulement Modbus et TCP pour le cas normal.

Problème de typage : 'sport', 'dport', 'flags', 'n_pkt_src', 'n_pkt_dst'

Certaines dates sont partagées par plusieurs entrées (ex : 2021-04-09 18:42:46.997219 a 4 entrées).

Le s dans ip_s, sport est pour source et le d pour destination

Il y a des cas notés « normal » dans les CSV attack : a-t-on besoin du CSV normal ?

Différents types d'attaques :

 - ['normal' 'anomaly' 'MITM' 'physical fault'] pour attack_1
 - ['normal' 'physical fault' 'anomaly' 'DoS' 'MITM'] pour attack_3
 - ['normal' 'scan' 'DoS' 'physical fault' 'MITM'] pour attack_4 et attack_2


In [8]:
# Profile the 'label' column (distinct values, missing values, per-class counts).
get_columns_info(column_name='label')

Column name:  label
Number of unique values in attack:  5
Number of missing values in attack:  0
Number of values in attack:  4350185
Values in attack:  ['normal' 'scan' 'DoS' 'physical fault' 'MITM']
Values count:  label
normal            2844877
DoS                732749
MITM               471241
physical fault     301287
scan                   31
Name: count, dtype: int64


In [9]:
# Data type conversion: cast the port, flag and packet-counter columns to int64.
column_type = ['sport', 'dport', 'flags', 'n_pkt_src', 'n_pkt_dst']
df_attack = df_attack.astype({name: 'int64' for name in column_type})

# modbus_response: strip the literal '[' and ']' characters, parse what remains
# as a number, replace anything missing with -1, then store the result as int64.
# NOTE(review): because of errors='coerce', any value that is still non-numeric
# after the brackets are removed also ends up as -1, not only originally-missing ones.
response_text = (
    df_attack['modbus_response']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
)
response_numeric = pd.to_numeric(response_text, errors='coerce')
df_attack['modbus_response'] = response_numeric.fillna(-1).astype('int64')

In [11]:
# Parse the 'Time' column into datetimes; entries that cannot be parsed
# become NaT instead of raising (errors='coerce').
time_parsed = pd.to_datetime(df_attack['Time'], errors='coerce')
df_attack['Time'] = time_parsed

In [22]:
# Column groups used to split the frame into two views.
network_columns = ['mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'proto', 'flags', 'size']
physical_columns = ['modbus_fn', 'modbus_response', 'n_pkt_src', 'n_pkt_dst']
# Both views carry the numeric and textual label columns.
label_columns = ['label_n', 'label']

# Build the two data subsets.
df_network = df_attack[[*network_columns, *label_columns]]
df_physical = df_attack[[*physical_columns, *label_columns]]

In [32]:
# Convert the proto, flags, label and modbus_fn columns to categories (currently disabled)
# df_network['proto'] = df_network['proto'].astype('category')
# df_network['flags'] = df_network['flags'].astype('category')
# df_network['label'] = df_network['label'].astype('category')
# df_physical['modbus_fn'] = df_physical['modbus_fn'].astype('category')
# df_physical['label'] = df_physical['label'].astype('category')

Unnamed: 0,modbus_fn,modbus_response,n_pkt_src,n_pkt_dst,label_n,label
0,Read Holding Registers,-1,0,0,0,normal
1,Read Holding Registers,-1,1,0,0,normal
2,Read Holding Registers,-1,2,0,0,normal
3,,-1,0,0,0,normal
4,Read Holding Registers Response,0,0,1,0,normal


In [18]:
# Save the cleaned frame and its network / physical subsets as CSV files.
# to_csv does not create missing parent directories, so make sure the target
# directory exists first (idempotent thanks to exist_ok=True).
# NOTE(review): 'Network datatset' is spelled differently from the
# 'Network dataset' folder the raw CSV is read from — confirm the intended location.
cleaned_dir = 'data/dataset/Network datatset/cleaned'
os.makedirs(cleaned_dir, exist_ok=True)

df_attack.to_csv(f'{cleaned_dir}/attack_4_cleaned.csv', index=False)
df_network.to_csv(f'{cleaned_dir}/attack_4_network.csv', index=False)
df_physical.to_csv(f'{cleaned_dir}/attack_4_physical.csv', index=False)

In [18]:
# Dimensions of df_attack as (rows, columns).
df_attack.shape

(4350185, 16)

In [20]:
# Exploratory data analysis (EDA): print the first rows, then summary
# statistics covering every column (numeric and non-numeric).
head_preview = df_attack.head()
full_description = df_attack.describe(include='all')

print("Aperçu des données nettoyées :\n", head_preview)
print("\nDescription statistique :\n", full_description)

Aperçu des données nettoyées :
                         Time              mac_s              mac_d  \
0 2022-02-21 14:45:25.454111  74:46:a0:bd:a7:1b  e6:3f:ac:c9:a8:8c   
1 2022-02-21 14:45:25.454114  74:46:a0:bd:a7:1b  fa:00:bc:90:d7:fa   
2 2022-02-21 14:45:25.454142  74:46:a0:bd:a7:1b  0a:fe:ec:47:74:fb   
3 2022-02-21 14:45:25.454260  00:80:f4:03:fb:12  74:46:a0:bd:a7:1b   
4 2022-02-21 14:45:25.454365  e6:3f:ac:c9:a8:8c  74:46:a0:bd:a7:1b   

           ip_s          ip_d  sport  dport   proto  flags  size  \
0   84.3.251.20  84.3.251.101  60614    502  Modbus  11000    66   
1   84.3.251.20  84.3.251.103  60616    502  Modbus  11000    66   
2   84.3.251.20  84.3.251.102  60615    502  Modbus  11000    66   
3   84.3.251.18   84.3.251.20    502  60523     TCP  11000   109   
4  84.3.251.101   84.3.251.20    502  60614  Modbus  11000    65   

                         modbus_fn  modbus_response  n_pkt_src  n_pkt_dst  \
0           Read Holding Registers               -1          

In [23]:
# KNN algorithm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix



In [2]:
# Reload a previously cleaned capture (attack_2) from disk.
df_attack = pd.read_csv(
    'data/dataset/Network datatset/clean_csv/attack_2_cleaned.csv',
    sep=',',
    header=0,
)

In [54]:
# List the column labels of the reloaded frame.
df_attack.columns

Index(['Time', 'mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'proto',
       'flags', 'size', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst',
       'modbus_response', 'label_n', 'label'],
      dtype='object')

In [5]:
import matplotlib.pyplot as plt

# Split the rows on the numeric label flag: df_label1 holds rows with
# label_n == 1 and df_label0 holds rows with label_n == 0.
# NOTE(review): presumably 1 = attack and 0 = normal — confirm against the dataset description.
label_flag = df_attack['label_n']
df_label1 = df_attack.loc[label_flag == 1]
df_label0 = df_attack.loc[label_flag == 0]

In [None]:
def _value_shares(frame, column):
    """Return the relative frequency of each distinct value in ``frame[column]``."""
    return frame[column].value_counts(normalize=True)


# For each column, print the distribution in the label_n == 1 subset first,
# then in the label_n == 0 subset, so the two can be compared side by side.
proto1 = _value_shares(df_label1, 'proto')
print(proto1)
proto0 = _value_shares(df_label0, 'proto')
print(proto0)

mac_s1 = _value_shares(df_label1, 'mac_s')
print(mac_s1)
mac_s0 = _value_shares(df_label0, 'mac_s')
print(mac_s0)

mac_d1 = _value_shares(df_label1, 'mac_d')
print(mac_d1)
mac_d0 = _value_shares(df_label0, 'mac_d')
print(mac_d0)

ip_s1 = _value_shares(df_label1, 'ip_s')
print(ip_s1)
ip_s0 = _value_shares(df_label0, 'ip_s')
print(ip_s0)

# Author's observation: nothing special distinguishes attack from normal here.

In [None]:
# Plot the relative frequency of the 10 most common 'n_pkt_src' values in
# df_label1 (rows with label_n == 1).
# NOTE(review): the variable name, title and axis label all say "sport", but the
# column actually read is 'n_pkt_src' — confirm which one was intended.
sport1 = df_label1['n_pkt_src'].value_counts(normalize=True).head(10)
print(sport1)
# Replace the value labels with their rank (1 = most frequent). The range is
# sized from the data so that fewer than 10 distinct values no longer raises a
# length-mismatch error.
sport1.index = range(1, len(sport1) + 1)
plt.figure(figsize=(10, 5))
plt.bar(sport1.index, sport1.values)
plt.title('Sport of attack')
plt.xlabel('Sport')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Plot the relative frequency of the 10 most common 'n_pkt_src' values in
# df_label0 (rows with label_n == 0).
# NOTE(review): the variable name, title and axis label all say "sport", but the
# column actually read is 'n_pkt_src' — confirm which one was intended.
# The name sport1 is reused from the previous cell and overwrites its value.
sport1 = df_label0['n_pkt_src'].value_counts(normalize=True).head(10)
print(sport1)
# Replace the value labels with their rank (1 = most frequent). The range is
# sized from the data so that fewer than 10 distinct values no longer raises a
# length-mismatch error.
sport1.index = range(1, len(sport1) + 1)
plt.figure(figsize=(10, 5))
plt.bar(sport1.index, sport1.values)
plt.title('Sport of normal')
plt.xlabel('Sport')
plt.ylabel('Frequency')
plt.show()

In [52]:
# Preview the first 30 rows. Per the original note, the date range of interest is
# '2021-04-19 15:37:19.989214' to '2021-04-19 16:12:14.167723'.
df_attack.head(30)

Unnamed: 0,Time,mac_s,mac_d,ip_s,ip_d,sport,dport,proto,flags,size,modbus_fn,n_pkt_src,n_pkt_dst,modbus_response,label_n,label
0,2021-04-19 15:37:19.989214,00:80:f4:03:fb:12,74:46:a0:bd:a7:1b,84.3.251.18,84.3.251.20,502,61315,Modbus,11000,64,Read Coils Response,0,0,0,0,normal
1,2021-04-19 15:37:19.990641,74:46:a0:bd:a7:1b,e6:3f:ac:c9:a8:8c,84.3.251.20,84.3.251.101,61316,502,Modbus,11000,66,Read Holding Registers,0,0,-1,0,normal
2,2021-04-19 15:37:19.990645,74:46:a0:bd:a7:1b,0a:fe:ec:47:74:fb,84.3.251.20,84.3.251.102,61318,502,Modbus,11000,66,Read Holding Registers,1,0,-1,0,normal
3,2021-04-19 15:37:19.990647,74:46:a0:bd:a7:1b,fa:00:bc:90:d7:fa,84.3.251.20,84.3.251.103,61317,502,Modbus,11000,66,Read Coils Request,2,0,-1,0,normal
4,2021-04-19 15:37:19.990943,e6:3f:ac:c9:a8:8c,74:46:a0:bd:a7:1b,84.3.251.101,84.3.251.20,502,61316,Modbus,11000,65,Read Holding Registers Response,0,1,0,0,normal
5,2021-04-19 15:37:19.990947,fa:00:bc:90:d7:fa,74:46:a0:bd:a7:1b,84.3.251.103,84.3.251.20,502,61317,Modbus,11000,64,Read Coils Response,0,2,0,0,normal
6,2021-04-19 15:37:19.990948,0a:fe:ec:47:74:fb,74:46:a0:bd:a7:1b,84.3.251.102,84.3.251.20,502,61318,Modbus,11000,65,Read Holding Registers Response,0,3,0,0,normal
7,2021-04-19 15:37:19.991545,74:46:a0:bd:a7:1b,00:80:f4:03:fb:12,84.3.251.20,84.3.251.18,61315,502,Modbus,11000,66,Read Coils Request,3,0,-1,0,normal
8,2021-04-19 15:37:19.993615,74:46:a0:bd:a7:1b,0a:fe:ec:47:74:fb,84.3.251.20,84.3.251.102,61318,502,Modbus,11000,66,Read Holding Registers,4,1,-1,0,normal
9,2021-04-19 15:37:19.993620,74:46:a0:bd:a7:1b,fa:00:bc:90:d7:fa,84.3.251.20,84.3.251.103,61317,502,Modbus,11000,66,Read Holding Registers,5,1,-1,0,normal
