## Analyse network data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os

In [3]:
# Make sure data is downloaded

if not os.path.exists('data'):
    os.makedirs('data')

In [2]:
# Open attack.csv

df_attack = pd.read_csv('data/dataset/Network datatset/csv/attack_4.csv', delimiter=',', header=0) 
# 5527409 rows attack_1
# 5159469 rows attack_2
# 5862547 rows attack_3
# 5131109 rows attack_4

In [9]:
df_attack.shape

(5131109, 16)

In [10]:
# Change the name of the columns by removing the spaces
df_attack.columns = df_attack.columns.str.replace(' ', '')

In [None]:
# Open normal.csv 

df_normal = pd.read_csv('data/dataset/Network datatset/csv/normal.csv', delimiter=',', header=0, nrows=5000000)

In [11]:
# Remove duplicates from both dataframes

df_attack = df_attack.drop_duplicates()
# df_normal = df_normal.drop_duplicates()

In [11]:
def get_columns_info(column_name):
    print('Column name: ', column_name)
    print('Number of unique values in attack: ', df_attack[column_name].nunique())
    print('Number of missing values in attack: ', df_attack[column_name].isnull().sum())
    print('Number of values in attack: ', df_attack[column_name].count())
    print('Values in attack: ', df_attack[column_name].unique())
    print('Values count: ', df_attack[column_name].value_counts())

# Début Rapport d'analyse

problème avec ces colonne : 'ip_s', 'ip_d', 'sport', 'dport', 'flags', 'n_pkt_src', 'n_pkt_dst', 'modbus_fn', 'modbus_response'
raison : missing some values

problème avec modbus_fn, lot of missing value (153121 for attack1), useless column ? Même chose avec modbus_response : Colonne lié au proto modbus et donc pas toute les colonne

La colonne proto contient les protocoles ARP ICMP Modbus TCP pour attack mais seulement Modbus et TCP pour le cas normal

Problème de typage : 'sport', 'dport', 'flags', 'n_pkt_src', 'n_pkt_dst'

Certaine dates sont partagé pour plusieurs entrées (ex : 2021-04-09 18:42:46.997219 a 4 entrés)

Le s dans ip_s, sport est pour source et le d pour destination

Il y a des cas noté normaux dans les csv attack, besoin du csv normal ?

different types d'attaques : 

 - ['normal' 'anomaly' 'MITM' 'physical fault'] pour attack_1
 - ['normal' 'physical fault' 'anomaly' 'DoS' 'MITM'] pour attack_3
 - ['normal' 'scan' 'DoS' 'physical fault' 'MITM'] pour attack_4 et attack_2


In [12]:
get_columns_info('label')

Column name:  label
Number of unique values in attack:  5
Number of missing values in attack:  0
Number of values in attack:  4774086
Values in attack:  ['normal' 'scan' 'DoS' 'physical fault' 'MITM']
Values count:  label
normal            4093168
physical fault     277282
MITM               216955
DoS                186651
scan                   30
Name: count, dtype: int64


In [11]:
df_attack.columns

Index(['Time', 'mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'proto',
       'flags', 'size', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst',
       'modbus_response', 'label_n', 'label'],
      dtype='object')

In [14]:
# Remove NaN values from the column list

print("Before removing NaN values: ", df_attack.shape)
column_NaN = ['mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'flags', 'proto', 'size', 'n_pkt_src', 'n_pkt_dst','label']

df_attack = df_attack.dropna(subset=column_NaN)
print("After removing NaN values: ", df_attack.shape)

Before removing NaN values:  (5131109, 16)
After removing NaN values:  (4350185, 16)


In [15]:
# Convert the columns to the right type

column_type = ['sport', 'dport', 'flags', 'n_pkt_src', 'n_pkt_dst']

for column in column_type:
    df_attack[column] = df_attack[column].astype('int64')

# Remove brackets from the value of modbus_response and convert to int ignore NaN values

df_attack['modbus_response'] = df_attack['modbus_response'].str.replace('[','').str.replace(']','').astype('float64')

In [26]:
print(df_attack['modbus_response'].isna().sum())

1436332


In [16]:
df_attack['modbus_response'] = df_attack['modbus_response'].fillna(-1) # -1 means that the value is missing
df_attack['modbus_response'] = df_attack['modbus_response'].astype('int64')

In [17]:
df_attack.dtypes

Time               object
mac_s              object
mac_d              object
ip_s               object
ip_d               object
sport               int64
dport               int64
proto              object
flags               int64
size                int64
modbus_fn          object
modbus_response     int64
n_pkt_src           int64
n_pkt_dst           int64
label_n             int64
label              object
dtype: object

In [None]:
# Remove the Time

# df_attack = df_attack.drop(['Time'], axis=1)

In [18]:
# Save the cleaned dataframes

df_attack.to_csv('data/dataset/Network datatset/clean_csv/attack_4_cleaned.csv', index=False)

In [23]:
# KNN algorithm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix



In [4]:
# Load the cleaned dataframes

df_attack = pd.read_csv('data/dataset/Network datatset/clean_csv/attack_2_cleaned.csv', delimiter=',', header=0)

In [8]:
df_attack.shape

(4774086, 16)

In [9]:
# train_test_split

df_attack['modbus_fn'].value_counts()

modbus_fn
Read Holding Registers Response    1430074
Read Holding Registers             1429906
Read Coils Request                 1257153
Read Coils Response                1257153
Name: count, dtype: int64

In [24]:
df_attack.columns

Index(['mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'proto', 'flags',
       'size', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst', 'modbus_response',
       'label_n', 'label'],
      dtype='object')

In [25]:
def get_infos_col(col_name):
    print('Number of unique values: ', df_attack[col_name].nunique())
    print('Number of missing values: ', df_attack[col_name].isnull().sum())
    print('Number of values: ', df_attack[col_name].count())
    print('Values: ', df_attack[col_name].unique())

In [13]:
df_attack.info

<bound method DataFrame.info of                                Time              mac_s              mac_d  \
0        2021-04-19 15:37:19.989214  00:80:f4:03:fb:12  74:46:a0:bd:a7:1b   
1        2021-04-19 15:37:19.990641  74:46:a0:bd:a7:1b  e6:3f:ac:c9:a8:8c   
2        2021-04-19 15:37:19.990645  74:46:a0:bd:a7:1b  0a:fe:ec:47:74:fb   
3        2021-04-19 15:37:19.990647  74:46:a0:bd:a7:1b  fa:00:bc:90:d7:fa   
4        2021-04-19 15:37:19.990943  e6:3f:ac:c9:a8:8c  74:46:a0:bd:a7:1b   
...                             ...                ...                ...   
4774081  2021-04-19 16:12:14.165046  0a:fe:ec:47:74:fb  74:46:a0:bd:a7:1b   
4774082  2021-04-19 16:12:14.165050  e6:3f:ac:c9:a8:8c  74:46:a0:bd:a7:1b   
4774083  2021-04-19 16:12:14.166361  00:0c:29:47:8c:22  00:80:f4:03:fb:12   
4774084  2021-04-19 16:12:14.167706  74:46:a0:bd:a7:1b  0a:fe:ec:47:74:fb   
4774085  2021-04-19 16:12:14.167723  74:46:a0:bd:a7:1b  e6:3f:ac:c9:a8:8c   

                 ip_s          ip_d  sport 