In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import mlsecu.data_exploration_utils as deu
import mlsecu.data_preparation_utils as dpu
import mlsecu.anomaly_detection_use_case as adu

from os import listdir

%matplotlib inline

# Anomaly detection for tracking attacks

This notebook contains our work for the objective "Anomaly detection for tracking attacks". We chose to use the dataset **A Hardware-in-the-Loop Water Distribution Testbed Dataset for Cyber-Physical Security Testing**.

Our group is composed of *Clovis Lechien*, *Florian Segard-Gahery*, *Valentin San*, *Alexandre Devaux-Rivière* and *Maël Reynaud*.

## Dataset understanding and cleaning

There are two types of sub-datasets in the dataset: one covers network activity and the other covers physical activity. We will first have a look at the network ones, then the physical ones.

---

In [2]:
def get_summary(df):
    """
    Build a one-row-per-column summary of any dataset.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to summarise.

    Returns
    -------
    pd.DataFrame
        Indexed by the column names of ``df``, with the columns:
        - 'dtype': dtype of the column
        - 'unique': number of distinct values (as counted by ``nunique``)
        - 'missing': number of NaN values
        - 'duplicates': number of fully duplicated rows in ``df``; this is a
          dataset-wide count, so the same value is repeated on every row
        - 'min', 'max', 'avg', 'std dev': statistics taken from
          ``df.describe(include='all')``, NaN where describe reports none
    """
    # describe() only emits min/max/mean/std when at least one column supports
    # them. Reindexing guarantees the four columns exist (filled with NaN), so
    # a frame made only of object columns no longer raises a KeyError below.
    df_desc = df.describe(include='all').T.reindex(columns=['min', 'max', 'mean', 'std'])
    df_summary = pd.DataFrame({
        # df.dtypes is a Series indexed by column name: it provides the index
        # of the summary, the other entries are aligned by position.
        'dtype': df.dtypes,
        'unique': df.nunique().values,
        'missing': df.isna().sum().values,
        'duplicates': df.duplicated().sum(),
        'min': df_desc['min'].values,
        'max': df_desc['max'].values,
        'avg': df_desc['mean'].values,
        'std dev': df_desc['std'].values
    })
    return df_summary

### Network datasets

First, let's open each sub dataset and have a look at the different columns and values it contains.

#### First dataset

In [3]:
df_net_attack_1 = pd.read_csv('./data/dataset/Network datatset/csv/attack_1.csv')
df_net_attack_1.head()

Unnamed: 0,Time,mac_s,mac_d,ip_s,ip_d,sport,dport,proto,flags,size,modbus_fn,n_pkt_src,n_pkt_dst,modbus_response,label_n,label
0,2021-04-09 18:23:28.385003,74:46:a0:bd:a7:1b,0a:fe:ec:47:74:fb,84.3.251.20,84.3.251.102,56667.0,502.0,Modbus,11000.0,66,Read Coils Request,0.0,0.0,,0,normal
1,2021-04-09 18:23:28.385005,74:46:a0:bd:a7:1b,e6:3f:ac:c9:a8:8c,84.3.251.20,84.3.251.101,56666.0,502.0,Modbus,11000.0,66,Read Coils Request,1.0,0.0,,0,normal
2,2021-04-09 18:23:28.385006,74:46:a0:bd:a7:1b,fa:00:bc:90:d7:fa,84.3.251.20,84.3.251.103,56668.0,502.0,Modbus,11000.0,66,Read Coils Request,2.0,0.0,,0,normal
3,2021-04-09 18:23:28.385484,0a:fe:ec:47:74:fb,74:46:a0:bd:a7:1b,84.3.251.102,84.3.251.20,502.0,56667.0,Modbus,11000.0,64,Read Coils Response,0.0,0.0,[0],0,normal
4,2021-04-09 18:23:28.385486,fa:00:bc:90:d7:fa,74:46:a0:bd:a7:1b,84.3.251.103,84.3.251.20,502.0,56668.0,Modbus,11000.0,64,Read Coils Response,0.0,1.0,[0],0,normal


In [4]:
get_summary(df_net_attack_1).style.background_gradient(cmap='viridis_r', low=0.8)

Unnamed: 0,dtype,unique,missing,duplicates,min,max,avg,std dev
Time,object,5242099,0,2,,,,
mac_s,object,8,0,2,,,,
mac_d,object,9,0,2,,,,
ip_s,object,7,475,2,,,,
ip_d,object,7,475,2,,,,
sport,float64,8445,515,2,25.0,60999.0,28496.742642,27966.236571
dport,float64,8441,515,2,25.0,60999.0,28370.122168,27984.759506
proto,object,4,0,2,,,,
flags,float64,7,515,2,10.0,11000.0,10916.886999,839.861839
size,int64,7,0,2,60.0,78.0,65.327815,1.400719


In [5]:
# Distinct textual labels carried by the rows flagged label_n == 0
# (column names still have their leading space at this point).
df_net_attack_1.loc[df_net_attack_1[" label_n"] == 0, " label"].unique()

array(['normal'], dtype=object)

In [6]:
# Distinct textual labels carried by the rows flagged label_n == 1
# (column names still have their leading space at this point).
df_net_attack_1.loc[df_net_attack_1[" label_n"] == 1, " label"].unique()

array(['anomaly', 'MITM', 'physical fault'], dtype=object)

The dataset file name can be misleading: this first dataset, called **attack_1.csv**, contains attack recordings but also normal ones.

Also, the column names contain leading whitespace, so we had better strip it now to be able to concatenate the datasets afterwards.

In [7]:
df_net_attack_1.columns

Index(['Time', ' mac_s', ' mac_d', ' ip_s', ' ip_d', ' sport', ' dport',
       ' proto', ' flags', ' size', ' modbus_fn', ' n_pkt_src', ' n_pkt_dst',
       ' modbus_response', ' label_n', ' label'],
      dtype='object')

In [8]:
df_net_attack_1.columns = df_net_attack_1.columns.str.strip()

In [9]:
df_net_attack_1.columns

Index(['Time', 'mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'proto',
       'flags', 'size', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst',
       'modbus_response', 'label_n', 'label'],
      dtype='object')

#### Second dataset

In [10]:
df_net_attack_2 = pd.read_csv('./data/dataset/Network datatset/csv/attack_2.csv')
df_net_attack_2.head()

Unnamed: 0,Time,mac_s,mac_d,ip_s,ip_d,sport,dport,proto,flags,size,modbus_fn,n_pkt_src,n_pkt_dst,modbus_response,label_n,label
0,2021-04-19 15:37:19.989214,00:80:f4:03:fb:12,74:46:a0:bd:a7:1b,84.3.251.18,84.3.251.20,502.0,61315.0,Modbus,11000.0,64,Read Coils Response,0.0,0.0,[0],0,normal
1,2021-04-19 15:37:19.990641,74:46:a0:bd:a7:1b,e6:3f:ac:c9:a8:8c,84.3.251.20,84.3.251.101,61316.0,502.0,Modbus,11000.0,66,Read Holding Registers,0.0,0.0,,0,normal
2,2021-04-19 15:37:19.990645,74:46:a0:bd:a7:1b,0a:fe:ec:47:74:fb,84.3.251.20,84.3.251.102,61318.0,502.0,Modbus,11000.0,66,Read Holding Registers,1.0,0.0,,0,normal
3,2021-04-19 15:37:19.990647,74:46:a0:bd:a7:1b,fa:00:bc:90:d7:fa,84.3.251.20,84.3.251.103,61317.0,502.0,Modbus,11000.0,66,Read Coils Request,2.0,0.0,,0,normal
4,2021-04-19 15:37:19.990943,e6:3f:ac:c9:a8:8c,74:46:a0:bd:a7:1b,84.3.251.101,84.3.251.20,502.0,61316.0,Modbus,11000.0,65,Read Holding Registers Response,0.0,1.0,[0],0,normal


In [11]:
get_summary(df_net_attack_2).style.background_gradient(cmap='viridis_r', low=0.8)

Unnamed: 0,dtype,unique,missing,duplicates,min,max,avg,std dev
Time,object,5157725,0,349,,,,
mac_s,object,8,0,349,,,,
mac_d,object,9,0,349,,,,
ip_s,object,8,276,349,,,,
ip_d,object,8,276,349,,,,
sport,float64,23748,385383,349,25.0,61646.0,30643.369958,30218.012482
dport,float64,9475,385383,349,25.0,61646.0,30469.67992,30275.37482
proto,object,4,0,349,,,,
flags,float64,12,385383,349,0.0,11000010.0,10881.941764,13354.24013
size,int64,10,0,349,60.0,82.0,64.969746,1.948424


In [12]:
df_net_attack_2.columns = df_net_attack_2.columns.str.strip()

#### Third dataset

In [13]:
df_net_attack_3 = pd.read_csv('./data/dataset/Network datatset/csv/attack_3.csv')
df_net_attack_3.head()

Unnamed: 0,Time,mac_s,mac_d,ip_s,ip_d,sport,dport,proto,flags,size,modbus_fn,n_pkt_src,n_pkt_dst,modbus_response,label_n,label
0,2021-04-09 19:42:13.484804,00:80:f4:03:fb:12,74:46:a0:bd:a7:1b,84.3.251.18,84.3.251.20,502.0,57939.0,Modbus,11000.0,64,Read Coils Response,0.0,0.0,[0],0,normal
1,2021-04-09 19:42:13.487062,74:46:a0:bd:a7:1b,0a:fe:ec:47:74:fb,84.3.251.20,84.3.251.102,57940.0,502.0,Modbus,11000.0,66,Read Holding Registers,0.0,0.0,,0,normal
2,2021-04-09 19:42:13.487078,74:46:a0:bd:a7:1b,fa:00:bc:90:d7:fa,84.3.251.20,84.3.251.103,57942.0,502.0,Modbus,11000.0,66,Read Holding Registers,1.0,0.0,,0,normal
3,2021-04-09 19:42:13.487079,74:46:a0:bd:a7:1b,00:80:f4:03:fb:12,84.3.251.20,84.3.251.18,57939.0,502.0,Modbus,11000.0,66,Read Coils Request,2.0,0.0,,0,normal
4,2021-04-09 19:42:13.487080,74:46:a0:bd:a7:1b,e6:3f:ac:c9:a8:8c,84.3.251.20,84.3.251.101,57941.0,502.0,Modbus,11000.0,66,Read Holding Registers,3.0,0.0,,0,normal


In [14]:
get_summary(df_net_attack_3).style.background_gradient(cmap='viridis_r', low=0.8)

Unnamed: 0,dtype,unique,missing,duplicates,min,max,avg,std dev
Time,object,5678943,0,118270,,,,
mac_s,object,8,0,118270,,,,
mac_d,object,9,0,118270,,,,
ip_s,object,8,189,118270,,,,
ip_d,object,8,189,118270,,,,
sport,float64,5284,3066837,118270,502.0,60999.0,29144.076738,28617.223644
dport,float64,5276,3066837,118270,502.0,60999.0,29015.472308,28638.323882
proto,object,5,0,118270,,,,
flags,float64,9,3066837,118270,10.0,11000010.0,11011.663764,32208.195023
size,int64,11,0,118270,60.0,1514.0,775.441515,723.292807


In [15]:
df_net_attack_3.columns = df_net_attack_3.columns.str.strip()

#### Fourth dataset

In [16]:
df_net_normal = pd.read_csv('./data/dataset/Network datatset/csv/normal.csv')
df_net_normal.head()

Unnamed: 0,Time,mac_s,mac_d,ip_s,ip_d,sport,dport,proto,flags,size,modbus_fn,n_pkt_src,n_pkt_dst,modbus_response,label_n,label
0,2021-04-09 11:30:52.716203,74:46:a0:bd:a7:1b,fa:00:bc:90:d7:fa,84.3.251.20,84.3.251.103,61516,502,Modbus,11000,66,Read Holding Registers,0,0,,0,normal
1,2021-04-09 11:30:52.716499,fa:00:bc:90:d7:fa,74:46:a0:bd:a7:1b,84.3.251.103,84.3.251.20,502,61516,Modbus,11000,65,Read Holding Registers Response,0,0,[0],0,normal
2,2021-04-09 11:30:52.717334,74:46:a0:bd:a7:1b,e6:3f:ac:c9:a8:8c,84.3.251.20,84.3.251.101,61515,502,Modbus,11000,66,Read Holding Registers,1,0,,0,normal
3,2021-04-09 11:30:52.717624,e6:3f:ac:c9:a8:8c,74:46:a0:bd:a7:1b,84.3.251.101,84.3.251.20,502,61515,Modbus,11000,65,Read Holding Registers Response,0,1,[0],0,normal
4,2021-04-09 11:30:52.717952,00:80:f4:03:fb:12,74:46:a0:bd:a7:1b,84.3.251.18,84.3.251.20,502,61514,Modbus,11000,64,Read Coils Response,0,2,[0],0,normal


In [17]:
get_summary(df_net_normal).style.background_gradient(cmap='viridis_r', low=0.8)

Unnamed: 0,dtype,unique,missing,duplicates,min,max,avg,std dev
Time,object,7128712,0,0,,,,
mac_s,object,7,0,0,,,,
mac_d,object,7,0,0,,,,
ip_s,object,7,0,0,,,,
ip_d,object,7,0,0,,,,
sport,int64,10261,0,0,25.0,61517.0,30848.979497,30341.080063
dport,int64,10261,0,0,25.0,61517.0,30726.536132,30366.784408
proto,object,2,0,0,,,,
flags,int64,7,0,0,10.0,11000.0,10918.518061,831.540578
size,int64,7,0,0,60.0,78.0,65.32566,1.389839


In [18]:
df_net_normal.columns = df_net_normal.columns.str.strip()

As we can see, there are 16 different columns in those datasets, which correspond to the following:
- **Time**: Date of acquisition
- **mac_s**: Source MAC address
- **mac_d**: Destination MAC address
- **ip_s**: Source IP address
- **ip_d**: Destination IP address
- **sport**: Source port
- **dport**: Destination port
- **proto**: Protocol
- **flags**: TCP flags - CWR | ECN | URG | ACK | PSH | RST | SYN | FIN
- **size**: Size of the packet payload
- **modbus_fn**: MODBUS function code
- **n_pkt_src**: Number of packets of the same source address in the last 2 seconds
- **n_pkt_dst**: Number of packets of the same destination address in the last 2 seconds
- **modbus_response**: MODBUS response value
- **label_n**: 0 if normal, 1 if attack
- **label**: type of attack

Let's have a look at the date range of each dataset, by first converting the **Time** column to the *datetime* type:

# Parse the Time column of every network frame in place; errors='coerce'
# turns any value that cannot be parsed into NaT instead of raising.
for net_frame in (df_net_attack_1, df_net_attack_2, df_net_attack_3, df_net_normal):
    net_frame['Time'] = pd.to_datetime(net_frame['Time'], errors='coerce')

In [20]:
# Show the distinct calendar days covered by each network frame.
# Single quotes are used inside the replacement fields: reusing the enclosing
# double quote within an f-string is a SyntaxError before Python 3.12.
print(f"Unique days of df_net_attack_1: {df_net_attack_1['Time'].dt.normalize().unique().tolist()}")
print(f"Unique days of df_net_attack_2: {df_net_attack_2['Time'].dt.normalize().unique().tolist()}")
print(f"Unique days of df_net_attack_3: {df_net_attack_3['Time'].dt.normalize().unique().tolist()}")
print(f"Unique days of df_net_normal: {df_net_normal['Time'].dt.normalize().unique().tolist()}")

Unique days of df_net_attack_1: [Timestamp('2021-04-09 00:00:00'), NaT]
Unique days of df_net_attack_2: [Timestamp('2021-04-19 00:00:00'), NaT]
Unique days of df_net_attack_3: [Timestamp('2021-04-09 00:00:00'), NaT]
Unique days of df_net_normal: [Timestamp('2021-04-09 00:00:00'), NaT]


Let's now concatenate all the datasets and remove the **duplicates** and **NaN** values shown in the datasets' summaries. We merge first and only then remove duplicates, because there might be duplicates across dataset parts, especially between *df_net_attack_1* and *df_net_attack_3*, which were recorded on the same day.

In [21]:
# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it
# each part keeps its own index starting at 0, so the labels repeat across
# parts and any later label-based access or index-aligned assignment becomes
# ambiguous. Row content and row count are unchanged.
df_net = pd.concat(
    [df_net_attack_1, df_net_attack_2, df_net_attack_3, df_net_normal],
    ignore_index=True,
)

In [22]:
df_net.columns

Index(['Time', 'mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'proto',
       'flags', 'size', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst',
       'modbus_response', 'label_n', 'label'],
      dtype='object')

In [23]:
len(df_net)

24306714

In [24]:
len(df_net[df_net["label_n"] == 0]), len(df_net[df_net["label_n"] == 1])

(17608422, 6698292)

In [25]:
# Remove every row holding at least one NaN, then every fully duplicated row.
# NOTE(review): dropna() is called without `subset`, so a NaN in any column
# discards the row. The head() previews show modbus_response empty on request
# packets, which would remove all of them here - confirm this is intended.
df_net = df_net.dropna().drop_duplicates()

In [26]:
len(df_net)

10130285

In [27]:
len(df_net[df_net["label_n"] == 0]), len(df_net[df_net["label_n"] == 1])

(8565851, 1564434)

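As we can see, we drop a lot of rows (about 58% of the dataset), and proportionally far more attack rows than normal ones: roughly 77% of the attack rows are removed, against roughly 51% of the normal rows.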

### Physical datasets

---

TODO