# ***SWaT Dataset - Anomaly Detection***

* Clovis Lechien
* Alexandre Devaux-Rivière
* Florian Segard-Gahery
* Valentin San
* Maël Reynaud

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from os import listdir
import re

import mlsecu.data_exploration_utils as deu
import mlsecu.data_preparation_utils as dpu
import mlsecu.anomaly_detection_use_case as adu

%matplotlib inline

In [2]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [3]:
!ls 'data/swat/'

22June2020_1.xlsx  22June2020_2.xlsx  29June2020_1.xlsx  29June2020_2.xlsx


In [4]:
def get_summary(df : pd.DataFrame) -> pd.DataFrame:
    df_desc = pd.DataFrame(df.describe(include='all').T)
    df_summary = pd.DataFrame({
        'dtype': df.dtypes,
        'unique':df.nunique().values,
        'missing': df.isna().sum().values,
        'duplicates': df.duplicated().sum(),
        'min': df_desc['min'].values,
        'max': df_desc['max'].values,
        'avg': df_desc['mean'].values,
        'std dev': df_desc['std'].values
    })
    return df_summary

In [5]:
def list_dir(path : str, pattern : str) -> list[str]:
    return [file for file in listdir(path) if re.search(pattern, file)]

In [6]:
def preproc_df(df : pd.DataFrame) -> pd.DataFrame:
    return df[df.columns.drop(list(df.filter(regex='Unnamed')))]

In [7]:
def concat_dfs(dfs_path : list[str]) -> pd.DataFrame:
    list_of_dfs = []
    for df_path in dfs_path:
        df = preproc_df(pd.read_excel(df_path))
        list_of_dfs.append(df)
    return pd.concat(list_of_dfs, axis=0)

In [8]:
path = 'data/swat/'

pattern = r"^.*\.xlsx$"
dfs = [path + file for file in list_dir(path, pattern)]
dfs

['data/swat/22June2020_1.xlsx',
 'data/swat/22June2020_2.xlsx',
 'data/swat/29June2020_2.xlsx',
 'data/swat/29June2020_1.xlsx']

In [11]:
df = concat_dfs(dfs)
df

Unnamed: 0,t_stamp,P1_STATE,LIT101.Pv,FIT101.Pv,MV101.Status,P101.Status,P102.Status,P2_STATE,FIT201.Pv,AIT201.Pv,...,P602.Status,P603.Status,LSH601.Alarm,LSL601.Alarm,LSH602.Alarm,LSL602.Alarm,LSH603.Alarm,LSL603.Alarm,d_stamp,t_stamp.1
0,2020-06-22 10:00:00,3,695.2841,0.0,1,1,1,2,0.000256,18.072288,...,1,1.0,Inactive,Inactive,Active,Inactive,Inactive,Active,NaT,NaT
1,2020-06-22 10:00:01,3,695.1271,0.0,1,1,1,2,0.000256,18.104332,...,1,1.0,Inactive,Inactive,Active,Inactive,Inactive,Active,NaT,NaT
2,2020-06-22 10:00:02.005000,3,694.930847,0.0,1,1,1,2,0.000256,18.104332,...,1,1.0,Inactive,Inactive,Active,Inactive,Inactive,Active,NaT,NaT
3,2020-06-22 10:00:03.010000,3,694.930847,0.0,1,1,1,2,0.000256,18.104332,...,1,1.0,Inactive,Inactive,Active,Inactive,Inactive,Active,NaT,NaT
4,2020-06-22 10:00:04.015000,3,694.8523,0.0,1,1,1,2,0.000256,18.104332,...,1,1.0,Inactive,Inactive,Active,Inactive,Inactive,Active,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7196,6/29/20 11:59:56,2,740.46405,0.0,1,2,1,2,2.353507,51.332993,...,1,,,,,,,,2020-06-29 11:59:56,2020-06-29 11:59:56
7197,6/29/20 11:59:57,2,740.1893,0.0,1,2,1,2,2.353507,51.332993,...,1,,,,,,,,2020-06-29 11:59:57,2020-06-29 11:59:57
7198,6/29/20 11:59:58,2,739.875244,0.0,1,2,1,2,2.353507,51.332993,...,1,,,,,,,,2020-06-29 11:59:58,2020-06-29 11:59:58
7199,6/29/20 11:59:59,2,739.129456,0.0,1,2,1,2,2.349918,51.332993,...,1,,,,,,,,2020-06-29 11:59:59,2020-06-29 11:59:59


In [24]:
df['t_stamp'] = pd.to_datetime(df['t_stamp'], errors='coerce')

In [27]:
df["t_stamp"].dt.normalize().unique()

<DatetimeArray>
['2020-06-22 00:00:00', '2020-06-29 00:00:00']
Length: 2, dtype: datetime64[ns]

In [31]:
['t_stamp',
 'P1_STATE',
 'P2_STATE',
 'P207.Status',
 'P208.Status',
 'LS201.Alarm',
 'LS202.Alarm',
 'LSL203.Alarm',
 'LSLL203.Alarm',
 'P3_STATE',
 'AIT301.Pv',
 'AIT302.Pv',
 'AIT303.Pv',
 'LIT301.Pv',
 'FIT301.Pv',
 'DPIT301.Pv',
 'MV301.Status',
 'MV302.Status',
 'MV303.Status',
 'MV304.Status',
 'P301.Status',
 'P302.Status',
 'PSH301.Alarm',
 'DPSH301.Alarm',
 'P4_STATE',
 'LIT401.Pv',
 'FIT401.Pv',
 'AIT401.Pv',
 'AIT402.Pv',
 'P401.Status',
 'P402.Status',
 'P403.Status',
 'P404.Status',
 'UV401.Status',
 'LS401.Alarm',
 'P5_STATE',
 'FIT501.Pv',
 'FIT502.Pv',
 'FIT503.Pv',
 'FIT504.Pv',
 'AIT501.Pv',
 'AIT502.Pv',
 'AIT503.Pv',
 'AIT504.Pv',
 'PIT501.Pv',
 'PIT502.Pv',
 'PIT503.Pv',
 'P501.Status',
 'P502.Status',
 'MV501.Status',
 'MV502.Status',
 'MV503.Status',
 'MV504.Status',
 'PSH501.Alarm',
 'PSL501.Alarm',
 'P6_STATE',
 'FIT601.Pv',
 'P601.Status',
 'P602.Status',
 'P603.Status',
 'LSH601.Alarm',
 'LSL601.Alarm',
 'LSH602.Alarm',
 'LSL602.Alarm',
 'LSH603.Alarm',
 'LSL603.Alarm',
 'd_stamp',
 't_stamp.1']

['t_stamp',
 'P1_STATE',
 'LIT101.Pv',
 'FIT101.Pv',
 'MV101.Status',
 'P101.Status',
 'P102.Status',
 'P2_STATE',
 'FIT201.Pv',
 'AIT201.Pv',
 'AIT202.Pv',
 'AIT203.Pv',
 'MV201.Status',
 'P201.Status',
 'P202.Status',
 'P203.Status',
 'P204.Status',
 'P205.Status',
 'P206.Status',
 'P207.Status',
 'P208.Status',
 'LS201.Alarm',
 'LS202.Alarm',
 'LSL203.Alarm',
 'LSLL203.Alarm',
 'P3_STATE',
 'AIT301.Pv',
 'AIT302.Pv',
 'AIT303.Pv',
 'LIT301.Pv',
 'FIT301.Pv',
 'DPIT301.Pv',
 'MV301.Status',
 'MV302.Status',
 'MV303.Status',
 'MV304.Status',
 'P301.Status',
 'P302.Status',
 'PSH301.Alarm',
 'DPSH301.Alarm',
 'P4_STATE',
 'LIT401.Pv',
 'FIT401.Pv',
 'AIT401.Pv',
 'AIT402.Pv',
 'P401.Status',
 'P402.Status',
 'P403.Status',
 'P404.Status',
 'UV401.Status',
 'LS401.Alarm',
 'P5_STATE',
 'FIT501.Pv',
 'FIT502.Pv',
 'FIT503.Pv',
 'FIT504.Pv',
 'AIT501.Pv',
 'AIT502.Pv',
 'AIT503.Pv',
 'AIT504.Pv',
 'PIT501.Pv',
 'PIT502.Pv',
 'PIT503.Pv',
 'P501.Status',
 'P502.Status',
 'MV501.Status',
 '

# ***Summary of the dataset***

In [12]:
get_summary(df).style.background_gradient(cmap='viridis_r', low=0.8)

Unnamed: 0,dtype,unique,missing,duplicates,min,max,avg,std dev
t_stamp,object,32402,0,0,,,,
P1_STATE,object,4,0,0,,,,
LIT101.Pv,object,5869,0,0,,,,
FIT101.Pv,object,307,0,0,,,,
MV101.Status,object,4,0,0,,,,
P101.Status,object,3,0,0,,,,
P102.Status,object,2,0,0,,,,
P2_STATE,object,3,0,0,,,,
FIT201.Pv,object,1003,0,0,,,,
AIT201.Pv,object,1818,0,0,,,,


In [13]:
def data_exploration(df : pd.DataFrame) -> None:
    dim = deu.get_nb_of_dimensions(df)
    print(bcolors.HEADER + 'Number of dimensions:' + bcolors.ENDC, dim, '\n')
    print(bcolors.HEADER + 'Number of rows:' + bcolors.ENDC, deu.get_nb_of_rows(df), '\n')
    print(bcolors.HEADER + 'Column names:' + bcolors.ENDC, deu.get_column_names(df), '\n')
    print(bcolors.HEADER + 'Number column names:' + bcolors.ENDC, deu.get_number_column_names(df), '\n')
    print(bcolors.HEADER + 'Object column names:' + bcolors.ENDC, deu.get_object_column_names(df), '\n')

    for i in range(dim):
        col = df.columns[i]
        print(bcolors.HEADER + f'Unique values of column [{col}]:' + bcolors.ENDC, deu.get_unique_values(df, col), '\n')

In [14]:
data_exploration(df)

[95mNumber of dimensions:[0m 84 

[95mNumber of rows:[0m 32402 

[95mColumn names:[0m ['t_stamp', 'P1_STATE', 'LIT101.Pv', 'FIT101.Pv', 'MV101.Status', 'P101.Status', 'P102.Status', 'P2_STATE', 'FIT201.Pv', 'AIT201.Pv', 'AIT202.Pv', 'AIT203.Pv', 'MV201.Status', 'P201.Status', 'P202.Status', 'P203.Status', 'P204.Status', 'P205.Status', 'P206.Status', 'P207.Status', 'P208.Status', 'LS201.Alarm', 'LS202.Alarm', 'LSL203.Alarm', 'LSLL203.Alarm', 'P3_STATE', 'AIT301.Pv', 'AIT302.Pv', 'AIT303.Pv', 'LIT301.Pv', 'FIT301.Pv', 'DPIT301.Pv', 'MV301.Status', 'MV302.Status', 'MV303.Status', 'MV304.Status', 'P301.Status', 'P302.Status', 'PSH301.Alarm', 'DPSH301.Alarm', 'P4_STATE', 'LIT401.Pv', 'FIT401.Pv', 'AIT401.Pv', 'AIT402.Pv', 'P401.Status', 'P402.Status', 'P403.Status', 'P404.Status', 'UV401.Status', 'LS401.Alarm', 'P5_STATE', 'FIT501.Pv', 'FIT502.Pv', 'FIT503.Pv', 'FIT504.Pv', 'AIT501.Pv', 'AIT502.Pv', 'AIT503.Pv', 'AIT504.Pv', 'PIT501.Pv', 'PIT502.Pv', 'PIT503.Pv', 'P501.Status', 'P502.

# ***SWaT System Overview***

* P1: Raw Water Storage - Model-Based Monitoring System
    * MV101
    * FIT101
    * LIT101
    * T101
    * P101
    * P102
* P2: Chemical Dosing - Data-Driven / Model-Based Monitoring System
    * FIT201
    * AIT201
    * AIT202
    * AIT203
    * MV201
    * P201
    * P202
    * P203
    * P204
    * P205
    * P206
    * LS201
    * LS202
    * LSL203
    * LSLL203
* P3: Ultra-filtration (UF) - Model-Based Monitoring System
    * LIT301
    * FIT301
    * DPIT301
    * MV301
    * MV302
    * MV304
    * P301
    * P302
    * PSH301
    * DPSH301
* P4: Dechlorination - Model-Based Monitoring System
    * LIT401
    * FIT401
    * AIT401
    * AIT402
    * P401
    * P402
    * P403
    * P404
    * UV401
    * LS401
* P5: Reverse Osmosis (RO) - Data-Driven Monitoring System
    * FIT501
    * FIT502
    * FIT503
    * FIT504
    * AIT501
    * AIT502
    * AIT503
    * AIT504
    * PIT501
    * PIT502
    * PIT503
    * P501
    * P502
    * MV501
    * MV502
    * MV503
    * MV504
    * PSH501
    * PSL501
* P6: RO Permeate transfer, UF backwash - Data-Driven Monitoring System
    * FIT601
    * P601
    * P602
    * LSH601
    * LSL601
    * LSH602
    * LSL602
    * LSH603
    * LSL603

In [15]:
stamps = df.filter(regex='.*stamp.*').copy()
p1 = df.filter(regex='P1.*|.*10.*').copy()
p2 = df.filter(regex='P2.*|.*20.*').copy()
p3 = df.filter(regex='P3.*|.*30.*').copy()
p4 = df.filter(regex='P4.*|.*40.*').copy()
p5 = df.filter(regex='P5.*|.*50.*').copy()
p6 = df.filter(regex='P6.*|.*60.*').copy()

In [18]:
p1.columns

Index(['P1_STATE', 'LIT101.Pv', 'FIT101.Pv', 'MV101.Status', 'P101.Status',
       'P102.Status'],
      dtype='object')

In [55]:
def check_sum_columns(ref: pd.DataFrame, list_df : list[pd.DataFrame]) -> bool | None:
    if ref is None or list_df is None:
        return None
    cumsum = 0
    for df_ in list_df:
        cumsum += deu.get_nb_of_dimensions(df_)
    return cumsum == deu.get_nb_of_dimensions(ref)

check_sum_columns(df, [stamps, p1, p2, p3, p4, p5, p6])

True

# ***Attack types***

Total number of attacks: 36.

-> ça vient du papier..

## ***Single Stage Single Point (SSSP)***
Targets a single sensor at a single point in time.

In [None]:
# TODO

## ***Single Stage Multi Point (SSMP)***
Targets multiple sensors at a single point in time.

In [30]:
# TODO

## ***Multi Stage Single Point (MSSP)***
Targets a single sensor at multiple points in time.

In [None]:
# TODO

 ## ***Multi Stage Multi Point (MSMP)***
Targets multiple sensors at multiple points in time.

In [None]:
# TODO

# ***Learning Algorithms***

## ***Isolation Forest***

In [58]:
# uncomment this for a giant fucking pavé sa mère
# list_if_outliers = adu.get_list_of_if_outliers(df, 0.1)

## ***Local Outlier Factor***

In [59]:
# uncomment this for a giant fucking pavé sa mère
# list_lof_outliers = adu.get_list_of_lof_outliers(df, 0.1)