# ***SWaT Dataset - Anomaly Detection***

* Clovis Lechien
* Alexandre Devaux-Rivière
* Florian Segard-Gahery
* Valentin San
* Maël Reynaud

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from os import listdir
import re

import mlsecu.data_exploration_utils as deu
import mlsecu.data_preparation_utils as dpu
import mlsecu.anomaly_detection_use_case as adu

%matplotlib inline

In [25]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [7]:
!ls 'data/swat/'

22June2020_1.xlsx  22June2020_2.xlsx  29June2020_1.xlsx  29June2020_2.xlsx


In [8]:
def get_summary(df : pd.DataFrame) -> pd.DataFrame:
    df_desc = pd.DataFrame(df.describe(include='all').T)
    df_summary = pd.DataFrame({
        'dtype': df.dtypes,
        'unique':df.nunique().values,
        'missing': df.isna().sum().values,
        'duplicates': df.duplicated().sum(),
        'min': df_desc['min'].values,
        'max': df_desc['max'].values,
        'avg': df_desc['mean'].values,
        'std dev': df_desc['std'].values
    })
    return df_summary

In [9]:
def list_dir(path : str, pattern : str) -> list[str]:
    return [file for file in listdir(path) if re.search(pattern, file)]

In [10]:
def preproc_df(df : pd.DataFrame) -> pd.DataFrame:
    return df[df.columns.drop(list(df.filter(regex='Unnamed')))]

In [11]:
def concat_dfs(dfs_path : list[str]) -> pd.DataFrame:
    list_of_dfs = []
    for df_path in dfs_path:
        df = preproc_df(pd.read_excel(df_path))
        list_of_dfs.append(df)
    return pd.concat(list_of_dfs, axis=0)

In [12]:
path = 'data/swat/'

pattern = r"^.*\.xlsx$"
dfs = [path + file for file in list_dir(path, pattern)]
dfs

['data/swat/29June2020_1.xlsx',
 'data/swat/29June2020_2.xlsx',
 'data/swat/22June2020_1.xlsx',
 'data/swat/22June2020_2.xlsx']

In [13]:
df = concat_dfs(dfs)
df

Unnamed: 0,t_stamp,P1_STATE,LIT101.Pv,FIT101.Pv,MV101.Status,P101.Status,P102.Status,P2_STATE,FIT201.Pv,AIT201.Pv,...,LS401.Alarm,PSH501.Alarm,PSL501.Alarm,P603.Status,LSH601.Alarm,LSL601.Alarm,LSH602.Alarm,LSL602.Alarm,LSH603.Alarm,LSL603.Alarm
0,6/29/20 10:00:00,3,531.168335,0.0,1,2,1,2,2.344408,72.86593,...,,,,,,,,,,
1,6/29/20 10:00:01,3,530.422546,0.0,1,2,1,2,2.344408,72.86593,...,,,,,,,,,,
2,6/29/20 10:00:02,3,530.304749,0.0,1,2,1,2,2.344408,72.930016,...,,,,,,,,,,
3,6/29/20 10:00:03,3,529.402,0.0,1,2,1,2,2.344408,72.930016,...,,,,,,,,,,
4,6/29/20 10:00:04,3,529.5197,0.0,1,2,1,2,2.345433,73.026146,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,2020-06-22 09:59:55,3,695.1271,0,1,1,1,2,0.000256,17.97616,...,Inactive,Inactive,Inactive,1.0,Inactive,Inactive,Active,Inactive,Inactive,Active
3596,2020-06-22 09:59:56,3,694.9701,0,1,1,1,2,0.000256,17.97616,...,Inactive,Inactive,Inactive,1.0,Inactive,Inactive,Active,Inactive,Inactive,Active
3597,2020-06-22 09:59:57,3,694.734558,0,1,1,1,2,0.000256,17.97616,...,Inactive,Inactive,Inactive,1.0,Inactive,Inactive,Active,Inactive,Inactive,Active
3598,2020-06-22 09:59:58,3,694.8523,0,1,1,1,2,0.000256,17.97616,...,Inactive,Inactive,Inactive,1.0,Inactive,Inactive,Active,Inactive,Inactive,Active


# ***Summary of the dataset***

In [None]:
get_summary(df).style.background_gradient(cmap='viridis_r', low=0.8)

In [28]:
def data_exploration(df : pd.DataFrame) -> None:
    dim = deu.get_nb_of_dimensions(df)
    print(bcolors.HEADER + 'Number of dimensions:' + bcolors.ENDC, dim, '\n')
    print(bcolors.HEADER + 'Number of rows:' + bcolors.ENDC, deu.get_nb_of_rows(df), '\n')
    print(bcolors.HEADER + 'Column names:' + bcolors.ENDC, deu.get_column_names(df), '\n')
    print(bcolors.HEADER + 'Number column names:' + bcolors.ENDC, deu.get_number_column_names(df), '\n')
    print(bcolors.HEADER + 'Object column names:' + bcolors.ENDC, deu.get_object_column_names(df), '\n')

    for i in range(dim):
        col = df.columns[i]
        print(bcolors.HEADER + f'Unique values of column [{col}]:' + bcolors.ENDC, deu.get_unique_values(df, col), '\n')

In [29]:
data_exploration(df)

[95mNumber of dimensions:[0m 84 

[95mNumber of rows:[0m 32402 

[95mColumn names:[0m ['t_stamp', 'P1_STATE', 'LIT101.Pv', 'FIT101.Pv', 'MV101.Status', 'P101.Status', 'P102.Status', 'P2_STATE', 'FIT201.Pv', 'AIT201.Pv', 'AIT202.Pv', 'AIT203.Pv', 'MV201.Status', 'P201.Status', 'P202.Status', 'P203.Status', 'P204.Status', 'P205.Status', 'P206.Status', 'P3_STATE', 'LIT301.Pv', 'FIT301.Pv', 'DPIT301.Pv', 'MV301.Status', 'MV302.Status', 'MV303.Status', 'MV304.Status', 'P301.Status', 'P302.Status', 'P4_STATE', 'LIT401.Pv', 'FIT401.Pv', 'AIT401.Pv', 'AIT402.Pv', 'P401.Status', 'P402.Status', 'P403.Status', 'P404.Status', 'UV401.Status', 'P5_STATE', 'FIT501.Pv', 'FIT502.Pv', 'FIT503.Pv', 'FIT504.Pv', 'AIT501.Pv', 'AIT502.Pv', 'AIT503.Pv', 'AIT504.Pv', 'PIT501.Pv', 'PIT502.Pv', 'PIT503.Pv', 'P501.Status', 'P502.Status', 'MV501.Status', 'MV502.Status', 'MV503.Status', 'MV504.Status', 'P6_STATE', 'FIT601.Pv', 'P601.Status', 'P602.Status', 'd_stamp', 't_stamp.1', 'P207.Status', 'P208.Status'

# ***SWaT System Overview***

* P1: Raw Water Storage - Model-Based Monitoring System
    * MV101
    * FIT101
    * LIT101
    * T101
    * P101
    * P102
* P2: Chemical Dosing - Data-Driven / Model-Based Monitoring System
    * FIT201
    * AIT201
    * AIT202
    * AIT203
    * MV201
    * P201
    * P202
    * P203
    * P204
    * P205
    * P206
    * LS201
    * LS202
    * LSL203
    * LSLL203
* P3: Ultra-filtration (UF) - Model-Based Monitoring System
    * LIT301
    * FIT301
    * DPIT301
    * MV301
    * MV302
    * MV304
    * P301
    * P302
    * PSH301
    * DPSH301
* P4: Dechlorination - Model-Based Monitoring System
    * LIT401
    * FIT401
    * AIT401
    * AIT402
    * P401
    * P402
    * P403
    * P404
    * UV401
    * LS401
* P5: Reverse Osmosis (RO) - Data-Driven Monitoring System
    * FIT501
    * FIT502
    * FIT503
    * FIT504
    * AIT501
    * AIT502
    * AIT503
    * AIT504
    * PIT501
    * PIT502
    * PIT503
    * P501
    * P502
    * MV501
    * MV502
    * MV503
    * MV504
    * PSH501
    * PSL501
* P6: RO Permeate transfer, UF backwash - Data-Driven Monitoring System
    * FIT601
    * P601
    * P602
    * LSH601
    * LSL601
    * LSH602
    * LSL602
    * LSH603
    * LSL603

In [53]:
stamps = df.filter(regex='.*stamp.*').copy()
p1 = df.filter(regex='P1.*|.*10.*').copy()
p2 = df.filter(regex='P2.*|.*20.*').copy()
p3 = df.filter(regex='P3.*|.*30.*').copy()
p4 = df.filter(regex='P4.*|.*40.*').copy()
p5 = df.filter(regex='P5.*|.*50.*').copy()
p6 = df.filter(regex='P6.*|.*60.*').copy()

In [55]:
def check_sum_columns(ref: pd.DataFrame, list_df : list[pd.DataFrame]) -> bool | None:
    if ref is None or list_df is None:
        return None
    cumsum = 0
    for df_ in list_df:
        cumsum += deu.get_nb_of_dimensions(df_)
    return cumsum == deu.get_nb_of_dimensions(ref)

check_sum_columns(df, [stamps, p1, p2, p3, p4, p5, p6])

True

# ***Attack types***

Total number of attacks: 36.

-> ça vient du papier..

## ***Single Stage Single Point (SSSP)***
Targets a single sensor at a single point in time.

In [None]:
# TODO

## ***Single Stage Multi Point (SSMP)***
Targets multiple sensors at a single point in time.

In [30]:
# TODO

## ***Multi Stage Single Point (MSSP)***
Targets a single sensor at multiple points in time.

In [None]:
# TODO

 ## ***Multi Stage Multi Point (MSMP)***
Targets multiple sensors at multiple points in time.

In [None]:
# TODO

# ***Learning Algorithms***

## ***Isolation Forest***

In [58]:
# uncomment this for a giant fucking pavé sa mère
# list_if_outliers = adu.get_list_of_if_outliers(df, 0.1)

## ***Local Outlier Factor***

In [59]:
# uncomment this for a giant fucking pavé sa mère
# list_lof_outliers = adu.get_list_of_lof_outliers(df, 0.1)