# Isolation forest

In [1]:
from pathlib import Path
import os

os.chdir(Path(os.getcwd()).parent)

In [2]:
from AnomalyCableDetection.load import Loader, Preprocessor
from AnomalyCableDetection.stl import CableSTL, CrossCorrelation, AdjacencyType
from AnomalyCableDetection.plot import *

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
from os.path import join
from pathlib import Path

import matplotlib.dates as mdates
import matplotlib.pylab as plt
import seaborn as sns
import pandas as pd
import numpy as np
import glob
import os
import re

### Load pre1 dataset

In [4]:
pre1_path = join(Path(os.getcwd()), 'data', 'csv', 'preprocessed_1')
pre1_list = glob.glob(join(pre1_path, '*.csv'))
date_regex = r'(19|20)\d{2}-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[0-1])'
date_list = []
cable_dict = dict()

for pre1 in pre1_list:
    m = re.search(date_regex, pre1)
    date = m.group(0)
    date_list.append(date)
    cable_df = pd.read_csv(pre1, index_col=0)
    cable_dict.update({date: cable_df})

date_list.sort()
cable_list = cable_dict[date_list[0]].columns.tolist()
cable_list.remove('SJS13')
cable_list.remove('SJX08')
cable_list.remove('SJX13')

for date in date_list:
    df = cable_dict[date].loc[:, cable_list]
    cable_dict.update({date: df})

### Get scatter dataframe

In [5]:
def get_dup_cable_numbers(tension_df):
    cable_list = tension_df.columns.to_list()
    cable_numbers = []

    for cable in cable_list:
        cable_number = int(cable[-2:])
        cable_numbers.append(cable_number)

    cable_numbers = [x for i, x in enumerate(cable_numbers) if i != cable_numbers.index(x)]
    return cable_numbers

def get_scatter_df(tension_df, start=0, end=100):
    cable_numbers = get_dup_cable_numbers(tension_df)
    
    single_len = end - start
    index_len = single_len * len(cable_numbers)
    
    scatter_df = pd.DataFrame(index=[i for i in range(index_len)], columns=['x', 'y', 'cable'])
    for i, cable in enumerate(cable_numbers):
        cable_number = str(cable).zfill(2)
        x = tension_df.loc[start:end - 1, f'SJS{cable_number}'].to_list()
        y = tension_df.loc[start:end - 1, f'SJX{cable_number}'].to_list()
        scatter_df.loc[i * single_len:(i + 1) * single_len - 1, 'x'] = x
        scatter_df.loc[i * single_len:(i + 1) * single_len - 1, 'y'] = y
        scatter_df.loc[i * single_len:(i + 1) * single_len, 'cable'] = f'SJ{cable_number}'
        
    return scatter_df

In [6]:
scatter_dict = dict()
scatter_list = []

tmp_df = cable_dict[date_list[0]]
cable_numbers = get_dup_cable_numbers(tmp_df)
length = len(tmp_df.index.to_list())

for date in date_list:
    tension_df = cable_dict[date]
    scatter_df = get_scatter_df(tension_df, end=length)
    scatter_list.append(scatter_df)
    scatter_dict.update({date: scatter_df})
    
train_scatter_df = pd.concat(scatter_list)
test_scatter_df = scatter_dict['2011-11-01']
train_copy = train_scatter_df.copy()
test_copy = test_scatter_df.copy()

### Get train test set

In [7]:
cable_encode = LabelEncoder()
train_scatter_df['cable'] = cable_encode.fit_transform(train_scatter_df['cable'])
test_scatter_df['cable'] = cable_encode.transform(test_scatter_df['cable'])

### Isolation forest

In [8]:
model = IsolationForest(n_estimators=100, max_samples='auto', n_jobs=-1, max_features=2, contamination=0.025)
model.fit(train_scatter_df.to_numpy())

IsolationForest(contamination=0.025, max_features=2, n_jobs=-1)

In [9]:
score = model.decision_function(train_scatter_df.to_numpy())
anomaly = model.predict(train_scatter_df.to_numpy())
train_copy['scores'] = score
train_copy['anomaly'] = anomaly
anomaly_data = train_copy.loc[train_copy['anomaly'] == -1]
anomaly_data

Unnamed: 0,x,y,cable,scores,anomaly
974,28.917913,67.735468,SJ09,-0.000051,-1
975,30.485386,69.695832,SJ09,-0.011980,-1
976,32.121458,72.418257,SJ09,-0.021688,-1
977,33.174442,74.517176,SJ09,-0.025330,-1
978,33.937202,77.11259,SJ09,-0.030061,-1
...,...,...,...,...,...
863095,116.343209,163.109778,SJ14,-0.129097,-1
863096,112.362924,159.443548,SJ14,-0.129097,-1
863097,103.137979,151.421362,SJ14,-0.128044,-1
863098,83.969877,132.285685,SJ14,-0.107873,-1


In [16]:
anomaly_data['cable'].value_counts()

SJ09    59223
SJ14    58315
SJ12    48260
SJ11    31457
SJ10    18515
Name: cable, dtype: int64

In [13]:
test_score = model.decision_function(test_scatter_df.to_numpy())
test_anomaly = model.predict(test_scatter_df.to_numpy())
test_copy['scores'] = test_score
test_copy['anomaly'] = test_anomaly
test_anomaly_data = test_copy.loc[test_copy['anomaly'] == -1]
test_anomaly_data

Unnamed: 0,x,y,cable,scores,anomaly
93,54.4129,52.753613,SJ09,-0.003475,-1
94,58.261975,58.414104,SJ09,-0.018243,-1
95,60.388176,61.263048,SJ09,-0.021901,-1
96,59.261231,62.117355,SJ09,-0.021447,-1
97,53.836334,61.505272,SJ09,-0.019088,-1
...,...,...,...,...,...
863095,116.343209,163.109778,SJ14,-0.129097,-1
863096,112.362924,159.443548,SJ14,-0.129097,-1
863097,103.137979,151.421362,SJ14,-0.128044,-1
863098,83.969877,132.285685,SJ14,-0.107873,-1


In [17]:
test_anomaly_data['cable'].value_counts()

SJ14    18612
SJ09    17293
SJ12    15379
SJ11     7027
SJ10     6663
Name: cable, dtype: int64

### Hourly detection

In [47]:
def get_hourly_range(tension_df, hour_index):
    start = (7200 * hour_index) - 90
    end = start + 7200
    if hour_index == 0:
        start += 90
        end -= 90
    if hour_index == 23:
        end -= 90
        
    return start, end

hourly_dict = dict()
for date in cable_dict:
    tension_df = cable_dict[date]
    for i in range(24):
        start, end = get_hourly_range(tension_df, i)
        hourly_df = tension_df.loc[start:end-1, :]
        hourly_df.reset_index(drop=True, inplace=True)
        scatter_df = get_scatter_df(hourly_df, end=end-start)
        hourly_dict.update({(date, i): scatter_df})

In [48]:
cable_encode = LabelEncoder()
train_count_list = []
test_count_list = []

for i in range(24):
    train_list = []
    test_list = []
    for date in date_list:
        df = hourly_dict[(date, i)]
        train_list.append(df)
        if date == '2011-11-01':
            test_list.append(df)
    
    train_df = pd.concat(train_list)
    test_df = pd.concat(test_list)
    train_copy = train_df.copy()
    test_copy = test_df.copy()
    
    train_df['cable'] = cable_encode.fit_transform(train_df['cable'])
    test_df['cable'] = cable_encode.transform(test_df['cable'])
    
    model = IsolationForest(n_estimators=100, max_samples='auto', n_jobs=-1, max_features=2, contamination=0.02)
    model.fit(train_df.to_numpy())
    
    score = model.decision_function(train_df.to_numpy())
    anomaly = model.predict(train_df.to_numpy())
    train_copy['scores'] = score
    train_copy['anomaly'] = anomaly
    anomaly_data = train_copy.loc[train_copy['anomaly'] == -1]
    train_count_list.append(anomaly_data['cable'].value_counts())
    
    test_score = model.decision_function(test_df.to_numpy())
    test_anomaly = model.predict(test_df.to_numpy())
    test_copy['scores'] = test_score
    test_copy['anomaly'] = test_anomaly
    test_anomaly_data = test_copy.loc[test_copy['anomaly'] == -1]
    test_count_list.append(test_anomaly_data['cable'].value_counts())

In [49]:
train_count_list

[SJ14    2566
 SJ12    1666
 SJ09    1600
 SJ11     893
 SJ10     295
 Name: cable, dtype: int64,
 SJ14    2259
 SJ12    1826
 SJ09    1645
 SJ11    1088
 SJ10     382
 Name: cable, dtype: int64,
 SJ14    2221
 SJ09    1950
 SJ12    1617
 SJ11     988
 SJ10     424
 Name: cable, dtype: int64,
 SJ14    2020
 SJ09    1931
 SJ12    1836
 SJ11     862
 SJ10     551
 Name: cable, dtype: int64,
 SJ14    1932
 SJ12    1909
 SJ09    1679
 SJ11     901
 SJ10     779
 Name: cable, dtype: int64,
 SJ09    1977
 SJ14    1786
 SJ12    1722
 SJ11     994
 SJ10     721
 Name: cable, dtype: int64,
 SJ14    2399
 SJ09    1818
 SJ12    1579
 SJ11     750
 SJ10     653
 Name: cable, dtype: int64,
 SJ09    2145
 SJ14    1820
 SJ12    1716
 SJ11     803
 SJ10     716
 Name: cable, dtype: int64,
 SJ14    2113
 SJ09    1910
 SJ12    1701
 SJ11     810
 SJ10     666
 Name: cable, dtype: int64,
 SJ14    2336
 SJ09    1910
 SJ12    1595
 SJ11     875
 SJ10     484
 Name: cable, dtype: int64,
 SJ14    2173
 SJ09 

In [50]:
test_count_list

[SJ14    1020
 SJ12     633
 SJ09     549
 SJ11     202
 SJ10     166
 Name: cable, dtype: int64,
 SJ14    552
 SJ09    529
 SJ12    493
 SJ11    202
 SJ10    171
 Name: cable, dtype: int64,
 SJ14    538
 SJ09    463
 SJ12    332
 SJ11    159
 SJ10    118
 Name: cable, dtype: int64,
 SJ14    512
 SJ12    500
 SJ09    494
 SJ10    199
 SJ11    152
 Name: cable, dtype: int64,
 SJ14    621
 SJ12    547
 SJ09    487
 SJ10    270
 SJ11    193
 Name: cable, dtype: int64,
 SJ09    722
 SJ12    588
 SJ14    555
 SJ10    306
 SJ11    263
 Name: cable, dtype: int64,
 SJ14    832
 SJ09    687
 SJ12    665
 SJ10    353
 SJ11    252
 Name: cable, dtype: int64,
 SJ09    726
 SJ14    636
 SJ12    621
 SJ10    243
 SJ11    128
 Name: cable, dtype: int64,
 SJ14    650
 SJ09    613
 SJ12    505
 SJ10    265
 SJ11    158
 Name: cable, dtype: int64,
 SJ14    829
 SJ09    708
 SJ12    619
 SJ10    250
 SJ11    239
 Name: cable, dtype: int64,
 SJ14    554
 SJ09    506
 SJ12    422
 SJ10    217
 SJ11    160
