# CLUSTERING 
### CLUSTERING BASED ANOMALY DETECTION

ANOMALY DETECTION WITH THE HELP OF CLUSTERING METHOD

#### DATASET : AIRQUALITY DATA
#### AUTHOR: AVINASH BAGUL

IMPORT REQUIRED LIBRARIES

In [1]:
import pycaret
import pandas as pd
import numpy as np

READING DATASET

In [2]:
df1 = pd.read_csv('iqr_benchmark.csv')
df = df1[df1['boxName']=='Bernem']
df = df.set_index('Time_stamp')
df = df.drop(df.iloc[:,0:1], axis = 1)
df = df.drop(df.iloc[:,-1:], axis = 1)

DATASET FEATURES COUNT DESCRIPTION

In [3]:
df.nunique()

PM 2.5                 2480
temp                   3067
pressure                 71
humidity                 87
wind_speed               84
Time of Day               4
Peak/NoPeak               2
Day                       7
Week Day                  2
Weather                  23
Weather Description      68
dtype: int64

DATAFRAME

In [4]:
df

Unnamed: 0_level_0,PM 2.5,temp,pressure,humidity,wind_speed,Time of Day,Peak/NoPeak,Day,Week Day,Weather,Weather Description
Time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-12-31 18:30:53,139.63,7.71,1032,100,3.09,Evening_Hours,Peak,Monday,Workday,Clouds,broken clouds
2018-12-31 18:35:01,136.43,7.71,1032,100,3.09,Evening_Hours,Peak,Monday,Workday,Clouds,broken clouds
2018-12-31 18:39:09,112.67,7.71,1032,100,3.09,Evening_Hours,Peak,Monday,Workday,Clouds,broken clouds
2018-12-31 18:43:17,46.87,7.71,1032,100,3.09,Evening_Hours,Peak,Monday,Workday,Clouds,broken clouds
2018-12-31 18:47:24,38.30,7.71,1032,100,3.09,Evening_Hours,Peak,Monday,Workday,Clouds,broken clouds
...,...,...,...,...,...,...,...,...,...,...,...
2020-02-28 18:11:12,3.93,6.00,1013,56,3.60,Evening_Hours,Peak,Friday,Workday,Clear,sky is clear
2020-02-28 18:14:31,4.33,6.00,1013,56,3.60,Evening_Hours,Peak,Friday,Workday,Clear,sky is clear
2020-02-28 18:21:01,3.93,6.00,1013,56,3.60,Evening_Hours,Peak,Friday,Workday,Clear,sky is clear
2020-02-28 18:24:01,4.10,6.00,1013,56,3.60,Evening_Hours,Peak,Friday,Workday,Clear,sky is clear


IMPORTING ANOMALY MODULE FROM PYCARET LIBRARY

In [5]:
from pycaret.anomaly import *
exp_ano=setup(df)


Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,8605
1,Original Data,"(162173, 11)"
2,Missing Values,False
3,Numeric Features,5
4,Categorical Features,6
5,Ordinal Features,False
6,High Cardinality Features,False
7,Transformed Data,"(162173, 111)"
8,Numeric Imputer,mean
9,Categorical Imputer,constant


CREATING CLUSTERING MODEL

In [6]:
import time

start = time.time()

cluster = create_model('cluster')
cluster_df = assign_model(cluster)
cluster_df

print('time: ', time.time() - start)

time:  87.52932095527649


In [7]:
cluster

CBLOF(alpha=0.9, beta=5, check_estimator=False, clustering_estimator=None,
   contamination=0.05, n_clusters=8, n_jobs=1, random_state=8605,
   use_weights=False)

CONVERTING LABELS

In [8]:
idf = cluster_df['Label']
ll = []
for i in idf:
    if i == 0:
        ll.append('normal')
    else:
        ll.append('abnormal')

print(len(ll))

162173


ANOMALY COUNT 

In [9]:
c = 0
for x in ll:
    if x == 'abnormal':
        c +=1

print(c)

8108


In [10]:
ff = df1[df1['boxName']=='Bernem']
ff = ff.set_index('Time_stamp')
anml = ff[ff['label']=='abnormal']

## EVALUATION

CALCULATING ACCURACY, PRECISION, RECALL AND F1-SCORE

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

obt_label = ll
gold_label = ff['label']
testy = obt_label
yhat_classes = gold_label

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(testy, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(testy, yhat_classes, pos_label = 'abnormal')
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(testy, yhat_classes, pos_label = 'abnormal')
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(testy, yhat_classes, pos_label = 'abnormal')
print('F1 score: %f' % f1)

Accuracy: 0.925999
Precision: 0.278983
Recall: 0.303034
F1 score: 0.290511
