# Anomaly Detection

## Documentation

The objective of this script is to identify potential anomalies in the ISCS event logs beyond the alarms defined within the system. This is to be acheived via a combination of baselining, sliding/rolling window comparisons of event counts and event combinations, and clustering methods.

## Initialisation

### Load libraries

In [1]:
# Load libraries
import pandas as pd
import numpy as np
import datetime as dt
import time
import sys
import os
from textblob import TextBlob # New Update
import swifter # speeds up pandas apply functions on vectorised data
#import memory_profiler # for logging memory consumption at the function level
import psutil # or logging memory consumption at that point instant

### Enable Memory Logging

In [2]:
# Load memory profiler to log memory usage by function
# uses the "%memit" prefix before each function to start logging
%load_ext memory_profiler

In [3]:
p = psutil.Process()
# Get peak memory usage at that instant of time
print(p.memory_info().peak_wset / 1024 ** 2)

173.71484375


### Control Memory Reduction Treatment for CMS Data
CMS data is generally much larger and more varied than ATS or ECS data, resulting in higher RAM consumption which may exceed the computer's available RAM capacity.

In [4]:
DataReductionOn = True

In [5]:
# Functional Category Cluster to filter CMS data by
FuncCatCluster = [
                  "Depot", #0
                  "PowerSys", #1
                  "ITESS", #2
                  "Lighting", #3
                  "StationEquipment", #4
                  "StationFacilities", #5
                  "CCTV", #6
                  "FireProtection", #7
                  "PassengerInfo", #8
                  "Comms", #9
                  "SCS", #10
                  "LENV", #11
                  "Others", #12 (this is an optional catch all)
                  "AltRun", #13 (Exclude CCTV and PassengerInfo)
                  "All" #14
                  ]

# Define Filter Combo
targetFuncCatCluster = 13

### Configure display

In [6]:
# Enable display of all columns for dataframes with many variables
pd.set_option('display.max_columns', None)

### Set Up Directory

In [7]:
# Check current directory location

cwd = os.getcwd()
cwd

'C:\\Users\\schdadmin\\Documents\\IAMS Analytics\\Scripts'

In [8]:
# Define root file directory folder where the files are being stored
#os.chdir(cwd + alarmLoc)
os.chdir(os.path.dirname(os.getcwd()) + '\\alarm-event-logs')

# Check current directory location
cwd = os.getcwd()

# Check directory location
cwd

'C:\\Users\\schdadmin\\Documents\\IAMS Analytics\\alarm-event-logs'

In [9]:
# Location of tagged Event Files
taggedEventLoc = '\\taggedOutput\\Subset File Ver\\cms'

## Initial Tagging of Parameters

### Load AlarmList Files

In [10]:
# Define root file directory folder where the files are being stored
os.chdir(cwd + taggedEventLoc)
#os.chdir(cwd + eventLoc)
# Check directory location
os.getcwd()

'C:\\Users\\schdadmin\\Documents\\IAMS Analytics\\alarm-event-logs\\taggedOutput\\Subset File Ver\\cms'

In [11]:
# Inspect files in directory
fileList = os.listdir()
fileList

['CMS-B0001-000-alarmsTagged.csv',
 'CMS-B0001-001-alarmsTagged.csv',
 'CMS-B0001-002-alarmsTagged.csv',
 'CMS-B0001-003-alarmsTagged.csv',
 'CMS-B0001-004-alarmsTagged.csv',
 'CMS-B0001-005-alarmsTagged.csv',
 'CMS-B0001-006-alarmsTagged.csv',
 'CMS-B0001-007-alarmsTagged.csv',
 'CMS-B0001-008-alarmsTagged.csv',
 'CMS-B0001-009-alarmsTagged.csv',
 'CMS-B0001-010-alarmsTagged.csv',
 'CMS-B0001-011-alarmsTagged.csv',
 'CMS-B0001-012-alarmsTagged.csv',
 'CMS-B0001-013-alarmsTagged.csv',
 'CMS-B0001-014-alarmsTagged.csv',
 'CMS-B0001-015-alarmsTagged.csv',
 'CMS-B0001-016-alarmsTagged.csv',
 'CMS-B0001-017-alarmsTagged.csv',
 'CMS-B0001-018-alarmsTagged.csv',
 'CMS-B0001-019-alarmsTagged.csv',
 'CMS-B0001-020-alarmsTagged.csv',
 'CMS-B0001-021-alarmsTagged.csv',
 'CMS-B0001-022-alarmsTagged.csv',
 'CMS-B0001-023-alarmsTagged.csv',
 'CMS-B0001-024-alarmsTagged.csv',
 'CMS-B0001-025-alarmsTagged.csv',
 'CMS-B0001-026-alarmsTagged.csv',
 'CMS-B0001-027-alarmsTagged.csv',
 'CMS-B0001-028-alar

**Notes**
1. The 1st start-end pair should start earlier than than 2nd pair so that past values may be extracted
2. In general, use a smaller time window if the dataset too large for the computer's hardware resources to handle
3. Due to the size of the CMS data, a maximum of 7 days of the data could be used for each run
4. Some fields with high cardinality has been dropped for the CMS data in order to preserve memory and compute power
5. Do note that the current CMS data lacks non-alarm data from the stations, hence if such data is piped in, the data volume would be significantly higher and additional compromises have to be made

In [12]:
# Define Time Windows to File Data
#'2020-12-25' to '2021-01-01' to '2021-01-08'
#'2021-01-01' to '2021-01-08' to '2021-01-15'
#'2021-01-08' to '2021-01-15' to '2021-01-22'
#'2021-01-15' to '2021-01-22' to '2021-01-29'
#'2021-01-22' to '2021-01-29' to '2021-02-05'

windowStart1 = '2020-12-25' # buffer period start
windowStart2 = '2021-01-01' # main start
windowEnd1 = '2021-01-29' # main End
windowEnd2 = windowEnd1

# choose whether to load single file or by batch (environment) 
# if targeting by file index, use a format like fileList[n]
# else go by environment name, found in the file prefix e.g. "ATS", "CMS", "ECS", if you need to compile multiple files together
targetFile = "CMS"

# Choose to exclude Engineering Hours in Anomaly Detection step
# This is to avoid confusion the algorithms due to the volatile nature of engineering hours
# and to save memory
# Set to "True" as the default to exclude it
# Set to "False" to perform a separate run for engineering hours only (opposite)
ExcludeEngHours = False

if (ExcludeEngHours == False):
    EngHrFilter = True
    EngHrTag = "-EngHr"
elif (ExcludeEngHours == True):
    EngHrFilter = False
    EngHrTag = ""
    

if (targetFile == "ATS") | (targetFile == "CMS") | (targetFile == "ECS"):
    # Gather list of cleaned alarmList files
    dfs = [pd.read_csv(f)
            for f in os.listdir(os.getcwd()) if f.startswith(targetFile)]

    # Compile list of cleaned alarmList files into a single dataframe
    df = pd.concat(dfs, ignore_index=True).reset_index()

    # Delete redundant index col
    del df["index"], dfs

else:
    # Load Tagged Event Log File
    df = pd.read_csv(targetFile)

# Define File Parameters (For exporting outputs later)
SrcEnv = targetFile + "-"
BatchCode = "05-"
    
# Print df summary stats
print(df.shape)
print(df.info())

  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)
  dfs = [pd.read_csv(f)


(19604630, 38)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19604630 entries, 0 to 19604629
Data columns (total 38 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   ENTRY_CODE_SUFFIX         object 
 1   ENTRY_CODE                int64  
 2   ALARM_ID                  int64  
 3   USER_ID                   int64  
 4   EQUIPMENT_NAME            float64
 5   VALUE                     object 
 6   VALUE_STATE               int64  
 7   ACKNOWLEDGEMENT_REQUIRED  bool   
 8   SEVERITY                  int64  
 9   HIDDEN                    bool   
 10  THEME                     int64  
 11  EQUIPMENT_DATE            object 
 12  ACQUISITION_DATE          object 
 13  SCS_TIME                  object 
 14  FUNCTIONAL_CATEGORY       int64  
 15  GEOGRAPHICAL_CATEGORY     int64  
 16  ENVIRONMENT               object 
 17  USER1                     float64
 18  ASSET_ID_RAW              object 
 19  ASSET_DESCRIPTION         object 
 20  EVENT_D

In [13]:
# Inspect df
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,ENTRY_CODE,ALARM_ID,USER_ID,EQUIPMENT_NAME,VALUE,VALUE_STATE,ACKNOWLEDGEMENT_REQUIRED,SEVERITY,HIDDEN,THEME,EQUIPMENT_DATE,ACQUISITION_DATE,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,USER1,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass,DATETIME_SENT,DATETIME_RECEIVED,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3
0,+,-1168181814,12636,0,,1.0,1,True,4,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,4,OCCCMS,1.0,HBF_LENV_SMS_,STN SMS Server - Environment HBFSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False
1,+,-1168181804,12626,0,,1.0,1,True,4,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,17,OCCCMS,1.0,BGK_LENV_SMS_,STN SMS Server - Environment BGKSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False
2,+,-1168181791,12613,0,,1.0,1,True,4,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,OCC_LENV_CMS_,CMS SCS Server - Environment OCCCMS,Environment 1 Status,STOP,,CMS SCS Server - Environment CMS,Environment Status,,,,LENV,CMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False
3,+,-1168181819,12641,0,,1.0,1,True,4,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS05,OCC GWS 5,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False
4,+,-1168181817,12639,0,,1.0,1,True,4,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS03,OCC GWS 3,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False


### Filter Data By Funtional Category Clusters
This is vital for handling CMS data which needs to be fragmented into smaller chunks for processing as RAM is limited

In [14]:
# Filter Data
if (targetFile != "CMS"):
    pass

elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "Depot"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 1)].reset_index(drop = True)
    
elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "PowerSys"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 2) | 
                (df["FUNCTIONAL_CATEGORY"] == 3) | 
                (df["FUNCTIONAL_CATEGORY"] == 4) | 
                (df["FUNCTIONAL_CATEGORY"] == 7)           
               ].reset_index(drop = True)
    
elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "ITESS"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 5)].reset_index(drop = True)
    
elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "Lighting"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 6) | 
                (df["FUNCTIONAL_CATEGORY"] == 8)           
               ].reset_index(drop = True)

elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "StationEquipment"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 30)].reset_index(drop = True)

elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "StationFacilities"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 31) | 
                (df["FUNCTIONAL_CATEGORY"] == 33) | 
                (df["FUNCTIONAL_CATEGORY"] == 34)          
               ].reset_index(drop = True)
    
elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "CCTV"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 40)].reset_index(drop = True)

elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "FireProtection"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 32) | 
                (df["FUNCTIONAL_CATEGORY"] == 35)           
               ].reset_index(drop = True)
    
elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "PassengerInfo"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 42) | 
                (df["FUNCTIONAL_CATEGORY"] == 44)           
               ].reset_index(drop = True)
    
elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "Comms"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 46) | 
                (df["FUNCTIONAL_CATEGORY"] == 48)           
               ].reset_index(drop = True)
    
elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "SCS"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 51) | 
                (df["FUNCTIONAL_CATEGORY"] == 60) | 
                (df["FUNCTIONAL_CATEGORY"] == 70) | 
                (df["FUNCTIONAL_CATEGORY"] == 71) | 
                (df["FUNCTIONAL_CATEGORY"] == 72) | 
                (df["FUNCTIONAL_CATEGORY"] == 73) | 
                (df["FUNCTIONAL_CATEGORY"] == 74)      
               ].reset_index(drop = True)
    
elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "LENV"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 62) | 
                (df["FUNCTIONAL_CATEGORY"] == 63) | 
                (df["FUNCTIONAL_CATEGORY"] == 65) | 
                (df["FUNCTIONAL_CATEGORY"] == 66)           
               ].reset_index(drop = True)
    
elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "Others"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] != 1) & #01
                (df["FUNCTIONAL_CATEGORY"] != 2) & #02
                (df["FUNCTIONAL_CATEGORY"] != 3) & #03
                (df["FUNCTIONAL_CATEGORY"] != 4) & #04
                (df["FUNCTIONAL_CATEGORY"] != 5) & #05
                (df["FUNCTIONAL_CATEGORY"] != 6) & #06
                (df["FUNCTIONAL_CATEGORY"] != 7) & #07
                (df["FUNCTIONAL_CATEGORY"] != 8) & #08
                (df["FUNCTIONAL_CATEGORY"] != 30) & #09
                (df["FUNCTIONAL_CATEGORY"] != 31) & #10
                (df["FUNCTIONAL_CATEGORY"] != 32) & #11
                (df["FUNCTIONAL_CATEGORY"] != 33) & #12
                (df["FUNCTIONAL_CATEGORY"] != 34) & #13
                (df["FUNCTIONAL_CATEGORY"] != 35) & #14
                (df["FUNCTIONAL_CATEGORY"] != 40) & #15
                (df["FUNCTIONAL_CATEGORY"] != 42) & #16
                (df["FUNCTIONAL_CATEGORY"] != 44) & #17
                (df["FUNCTIONAL_CATEGORY"] != 46) & #18
                (df["FUNCTIONAL_CATEGORY"] != 48) & #19
                (df["FUNCTIONAL_CATEGORY"] != 51) & #20
                (df["FUNCTIONAL_CATEGORY"] != 60) & #21
                (df["FUNCTIONAL_CATEGORY"] != 62) & #22
                (df["FUNCTIONAL_CATEGORY"] != 63) & #23
                (df["FUNCTIONAL_CATEGORY"] != 65) & #24
                (df["FUNCTIONAL_CATEGORY"] != 66) & #25
                (df["FUNCTIONAL_CATEGORY"] != 70) & #26
                (df["FUNCTIONAL_CATEGORY"] != 71) & #27
                (df["FUNCTIONAL_CATEGORY"] != 72) & #28
                (df["FUNCTIONAL_CATEGORY"] != 73) & #29
                (df["FUNCTIONAL_CATEGORY"] != 74)   #30        
               ].reset_index(drop = True)

elif (targetFile == "CMS") and (FuncCatCluster[targetFuncCatCluster] == "AltRun"):
    df = df.loc[(df["FUNCTIONAL_CATEGORY"] == 1) | #01
                (df["FUNCTIONAL_CATEGORY"] == 2) | #02
                (df["FUNCTIONAL_CATEGORY"] == 3) | #03
                (df["FUNCTIONAL_CATEGORY"] == 4) | #04
                (df["FUNCTIONAL_CATEGORY"] == 5) | #05
                (df["FUNCTIONAL_CATEGORY"] == 6) | #06
                (df["FUNCTIONAL_CATEGORY"] == 7) | #07
                (df["FUNCTIONAL_CATEGORY"] == 8) | #08
                (df["FUNCTIONAL_CATEGORY"] == 30) | #09
                (df["FUNCTIONAL_CATEGORY"] == 31) | #10
                (df["FUNCTIONAL_CATEGORY"] == 32) | #11
                (df["FUNCTIONAL_CATEGORY"] == 33) | #12
                (df["FUNCTIONAL_CATEGORY"] == 34) | #13
                (df["FUNCTIONAL_CATEGORY"] == 35) | #14
                (df["FUNCTIONAL_CATEGORY"] == 46) | #18
                (df["FUNCTIONAL_CATEGORY"] == 48) | #19
                (df["FUNCTIONAL_CATEGORY"] == 51) | #20
                (df["FUNCTIONAL_CATEGORY"] == 60) | #21
                (df["FUNCTIONAL_CATEGORY"] == 62) | #22
                (df["FUNCTIONAL_CATEGORY"] == 63) | #23
                (df["FUNCTIONAL_CATEGORY"] == 65) | #24
                (df["FUNCTIONAL_CATEGORY"] == 66) | #25
                (df["FUNCTIONAL_CATEGORY"] == 70) | #26
                (df["FUNCTIONAL_CATEGORY"] == 71) | #27
                (df["FUNCTIONAL_CATEGORY"] == 72) | #28
                (df["FUNCTIONAL_CATEGORY"] == 73) | #29
                (df["FUNCTIONAL_CATEGORY"] == 74)   #30        
               ].reset_index(drop = True)

elif (targetFile == "CMS"):
    pass
    
# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2869843 entries, 0 to 2869842
Data columns (total 38 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   ENTRY_CODE_SUFFIX         object 
 1   ENTRY_CODE                int64  
 2   ALARM_ID                  int64  
 3   USER_ID                   int64  
 4   EQUIPMENT_NAME            float64
 5   VALUE                     object 
 6   VALUE_STATE               int64  
 7   ACKNOWLEDGEMENT_REQUIRED  bool   
 8   SEVERITY                  int64  
 9   HIDDEN                    bool   
 10  THEME                     int64  
 11  EQUIPMENT_DATE            object 
 12  ACQUISITION_DATE          object 
 13  SCS_TIME                  object 
 14  FUNCTIONAL_CATEGORY       int64  
 15  GEOGRAPHICAL_CATEGORY     int64  
 16  ENVIRONMENT               object 
 17  USER1                     float64
 18  ASSET_ID_RAW              object 
 19  ASSET_DESCRIPTION         object 
 20  EVENT_DESCRIPTION       

In [15]:
# Auto terminate script if resulting dataframe is empty
if (df.shape[0] == 0):
    sys.exit("Empty Dataframe")
else:
    pass

### Load Functional Category Lookup Table (Deprecated)

In [16]:
# Check directory location
#cwd

In [17]:
# Check directory location
#os.getcwd()

In [18]:
# Location of lookup table
#lookupTableLoc = '../../../Dashboard/ReferenceIndex.xlsx'

# Load Funcational Category lookup table
#funcCat_df = pd.read_excel(lookupTableLoc, sheet_name = "Sheet1")

# Inspect Data
#funcCat_df.head()

In [19]:
# Inspect Data
#funcCat_df.info()

In [20]:
# Rename Column
#funcCat_df.rename(columns = {'Functional Category':'FUNCTIONAL_CATEGORY_DESC', 
#                             'Code':'FUNCTIONAL_CATEGORY'}, inplace = True)

# Inspect Data
#funcCat_df.head()

### Tag Functional Category (Deprecated)

In [21]:
# Merge Data
#df = df.merge(funcCat_df, "left", left_on = "FUNCTIONAL_CATEGORY", right_on = "FUNCTIONAL_CATEGORY")

# Inspect Data
#df.info()

In [22]:
# Inspect df
#df.head()

### Load Geographical Category Look Up Table

In [23]:
# Check directory location
os.getcwd()

'C:\\Users\\schdadmin\\Documents\\IAMS Analytics\\alarm-event-logs\\taggedOutput\\Subset File Ver\\cms'

In [24]:
# Location of lookup table
lookupTableLoc = '../ReferenceIndex-Geo.csv'

# Load Funcational Category lookup table
geoCat_df = pd.read_csv(lookupTableLoc)

# Inspect Data
geoCat_df.head()

Unnamed: 0,SN,GeoCode,GeographicalCat,Sector
0,1,OCC,Operation Control Centre,10
1,2,SOCC,Standby OCC,11
2,3,NED,NEL Depot,12
3,4,HBF,Harbour Front station,1
4,5,OTP,Outram Park station,2


In [25]:
# Inspect Data
geoCat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   SN               22 non-null     int64 
 1   GeoCode          22 non-null     object
 2   GeographicalCat  22 non-null     object
 3   Sector           22 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 832.0+ bytes


In [26]:
# Rename Column
geoCat_df.rename(columns = {'SN':'GEOGRAPHICAL_CATEGORY', 'Sector':'GeoSector'}, inplace = True)

# Inspect Data
geoCat_df.head()

Unnamed: 0,GEOGRAPHICAL_CATEGORY,GeoCode,GeographicalCat,GeoSector
0,1,OCC,Operation Control Centre,10
1,2,SOCC,Standby OCC,11
2,3,NED,NEL Depot,12
3,4,HBF,Harbour Front station,1
4,5,OTP,Outram Park station,2


### Tag Geographical Information

In [27]:
# Merge Data
df = df.merge(geoCat_df, "left", left_on = "GEOGRAPHICAL_CATEGORY", right_on = "GEOGRAPHICAL_CATEGORY")

# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2869843 entries, 0 to 2869842
Data columns (total 41 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   ENTRY_CODE_SUFFIX         object 
 1   ENTRY_CODE                int64  
 2   ALARM_ID                  int64  
 3   USER_ID                   int64  
 4   EQUIPMENT_NAME            float64
 5   VALUE                     object 
 6   VALUE_STATE               int64  
 7   ACKNOWLEDGEMENT_REQUIRED  bool   
 8   SEVERITY                  int64  
 9   HIDDEN                    bool   
 10  THEME                     int64  
 11  EQUIPMENT_DATE            object 
 12  ACQUISITION_DATE          object 
 13  SCS_TIME                  object 
 14  FUNCTIONAL_CATEGORY       int64  
 15  GEOGRAPHICAL_CATEGORY     int64  
 16  ENVIRONMENT               object 
 17  USER1                     float64
 18  ASSET_ID_RAW              object 
 19  ASSET_DESCRIPTION         object 
 20  EVENT_DESCRIPTION       

### Load Severity Category Look Up table

In [28]:
# Location of lookup table
lookupTableLoc = '../ReferenceIndex-Severity.csv'

# Load Funcational Category lookup table
sevCat_df = pd.read_csv(lookupTableLoc)

# Inspect Data
sevCat_df

Unnamed: 0,Code,Type,Severity,Severity_Class,SeverityRank
0,1,Others,Others,Others,1
1,2,Maintenance,Low,Maintenance-Low,2
2,3,Operational,Low,Operational-Low,3
3,4,Maintenance,Urgent,Maintenance-Urgent,6
4,5,Operational,Urgent,Operational-Urgent,7
5,6,Maintenance,Critical,Maintenance-Critical,10
6,7,Operational,Critical,Operational-Critical,11


In [29]:
# Inspect Data
sevCat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Code            7 non-null      int64 
 1   Type            7 non-null      object
 2   Severity        7 non-null      object
 3   Severity_Class  7 non-null      object
 4   SeverityRank    7 non-null      int64 
dtypes: int64(2), object(3)
memory usage: 408.0+ bytes


### Tag Severity Category

In [30]:
# Merge Data
df = df.merge(sevCat_df, "left", left_on = "SEVERITY", right_on = "Code")
df = df.drop(columns = ["Code", "Type", "Severity", "SEVERITY"])

# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2869843 entries, 0 to 2869842
Data columns (total 42 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   ENTRY_CODE_SUFFIX         object 
 1   ENTRY_CODE                int64  
 2   ALARM_ID                  int64  
 3   USER_ID                   int64  
 4   EQUIPMENT_NAME            float64
 5   VALUE                     object 
 6   VALUE_STATE               int64  
 7   ACKNOWLEDGEMENT_REQUIRED  bool   
 8   HIDDEN                    bool   
 9   THEME                     int64  
 10  EQUIPMENT_DATE            object 
 11  ACQUISITION_DATE          object 
 12  SCS_TIME                  object 
 13  FUNCTIONAL_CATEGORY       int64  
 14  GEOGRAPHICAL_CATEGORY     int64  
 15  ENVIRONMENT               object 
 16  USER1                     float64
 17  ASSET_ID_RAW              object 
 18  ASSET_DESCRIPTION         object 
 19  EVENT_DESCRIPTION         object 
 20  EVENT_STATUS            

In [31]:
# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2869843 entries, 0 to 2869842
Data columns (total 42 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   ENTRY_CODE_SUFFIX         object 
 1   ENTRY_CODE                int64  
 2   ALARM_ID                  int64  
 3   USER_ID                   int64  
 4   EQUIPMENT_NAME            float64
 5   VALUE                     object 
 6   VALUE_STATE               int64  
 7   ACKNOWLEDGEMENT_REQUIRED  bool   
 8   HIDDEN                    bool   
 9   THEME                     int64  
 10  EQUIPMENT_DATE            object 
 11  ACQUISITION_DATE          object 
 12  SCS_TIME                  object 
 13  FUNCTIONAL_CATEGORY       int64  
 14  GEOGRAPHICAL_CATEGORY     int64  
 15  ENVIRONMENT               object 
 16  USER1                     float64
 17  ASSET_ID_RAW              object 
 18  ASSET_DESCRIPTION         object 
 19  EVENT_DESCRIPTION         object 
 20  EVENT_STATUS            

In [32]:
# Inspect Data
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,ENTRY_CODE,ALARM_ID,USER_ID,EQUIPMENT_NAME,VALUE,VALUE_STATE,ACKNOWLEDGEMENT_REQUIRED,HIDDEN,THEME,EQUIPMENT_DATE,ACQUISITION_DATE,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,USER1,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass,DATETIME_SENT,DATETIME_RECEIVED,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank
0,+,-1168181814,12636,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,4,OCCCMS,1.0,HBF_LENV_SMS_,STN SMS Server - Environment HBFSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,HBF,Harbour Front station,1,Maintenance-Urgent,6
1,+,-1168181804,12626,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,17,OCCCMS,1.0,BGK_LENV_SMS_,STN SMS Server - Environment BGKSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,BGK,Buangkok station,6,Maintenance-Urgent,6
2,+,-1168181791,12613,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,OCC_LENV_CMS_,CMS SCS Server - Environment OCCCMS,Environment 1 Status,STOP,,CMS SCS Server - Environment CMS,Environment Status,,,,LENV,CMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6
3,+,-1168181819,12641,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS05,OCC GWS 5,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6
4,+,-1168181817,12639,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS03,OCC GWS 3,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6


### Load ISCS Crash Combo Look Up Table

In [33]:
# Location of lookup table
lookupTableLoc2 = '../ISCS Crash Notes.csv'

# Load Funcational Category lookup table
iscsCrash_df = pd.read_csv(lookupTableLoc2)

# Inspect Data
iscsCrash_df.head()

Unnamed: 0,Fault Combo,CrashWarning
0,ascv normal unit status-out of service,True
1,ascv reserve unit status-out of service,True
2,ascv status table validity-invalid,True
3,atc status-failure,True
4,cbi monitor table-invalid,True


In [34]:
# Inspect Data
iscsCrash_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Fault Combo   85 non-null     object
dtypes: bool(1), object(1)
memory usage: 893.0+ bytes


### Tag ISCS Crash Combo

In [35]:
# Create Fault Combo
df['Fault Combo'] = df['EVENT_DESC_CAT'].str.lower() + "-" + df['EVENT_STATUS'].str.lower()

# Inspect Data
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,ENTRY_CODE,ALARM_ID,USER_ID,EQUIPMENT_NAME,VALUE,VALUE_STATE,ACKNOWLEDGEMENT_REQUIRED,HIDDEN,THEME,EQUIPMENT_DATE,ACQUISITION_DATE,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,USER1,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass,DATETIME_SENT,DATETIME_RECEIVED,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,Fault Combo
0,+,-1168181814,12636,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,4,OCCCMS,1.0,HBF_LENV_SMS_,STN SMS Server - Environment HBFSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,HBF,Harbour Front station,1,Maintenance-Urgent,6,environment status-stop
1,+,-1168181804,12626,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,17,OCCCMS,1.0,BGK_LENV_SMS_,STN SMS Server - Environment BGKSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,BGK,Buangkok station,6,Maintenance-Urgent,6,environment status-stop
2,+,-1168181791,12613,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,OCC_LENV_CMS_,CMS SCS Server - Environment OCCCMS,Environment 1 Status,STOP,,CMS SCS Server - Environment CMS,Environment Status,,,,LENV,CMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,environment status-stop
3,+,-1168181819,12641,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS05,OCC GWS 5,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,environment status-stopped
4,+,-1168181817,12639,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS03,OCC GWS 3,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,environment status-stopped


In [36]:
# Merge Data
df = df.merge(iscsCrash_df, "left", left_on = "Fault Combo", right_on = "Fault Combo")
df = df.replace({"CrashWarning": {np.nan: False}})
df = df.drop(columns = ["Fault Combo"])

# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2869843 entries, 0 to 2869842
Data columns (total 43 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   ENTRY_CODE_SUFFIX         object 
 1   ENTRY_CODE                int64  
 2   ALARM_ID                  int64  
 3   USER_ID                   int64  
 4   EQUIPMENT_NAME            float64
 5   VALUE                     object 
 6   VALUE_STATE               int64  
 7   ACKNOWLEDGEMENT_REQUIRED  bool   
 8   HIDDEN                    bool   
 9   THEME                     int64  
 10  EQUIPMENT_DATE            object 
 11  ACQUISITION_DATE          object 
 12  SCS_TIME                  object 
 13  FUNCTIONAL_CATEGORY       int64  
 14  GEOGRAPHICAL_CATEGORY     int64  
 15  ENVIRONMENT               object 
 16  USER1                     float64
 17  ASSET_ID_RAW              object 
 18  ASSET_DESCRIPTION         object 
 19  EVENT_DESCRIPTION         object 
 20  EVENT_STATUS            

In [37]:
# Inspect Data
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,ENTRY_CODE,ALARM_ID,USER_ID,EQUIPMENT_NAME,VALUE,VALUE_STATE,ACKNOWLEDGEMENT_REQUIRED,HIDDEN,THEME,EQUIPMENT_DATE,ACQUISITION_DATE,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,USER1,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass,DATETIME_SENT,DATETIME_RECEIVED,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning
0,+,-1168181814,12636,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,4,OCCCMS,1.0,HBF_LENV_SMS_,STN SMS Server - Environment HBFSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,HBF,Harbour Front station,1,Maintenance-Urgent,6,True
1,+,-1168181804,12626,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,17,OCCCMS,1.0,BGK_LENV_SMS_,STN SMS Server - Environment BGKSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,BGK,Buangkok station,6,Maintenance-Urgent,6,True
2,+,-1168181791,12613,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,OCC_LENV_CMS_,CMS SCS Server - Environment OCCCMS,Environment 1 Status,STOP,,CMS SCS Server - Environment CMS,Environment Status,,,,LENV,CMS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,True
3,+,-1168181819,12641,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS05,OCC GWS 5,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,True
4,+,-1168181817,12639,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS03,OCC GWS 3,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501000000,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,True


### Tag Time

In [38]:
# Format time values to time format
df['DATETIME_SENT'] = pd.to_datetime(df['DATETIME_SENT'])
df['DATETIME_RECEIVED'] = pd.to_datetime(df['DATETIME_RECEIVED'])
df['EQUIPMENT_DATE'] = pd.to_datetime(df['EQUIPMENT_DATE'])
df['ACQUISITION_DATE'] = pd.to_datetime(df['ACQUISITION_DATE'])
df['SCS_TIME'] = pd.to_datetime(df['SCS_TIME'])
df['TIME_CODE'] = pd.to_datetime(df['TIME_CODE'])

# Print df summary stats
print(df.shape)
print(df.info())

(2869843, 43)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2869843 entries, 0 to 2869842
Data columns (total 43 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   ENTRY_CODE_SUFFIX         object        
 1   ENTRY_CODE                int64         
 2   ALARM_ID                  int64         
 3   USER_ID                   int64         
 4   EQUIPMENT_NAME            float64       
 5   VALUE                     object        
 6   VALUE_STATE               int64         
 7   ACKNOWLEDGEMENT_REQUIRED  bool          
 8   HIDDEN                    bool          
 9   THEME                     int64         
 10  EQUIPMENT_DATE            datetime64[ns]
 11  ACQUISITION_DATE          datetime64[ns]
 12  SCS_TIME                  datetime64[ns]
 13  FUNCTIONAL_CATEGORY       int64         
 14  GEOGRAPHICAL_CATEGORY     int64         
 15  ENVIRONMENT               object        
 16  USER1                     float64       

In [39]:
# Inspect df
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,ENTRY_CODE,ALARM_ID,USER_ID,EQUIPMENT_NAME,VALUE,VALUE_STATE,ACKNOWLEDGEMENT_REQUIRED,HIDDEN,THEME,EQUIPMENT_DATE,ACQUISITION_DATE,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,USER1,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass,DATETIME_SENT,DATETIME_RECEIVED,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning
0,+,-1168181814,12636,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,4,OCCCMS,1.0,HBF_LENV_SMS_,STN SMS Server - Environment HBFSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501458944,False,False,False,False,False,HBF,Harbour Front station,1,Maintenance-Urgent,6,True
1,+,-1168181804,12626,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,17,OCCCMS,1.0,BGK_LENV_SMS_,STN SMS Server - Environment BGKSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501458944,False,False,False,False,False,BGK,Buangkok station,6,Maintenance-Urgent,6,True
2,+,-1168181791,12613,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,OCC_LENV_CMS_,CMS SCS Server - Environment OCCCMS,Environment 1 Status,STOP,,CMS SCS Server - Environment CMS,Environment Status,,,,LENV,CMS,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,True
3,+,-1168181819,12641,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS05,OCC GWS 5,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,True
4,+,-1168181817,12639,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS03,OCC GWS 3,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,True


In [40]:
# Get date from timestamp
df["Date"] = df["SCS_TIME"].dt.date

# Get day of week from date
# 0 is for Monday & 6 is for Sunday
df["DayofWeek"] = df["SCS_TIME"].dt.dayofweek

# Tag whether date falls on a weekend or not
# public holidays and school holidays would not be tagged as there are too few to make a good baseline
df["Weekend"] = (df["DayofWeek"] > 4)

# Tag hour of day
df["HourofDay"] = df["SCS_TIME"].dt.hour

# Tag hour of day
df["MinuteofDay"] = df["SCS_TIME"].dt.minute

# Inspect df
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,ENTRY_CODE,ALARM_ID,USER_ID,EQUIPMENT_NAME,VALUE,VALUE_STATE,ACKNOWLEDGEMENT_REQUIRED,HIDDEN,THEME,EQUIPMENT_DATE,ACQUISITION_DATE,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,USER1,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass,DATETIME_SENT,DATETIME_RECEIVED,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,MinuteofDay
0,+,-1168181814,12636,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,4,OCCCMS,1.0,HBF_LENV_SMS_,STN SMS Server - Environment HBFSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501458944,False,False,False,False,False,HBF,Harbour Front station,1,Maintenance-Urgent,6,True,2020-12-23,2,False,1,13
1,+,-1168181804,12626,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,17,OCCCMS,1.0,BGK_LENV_SMS_,STN SMS Server - Environment BGKSMS,Environment 2 Status,STOP,,STN SMS Server - Environment SUBLOCATIONSMS,Environment Status,,,,LENV,SMS,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501458944,False,False,False,False,False,BGK,Buangkok station,6,Maintenance-Urgent,6,True,2020-12-23,2,False,1,13
2,+,-1168181791,12613,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,OCC_LENV_CMS_,CMS SCS Server - Environment OCCCMS,Environment 1 Status,STOP,,CMS SCS Server - Environment CMS,Environment Status,,,,LENV,CMS,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,True,2020-12-23,2,False,1,13
3,+,-1168181819,12641,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS05,OCC GWS 5,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,True,2020-12-23,2,False,1,13
4,+,-1168181817,12639,0,,1.0,1,True,True,0,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,2020-12-23 01:13:54.605474048,60,1,OCCCMS,1.0,SCS/NED/1212/GWS03,OCC GWS 3,Environment Status,STOPPED,,GWS,Environment Status,,,,SCS,GWS,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501,2020-12-23 01:13:55.501458944,False,False,False,False,False,OCC,Operation Control Centre,10,Maintenance-Urgent,6,True,2020-12-23,2,False,1,13


In [41]:
# Define Time Filter Parameters for Engineering Hours
start_time_hour = 0
start_time_minute = 30
end_time_hour = 4
end_time_minute = 30

# Tag Time if it's within Engineering Hours
df["EngHours"] = False
df.loc[((df["HourofDay"] >= start_time_hour) & (df["HourofDay"] < end_time_hour)), "EngHours"] = True
df.loc[(df["HourofDay"] == start_time_hour) & (df["MinuteofDay"] < start_time_minute), "EngHours"] = False
df.loc[(df["HourofDay"] == end_time_hour) & (df["MinuteofDay"] > end_time_minute), "EngHours"] = False

# Delete redundant variable
del df["MinuteofDay"]

# Subset / Filter Data by Date Period to better manage the data volume
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")
df = df[(df['Date'] >= windowStart2) & (df['Date'] < windowEnd2)].reset_index()
del df["index"]


# Subset / Filter Data for Revenue Hours Only
df = df[(df['EngHours'] == EngHrFilter)].reset_index()
del df["index"]

# Inspect df
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,ENTRY_CODE,ALARM_ID,USER_ID,EQUIPMENT_NAME,VALUE,VALUE_STATE,ACKNOWLEDGEMENT_REQUIRED,HIDDEN,THEME,EQUIPMENT_DATE,ACQUISITION_DATE,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,USER1,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass,DATETIME_SENT,DATETIME_RECEIVED,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours
0,+,-1173403598,3223210,0,,1,0,True,True,0,2021-01-01 00:30:03.941113856,2021-01-01 00:30:03.941113856,2021-01-01 00:30:03.941113856,48,11,OCCCMS,1.0,COM/BNK/B1/PABX01,PABX,Fan 1 Status,NORMAL,,PABX,Fan Status,,,,COM,PABX,2021-01-01 00:30:04.503,2021-01-01 00:30:04.503,2021-01-01 00:30:04.503241984,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True
1,+,-1173403603,3209898,0,,0,1,True,True,0,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,51,17,OCCCMS,1.0,SCS/BGK/B1/PLC01,BGK ISCS PLC 1,Mux Selection 01,FAULT,,ISCS PLC,Mux Selection,,,,SCS,PLC,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504389888,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
2,+,-1173403602,3209898,0,,1,0,True,True,0,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,51,17,OCCCMS,1.0,SCS/BGK/B1/PLC01,BGK ISCS PLC 1,Mux Selection 01,NORMAL,,ISCS PLC,Mux Selection,,,,SCS,PLC,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504389888,False,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
3,+,-1173403604,3223214,0,,0,0,True,True,0,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,48,11,OCCCMS,1.0,COM/BNK/B1/PABX01,PABX,Fan 1 Status,FAILURE,,PABX,Fan Status,,,,COM,PABX,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504389888,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True
4,+,-1173403638,3223224,0,,1,0,True,True,0,2021-01-01 00:30:07.483118080,2021-01-01 00:30:07.483118080,2021-01-01 00:30:07.483118080,48,11,OCCCMS,1.0,COM/BNK/B1/PABX01,PABX,Fan 1 Status,NORMAL,,PABX,Fan Status,,,,COM,PABX,2021-01-01 00:30:07.503,2021-01-01 00:30:07.503,2021-01-01 00:30:07.503307008,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True


### Set Data Types Accordingly

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 48 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   ENTRY_CODE                312160 non-null  int64         
 2   ALARM_ID                  312160 non-null  int64         
 3   USER_ID                   312160 non-null  int64         
 4   EQUIPMENT_NAME            0 non-null       float64       
 5   VALUE                     312160 non-null  object        
 6   VALUE_STATE               312160 non-null  int64         
 7   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 8   HIDDEN                    312160 non-null  bool          
 9   THEME                     312160 non-null  int64         
 10  EQUIPMENT_DATE            312160 non-null  datetime64[ns]
 11  ACQUISITION_DATE          312160 non-null  datetime64[ns]
 12  SC

In [43]:
# Inspect df
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,ENTRY_CODE,ALARM_ID,USER_ID,EQUIPMENT_NAME,VALUE,VALUE_STATE,ACKNOWLEDGEMENT_REQUIRED,HIDDEN,THEME,EQUIPMENT_DATE,ACQUISITION_DATE,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,USER1,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass,DATETIME_SENT,DATETIME_RECEIVED,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours
0,+,-1173403598,3223210,0,,1,0,True,True,0,2021-01-01 00:30:03.941113856,2021-01-01 00:30:03.941113856,2021-01-01 00:30:03.941113856,48,11,OCCCMS,1.0,COM/BNK/B1/PABX01,PABX,Fan 1 Status,NORMAL,,PABX,Fan Status,,,,COM,PABX,2021-01-01 00:30:04.503,2021-01-01 00:30:04.503,2021-01-01 00:30:04.503241984,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True
1,+,-1173403603,3209898,0,,0,1,True,True,0,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,51,17,OCCCMS,1.0,SCS/BGK/B1/PLC01,BGK ISCS PLC 1,Mux Selection 01,FAULT,,ISCS PLC,Mux Selection,,,,SCS,PLC,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504389888,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
2,+,-1173403602,3209898,0,,1,0,True,True,0,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,51,17,OCCCMS,1.0,SCS/BGK/B1/PLC01,BGK ISCS PLC 1,Mux Selection 01,NORMAL,,ISCS PLC,Mux Selection,,,,SCS,PLC,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504389888,False,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
3,+,-1173403604,3223214,0,,0,0,True,True,0,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,48,11,OCCCMS,1.0,COM/BNK/B1/PABX01,PABX,Fan 1 Status,FAILURE,,PABX,Fan Status,,,,COM,PABX,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504389888,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True
4,+,-1173403638,3223224,0,,1,0,True,True,0,2021-01-01 00:30:07.483118080,2021-01-01 00:30:07.483118080,2021-01-01 00:30:07.483118080,48,11,OCCCMS,1.0,COM/BNK/B1/PABX01,PABX,Fan 1 Status,NORMAL,,PABX,Fan Status,,,,COM,PABX,2021-01-01 00:30:07.503,2021-01-01 00:30:07.503,2021-01-01 00:30:07.503307008,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True


In [44]:
convert_dict = {'ALARM_ID': 'str',
                'USER_ID': 'str',
                'VALUE': 'str',
                'VALUE_STATE': 'category',
                'ACKNOWLEDGEMENT_REQUIRED': 'bool',
                'GeoSector': 'category',
                'Severity_Class': 'category',
                'HIDDEN': 'bool',
                'THEME': 'category',
                'FUNCTIONAL_CATEGORY': 'category',
                'GEOGRAPHICAL_CATEGORY': 'category',
                'ASSET_ID_RAW': 'str',
                'ASSET_DESCRIPTION': 'str',
                'EVENT_DESCRIPTION': 'str',
                'EVENT_STATUS': 'category',
                'OPERATOR_INITIALS': 'str',
                'ASSET_DESC_CAT': 'category',
                'EVENT_DESC_CAT': 'category',
                'TrainID': 'str',
                'CarID': 'str',
                'ServiceID': 'str',
                'AssetClass': 'category',
                'AssetSubClass': 'category'
               }


df = df.astype(convert_dict)

# Delete redundant variables
del convert_dict

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 48 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   ENTRY_CODE                312160 non-null  int64         
 2   ALARM_ID                  312160 non-null  object        
 3   USER_ID                   312160 non-null  object        
 4   EQUIPMENT_NAME            0 non-null       float64       
 5   VALUE                     312160 non-null  object        
 6   VALUE_STATE               312160 non-null  category      
 7   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 8   HIDDEN                    312160 non-null  bool          
 9   THEME                     312160 non-null  category      
 10  EQUIPMENT_DATE            312160 non-null  datetime64[ns]
 11  ACQUISITION_DATE          312160 non-null  datetime64[ns]
 12  SC

In [45]:
# Inspect df
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,ENTRY_CODE,ALARM_ID,USER_ID,EQUIPMENT_NAME,VALUE,VALUE_STATE,ACKNOWLEDGEMENT_REQUIRED,HIDDEN,THEME,EQUIPMENT_DATE,ACQUISITION_DATE,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,USER1,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass,DATETIME_SENT,DATETIME_RECEIVED,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours
0,+,-1173403598,3223210,0,,1,0,True,True,0,2021-01-01 00:30:03.941113856,2021-01-01 00:30:03.941113856,2021-01-01 00:30:03.941113856,48,11,OCCCMS,1.0,COM/BNK/B1/PABX01,PABX,Fan 1 Status,NORMAL,,PABX,Fan Status,,,,COM,PABX,2021-01-01 00:30:04.503,2021-01-01 00:30:04.503,2021-01-01 00:30:04.503241984,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True
1,+,-1173403603,3209898,0,,0,1,True,True,0,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,51,17,OCCCMS,1.0,SCS/BGK/B1/PLC01,BGK ISCS PLC 1,Mux Selection 01,FAULT,,ISCS PLC,Mux Selection,,,,SCS,PLC,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504389888,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
2,+,-1173403602,3209898,0,,1,0,True,True,0,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,51,17,OCCCMS,1.0,SCS/BGK/B1/PLC01,BGK ISCS PLC 1,Mux Selection 01,NORMAL,,ISCS PLC,Mux Selection,,,,SCS,PLC,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504389888,False,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
3,+,-1173403604,3223214,0,,0,0,True,True,0,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,2021-01-01 00:30:04.953128192,48,11,OCCCMS,1.0,COM/BNK/B1/PABX01,PABX,Fan 1 Status,FAILURE,,PABX,Fan Status,,,,COM,PABX,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504,2021-01-01 00:30:05.504389888,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True
4,+,-1173403638,3223224,0,,1,0,True,True,0,2021-01-01 00:30:07.483118080,2021-01-01 00:30:07.483118080,2021-01-01 00:30:07.483118080,48,11,OCCCMS,1.0,COM/BNK/B1/PABX01,PABX,Fan 1 Status,NORMAL,,PABX,Fan Status,,,,COM,PABX,2021-01-01 00:30:07.503,2021-01-01 00:30:07.503,2021-01-01 00:30:07.503307008,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True


### Junk Redundant Fields

In [46]:
# Drop redundant variables
df = df.drop(columns = [
                        "ENTRY_CODE",
                        "ALARM_ID",
                        "USER_ID",
                        "VALUE",
                        "VALUE_STATE",
                        "HIDDEN",
                        "THEME",
                        "EQUIPMENT_DATE",
                        "ACQUISITION_DATE",
                        "ASSET_DESCRIPTION",
                        "EVENT_DESCRIPTION",
                        "OPERATOR_INITIALS",
                        "TrainID",
                        "CarID",
                        "ServiceID",
                        "DATETIME_SENT",
                        "DATETIME_RECEIVED",
                        "USER1"
                        ])

# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 30 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   EQUIPMENT_NAME            0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 3   SCS_TIME                  312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY       312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY     312160 non-null  category      
 6   ENVIRONMENT               312160 non-null  object        
 7   ASSET_ID_RAW              312160 non-null  object        
 8   EVENT_STATUS              312157 non-null  category      
 9   ASSET_DESC_CAT            311610 non-null  category      
 10  EVENT_DESC_CAT            312160 non-null  category      
 11  AssetClass                312160 non-null  category      
 12  As

## Clean Up Event Description Data
There is still significant variation due to sublocation, timetable name, train direction, tunnel direction and operator ID information embedded in the EVENT_DESC_CAT. Hence, additional cleaning is performed to supplement what was already done during pre-processing.

In [47]:
# Toggle switch to clean data
# Set to false to skip this if this has been handled in the preprocessing script
addCleaningReq = True

In [48]:
if (addCleaningReq == True):
    # Handling of common cases
    # Track and Train Direction
    try:
        df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.replace("S/B ", "", regex = False)
    except:
        pass
    try:
        df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.replace("N/B ", "", regex = False)
    except:
        pass
    try:
        df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.replace(" S ", "", regex = False)
    except:
        pass
    try:
        df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.replace(" N ", "", regex = False)
    except:
        pass
    try:
        df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.replace(" N-N ", "", regex = False)
    except:
        pass
    try:
        df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.replace(" S-S ", "", regex = False)
    except:
        pass
    # SUBLOCATION
    try:
        df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.replace("SUBLOCATION", "", regex = False)
    except:
        pass
    try:
        df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.replace("SUBLOCATIONN", "", regex = False)
    except:
        pass
    try:
        df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.replace("SUBLOCATIONS", "", regex = False)
    except:
        pass

    # Remove repeated spaces & trailing spaces
    df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.replace("/s+", "/s", regex = True).str.strip()

else:
    pass



In [49]:
if (addCleaningReq == True):
    # Handling of exception case
    # Automatic Hold Applied
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Automatic Hold applied to TrainCar stalled", regex = False)), "EVENT_DESC_CAT"] = "Automatic Hold applied to TrainCar stalled"
    except:
        pass
    # Control Hand Over
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Hand Over for ECS - Environmental Control System", regex = False)), "EVENT_DESC_CAT"] = "Control Hand Over for ECS - Environmental Control System"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Hand Over for ECS - Smoke Extraction System", regex = False)), "EVENT_DESC_CAT"] = "Control Hand Over for ECS - Smoke Extraction System"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Hand Over for ECS - Tunnel Ventilation System", regex = False)), "EVENT_DESC_CAT"] = "Control Hand Over for ECS - Tunnel Ventilation System"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Hand Over for SIG - Control Train ATC", regex = False)), "EVENT_DESC_CAT"] = "Control Hand Over for SIG - Control Train ATC"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Hand Over for SIG - Platform Equipment", regex = False)), "EVENT_DESC_CAT"] = "Control Hand Over for SIG - Platform Equipment"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Hand Over for SIG - Track Side Equipment", regex = False)), "EVENT_DESC_CAT"] = "Control Hand Over for SIG - Track Side Equipment"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Hand Over for TrainBorne CCTV", regex = False)), "EVENT_DESC_CAT"] = "Control Hand Over for TrainBorne CCTV"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Hand Over for TrainBorne PA", regex = False)), "EVENT_DESC_CAT"] = "Control Hand Over for TrainBorne PA"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Hand Over for TrainBorne PEC", regex = False)), "EVENT_DESC_CAT"] = "Control Hand Over for TrainBorne PEC"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Hand Over for TrainBorne PIS/VPIS", regex = False)), "EVENT_DESC_CAT"] = "Control Hand Over for TrainBorne PIS/VPIS"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Take Over for All Functions", regex = False)), "EVENT_DESC_CAT"] = "Control Take Over for All Functions"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Control Take Over for PIS - Passenger Information", regex = False)), "EVENT_DESC_CAT"] = "Control Take Over for PIS - Passenger Information"
    except:
        pass
    # Operator Calls
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("accepts a PEC call", regex = False)), "EVENT_DESC_CAT"] = "OPERATOR accepts a PEC call"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("terminates all PEC calls", regex = False)), "EVENT_DESC_CAT"] = "OPERATOR terminates all PEC calls"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("terminates PEC call", regex = False)), "EVENT_DESC_CAT"] = "OPERATOR terminates PEC call"
    except:
        pass
    # Free all paths to Stations
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Free all paths for Station", regex = False)), "EVENT_DESC_CAT"] = "Free all paths for Station"
    except:
        pass
    # Gama status request
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Gama Status Request For an Atc", regex = False)), "EVENT_DESC_CAT"] = "Gama Status Request For an Atc"
    except:
        pass
    # NelVisu Password Change
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("change password on NelVisu", regex = False)), "EVENT_DESC_CAT"] = "OPERATOR change password on NelVisu"
    except:
        pass
    # Track-Side Atc Status Request
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Track-Side Atc Status Request for An Atc", regex = False)), "EVENT_DESC_CAT"] = "Track-Side Atc Status Request for An Atc"
    except:
        pass
    # Train Location
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Train found at", regex = False)) &
               (df["EVENT_DESC_CAT"].str.contains("instead of Train", regex = False)), "EVENT_DESC_CAT"] = "Train found at SUBLOCATION instead of Train"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Train still not a Man RTL", regex = False)) &
               (df["EVENT_DESC_CAT"].str.contains("origin after wait period", regex = False)), "EVENT_DESC_CAT"] = "Train still not a Man RTL origin after wait period"
    except:
        pass
    # Timetable
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Timetable", regex = False)) &
               (df["EVENT_DESC_CAT"].str.contains("download", regex = False)), "EVENT_DESC_CAT"] = "Timetable download"
    except:
        pass
    try:
        df.loc[(df["EVENT_DESC_CAT"].str.contains("Timetable", regex = False)) &
               (df["EVENT_DESC_CAT"].str.contains("successfully autoloaded", regex = False)), "EVENT_DESC_CAT"] = "Timetable successfully autoloaded"
    except:
        pass
    
else:
    pass




In [50]:
# Inspect data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 30 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   EQUIPMENT_NAME            0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 3   SCS_TIME                  312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY       312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY     312160 non-null  category      
 6   ENVIRONMENT               312160 non-null  object        
 7   ASSET_ID_RAW              312160 non-null  object        
 8   EVENT_STATUS              312157 non-null  category      
 9   ASSET_DESC_CAT            311610 non-null  category      
 10  EVENT_DESC_CAT            312160 non-null  object        
 11  AssetClass                312160 non-null  category      
 12  As

In [51]:
# Inspect data
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours
0,+,,True,2021-01-01 00:30:03.941113856,48,11,OCCCMS,COM/BNK/B1/PABX01,NORMAL,PABX,Fan Status,COM,PABX,2021-01-01 00:30:04.503241984,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True
1,+,,True,2021-01-01 00:30:04.953128192,51,17,OCCCMS,SCS/BGK/B1/PLC01,FAULT,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:05.504389888,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
2,+,,True,2021-01-01 00:30:04.953128192,51,17,OCCCMS,SCS/BGK/B1/PLC01,NORMAL,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:05.504389888,False,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
3,+,,True,2021-01-01 00:30:04.953128192,48,11,OCCCMS,COM/BNK/B1/PABX01,FAILURE,PABX,Fan Status,COM,PABX,2021-01-01 00:30:05.504389888,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True
4,+,,True,2021-01-01 00:30:07.483118080,48,11,OCCCMS,COM/BNK/B1/PABX01,NORMAL,PABX,Fan Status,COM,PABX,2021-01-01 00:30:07.503307008,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True


## Baseline Comparison

### Baseline Comparison
The general idea is to get a baseline of the number of events by the following categories only, as some events appears too infrequently to get a proper baseline. Median was used as there were significant variation in event counts during the sample period from late Dec 2020 to 1 Feb 2021.

#### Create Basic Baseline

"ENVIRONMENT", "FUNCTIONAL_CATEGORY", "DayofWeek", "HourofDay"

In [52]:
# Get event count by "Date", "ENVIRONMENT", "FUNCTIONAL_CATEGORY", "DayofWeek", "HourofDay"
# Note Entry Code Suffix is only used as a proxy for counting the number of events
eventCountSummary = df[["Date", 
                        "ENVIRONMENT", 
                        "FUNCTIONAL_CATEGORY", 
                        "DayofWeek", 
                        "HourofDay", 
                        "ENTRY_CODE_SUFFIX"]].groupby(["Date", 
                                                       "ENVIRONMENT", 
                                                       "FUNCTIONAL_CATEGORY", 
                                                       "DayofWeek", 
                                                       "HourofDay"]).count()


# Inspect Data
eventCountSummary.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,ENTRY_CODE_SUFFIX
Date,ENVIRONMENT,FUNCTIONAL_CATEGORY,DayofWeek,HourofDay,Unnamed: 5_level_1
2021-01-01,OCCCMS,1,0,0,0
2021-01-01,OCCCMS,1,0,1,0
2021-01-01,OCCCMS,1,0,2,0
2021-01-01,OCCCMS,1,0,3,0
2021-01-01,OCCCMS,1,1,0,0
2021-01-01,OCCCMS,1,1,1,0
2021-01-01,OCCCMS,1,1,2,0
2021-01-01,OCCCMS,1,1,3,0
2021-01-01,OCCCMS,1,2,0,0
2021-01-01,OCCCMS,1,2,1,0


In [53]:
# Get threshold hour of day
# It will not matter whether it is at any time of the week or time of day
# This will be the generic baseline of scenarios where it's a stable flat line pattern
# The mean is used as the threshold level should be a stable flat line in theory
ECThreshold_Flat = eventCountSummary.groupby(["ENVIRONMENT", 
                                              "FUNCTIONAL_CATEGORY"]).quantile(0.9)


# Inspect Data
ECThreshold_Flat.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,ENTRY_CODE_SUFFIX
ENVIRONMENT,FUNCTIONAL_CATEGORY,Unnamed: 2_level_1
OCCCMS,1,0.0
OCCCMS,2,12.4
OCCCMS,3,0.0
OCCCMS,4,0.0
OCCCMS,5,0.0
OCCCMS,6,33.7
OCCCMS,7,0.0
OCCCMS,8,0.0
OCCCMS,30,0.0
OCCCMS,31,0.0


#### Create Baseline for Seasonal Effects (24/7 Cycle)

"ENVIRONMENT", "FUNCTIONAL_CATEGORY", "AssetClass", "DayofWeek", "HourofDay"

In [54]:
# Initialise Filters
AssetClassList = ["", "ROUS", "ROUT", "EMU", "ECS", "FPS", "E", "CIV", "COM", "SCS"]
FCatList = [10, 11, 12, 17, 19, 32, 33, 34, 40, 42, 44, 48, 51]

# Fiter Data
df_temp = df.loc[(df["AssetClass"].isin(AssetClassList)) & 
                 (df["FUNCTIONAL_CATEGORY"].isin(FCatList))].reset_index().drop(columns=["index"])

# Inspect Data
df_temp

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours
0,+,,True,2021-01-01 00:30:03.941113856,48,11,OCCCMS,COM/BNK/B1/PABX01,NORMAL,PABX,Fan Status,COM,PABX,2021-01-01 00:30:04.503241984,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True
1,+,,True,2021-01-01 00:30:04.953128192,51,17,OCCCMS,SCS/BGK/B1/PLC01,FAULT,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:05.504389888,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
2,+,,True,2021-01-01 00:30:04.953128192,51,17,OCCCMS,SCS/BGK/B1/PLC01,NORMAL,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:05.504389888,False,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
3,+,,True,2021-01-01 00:30:04.953128192,48,11,OCCCMS,COM/BNK/B1/PABX01,FAILURE,PABX,Fan Status,COM,PABX,2021-01-01 00:30:05.504389888,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True
4,+,,True,2021-01-01 00:30:07.483118080,48,11,OCCCMS,COM/BNK/B1/PABX01,NORMAL,PABX,Fan Status,COM,PABX,2021-01-01 00:30:07.503307008,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275761,+,,True,2021-01-28 03:59:56.239026944,48,11,OCCCMS,COM/BNK/B1/PABX01,NORMAL,PABX,Fan Status,COM,PABX,2021-01-28 03:59:56.257340928,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-28,3,False,3,True
275762,+,,True,2021-01-28 03:59:56.491053056,51,17,OCCCMS,SCS/BGK/B1/PLC01,NORMAL,SUBLOCATION ISCS PLC,Mux Selection,SCS,PLC,2021-01-28 03:59:57.258107904,False,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-28,3,False,3,True
275763,+,,True,2021-01-28 03:59:56.995066880,51,17,OCCCMS,SCS/BGK/B1/PLC01,FAULT,SUBLOCATION ISCS PLC,Mux Selection,SCS,PLC,2021-01-28 03:59:57.258107904,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-28,3,False,3,True
275764,+,,True,2021-01-28 03:59:58.506998016,51,17,OCCCMS,SCS/BGK/B1/PLC03,NORMAL,SUBLOCATION ISCS PLC,Mux Selection,SCS,PLC,2021-01-28 03:59:59.257446144,True,True,True,False,False,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-28,3,False,3,True


In [55]:
# Get event count by "Date", "ENVIRONMENT", "FUNCTIONAL_CATEGORY", "DayofWeek", "HourofDay"
# Note Entry Code Suffix is only used as a proxy for counting the number of events
eventCountSummary = df_temp[["Date", 
                             "ENVIRONMENT", 
                             "FUNCTIONAL_CATEGORY", 
                             "AssetClass", 
                             "DayofWeek", 
                             "HourofDay", 
                             "ENTRY_CODE_SUFFIX"]].groupby(["Date", 
                                                            "ENVIRONMENT", 
                                                            "FUNCTIONAL_CATEGORY", 
                                                            "AssetClass", 
                                                            "DayofWeek", 
                                                            "HourofDay"]).count()

# Inspect Data
eventCountSummary.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,ENTRY_CODE_SUFFIX
Date,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,DayofWeek,HourofDay,Unnamed: 6_level_1
2021-01-01,OCCCMS,1,CIV,0,0,0
2021-01-01,OCCCMS,1,CIV,0,1,0
2021-01-01,OCCCMS,1,CIV,0,2,0
2021-01-01,OCCCMS,1,CIV,0,3,0
2021-01-01,OCCCMS,1,CIV,1,0,0
2021-01-01,OCCCMS,1,CIV,1,1,0
2021-01-01,OCCCMS,1,CIV,1,2,0
2021-01-01,OCCCMS,1,CIV,1,3,0
2021-01-01,OCCCMS,1,CIV,2,0,0
2021-01-01,OCCCMS,1,CIV,2,1,0


In [56]:
# 90th Percentile value for the hour within a particular day of week
# This will be the generic baseline of scenarios where there's a 24 hour seasonal pattern and a 7 day Seasonal Pattern
# The 90th percentile is set as the threshold to capture more extreme outliers
# The median is not discerning enough
ECThreshold_Seasonal = eventCountSummary.groupby(["ENVIRONMENT", 
                                                  "FUNCTIONAL_CATEGORY", 
                                                  "AssetClass", 
                                                  "DayofWeek", 
                                                  "HourofDay"]).quantile(0.9)

# Inspect Data
ECThreshold_Seasonal.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,ENTRY_CODE_SUFFIX
ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,DayofWeek,HourofDay,Unnamed: 5_level_1
OCCCMS,1,CIV,0,0,0.0
OCCCMS,1,CIV,0,1,0.0
OCCCMS,1,CIV,0,2,0.0
OCCCMS,1,CIV,0,3,0.0
OCCCMS,1,CIV,1,0,0.0
OCCCMS,1,CIV,1,1,0.0
OCCCMS,1,CIV,1,2,0.0
OCCCMS,1,CIV,1,3,0.0
OCCCMS,1,CIV,2,0,0.0
OCCCMS,1,CIV,2,1,0.0


#### Create Alarm Baseline for Seasonal Effects (24/7 Cycle)

"ENVIRONMENT", "FUNCTIONAL_CATEGORY", "AssetClass", "isAlarm", "DayofWeek", "HourofDay"

In [57]:
# Initialise Filters
AssetClassList2 = ["EMU", "ECS", "FPS", "E", "COM", "SCS"]
FCatList2 = [12, 17, 32, 33, 44, 51]

# Fiter Data
# Fiter Data
df_temp = df.loc[(df["AssetClass"].isin(AssetClassList2)) & 
                 (df["FUNCTIONAL_CATEGORY"].isin(FCatList2)) &
                 (df["isAlarm"] == True)].reset_index().drop(columns=["index"])

# Inspect Data
df_temp

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours
0,+,,True,2021-01-01 00:30:04.953128192,51,17,OCCCMS,SCS/BGK/B1/PLC01,FAULT,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:05.504389888,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
1,+,,True,2021-01-01 00:30:25.637425920,51,17,OCCCMS,SCS/BGK/B1/PLC01,NORMAL,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:26.503803904,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
2,+,,True,2021-01-01 00:30:26.649394944,51,17,OCCCMS,SCS/BGK/B1/PLC01,FAULT,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:27.508146944,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
3,+,,True,2021-01-01 00:30:38.263488000,51,17,OCCCMS,SCS/BGK/B1/PLC01,FAULT,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:38.502909952,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
4,+,,True,2021-01-01 00:30:40.279556096,51,17,OCCCMS,SCS/BGK/B1/PLC01,NORMAL,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:40.502872064,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78137,+,,True,2021-01-28 03:59:53.970972928,51,17,OCCCMS,SCS/BGK/B1/PLC01,NORMAL,SUBLOCATION ISCS PLC,Mux Selection,SCS,PLC,2021-01-28 03:59:54.257379072,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-28,3,False,3,True
78138,+,,True,2021-01-28 03:59:54.475037184,51,17,OCCCMS,SCS/BGK/B1/PLC01,FAULT,SUBLOCATION ISCS PLC,Mux Selection,SCS,PLC,2021-01-28 03:59:55.255389952,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-28,3,False,3,True
78139,+,,True,2021-01-28 03:59:56.995066880,51,17,OCCCMS,SCS/BGK/B1/PLC01,FAULT,SUBLOCATION ISCS PLC,Mux Selection,SCS,PLC,2021-01-28 03:59:57.258107904,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-28,3,False,3,True
78140,+,,True,2021-01-28 03:59:58.506998016,51,17,OCCCMS,SCS/BGK/B1/PLC03,NORMAL,SUBLOCATION ISCS PLC,Mux Selection,SCS,PLC,2021-01-28 03:59:59.257446144,True,True,True,False,False,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-28,3,False,3,True


In [58]:
# Get event count by "Date", "ENVIRONMENT", "FUNCTIONAL_CATEGORY", "DayofWeek", "HourofDay"
# Note Entry Code Suffix is only used as a proxy for counting the number of events
eventCountSummary = df_temp[["Date", 
                             "ENVIRONMENT", 
                             "FUNCTIONAL_CATEGORY", 
                             "AssetClass", 
                             "isAlarm", 
                             "DayofWeek", 
                             "HourofDay", 
                             "ENTRY_CODE_SUFFIX"]].groupby(["Date", 
                                                            "ENVIRONMENT", 
                                                            "FUNCTIONAL_CATEGORY", 
                                                            "AssetClass", 
                                                            "isAlarm", 
                                                            "DayofWeek", 
                                                            "HourofDay"]).count()

# Inspect Data
eventCountSummary.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,ENTRY_CODE_SUFFIX
Date,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,isAlarm,DayofWeek,HourofDay,Unnamed: 7_level_1
2021-01-01,OCCCMS,1,CIV,True,0,0,0
2021-01-01,OCCCMS,1,CIV,True,0,1,0
2021-01-01,OCCCMS,1,CIV,True,0,2,0
2021-01-01,OCCCMS,1,CIV,True,0,3,0
2021-01-01,OCCCMS,1,CIV,True,1,0,0
2021-01-01,OCCCMS,1,CIV,True,1,1,0
2021-01-01,OCCCMS,1,CIV,True,1,2,0
2021-01-01,OCCCMS,1,CIV,True,1,3,0
2021-01-01,OCCCMS,1,CIV,True,2,0,0
2021-01-01,OCCCMS,1,CIV,True,2,1,0


In [59]:
# 90th Percentile value for the hour within a particular day of week
# This will be the generic baseline of scenarios where there's a 24 hour seasonal pattern and a 7 day Seasonal Pattern
# The 90th percentile is set as the threshold to capture more extreme outliers
# The median is not discerning enough
ECThreshold_SeasonalAlarm = eventCountSummary.groupby(["ENVIRONMENT", 
                                                       "FUNCTIONAL_CATEGORY", 
                                                       "AssetClass", 
                                                       "isAlarm", 
                                                       "DayofWeek", 
                                                       "HourofDay"]).quantile(0.9)

# Inspect Data
ECThreshold_SeasonalAlarm.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,ENTRY_CODE_SUFFIX
ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,isAlarm,DayofWeek,HourofDay,Unnamed: 6_level_1
OCCCMS,1,CIV,True,0,0,0.0
OCCCMS,1,CIV,True,0,1,0.0
OCCCMS,1,CIV,True,0,2,0.0
OCCCMS,1,CIV,True,0,3,0.0
OCCCMS,1,CIV,True,1,0,0.0
OCCCMS,1,CIV,True,1,1,0.0
OCCCMS,1,CIV,True,1,2,0.0
OCCCMS,1,CIV,True,1,3,0.0
OCCCMS,1,CIV,True,2,0,0.0
OCCCMS,1,CIV,True,2,1,0.0


#### Clean Up Baseline Figures

In [60]:
# Flatten baselines summaries to make it easier to lookup against later
ECThreshold_Flat = ECThreshold_Flat.reset_index()
ECThreshold_Seasonal = ECThreshold_Seasonal.reset_index()
ECThreshold_SeasonalAlarm = ECThreshold_SeasonalAlarm.reset_index()

# Rename Summary Column for Ease of Lookup later
ECThreshold_Flat.rename(columns = {'ENTRY_CODE_SUFFIX':'ECThreshold_Flat'}, inplace = True)
ECThreshold_Seasonal.rename(columns = {'ENTRY_CODE_SUFFIX':'ECThreshold_Seasonal'}, inplace = True)
ECThreshold_SeasonalAlarm.rename(columns = {'ENTRY_CODE_SUFFIX':'ECThreshold_SeasonalAlarm'}, inplace = True)

In [61]:
# Inspect data
ECThreshold_Flat.head(5)

Unnamed: 0,ENVIRONMENT,FUNCTIONAL_CATEGORY,ECThreshold_Flat
0,OCCCMS,1,0.0
1,OCCCMS,2,12.4
2,OCCCMS,3,0.0
3,OCCCMS,4,0.0
4,OCCCMS,5,0.0


In [62]:
# Inspect data
ECThreshold_Seasonal.head()

Unnamed: 0,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,DayofWeek,HourofDay,ECThreshold_Seasonal
0,OCCCMS,1,CIV,0,0,0.0
1,OCCCMS,1,CIV,0,1,0.0
2,OCCCMS,1,CIV,0,2,0.0
3,OCCCMS,1,CIV,0,3,0.0
4,OCCCMS,1,CIV,1,0,0.0


In [63]:
# Inspect data
ECThreshold_SeasonalAlarm.head()

Unnamed: 0,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,isAlarm,DayofWeek,HourofDay,ECThreshold_SeasonalAlarm
0,OCCCMS,1,CIV,True,0,0,0.0
1,OCCCMS,1,CIV,True,0,1,0.0
2,OCCCMS,1,CIV,True,0,2,0.0
3,OCCCMS,1,CIV,True,0,3,0.0
4,OCCCMS,1,CIV,True,1,0,0.0


In [64]:
# Remove redundant permutations
ECThreshold_Seasonal = ECThreshold_Seasonal[(ECThreshold_Seasonal["AssetClass"].isin(AssetClassList)) & 
                                            (ECThreshold_Seasonal["FUNCTIONAL_CATEGORY"].isin(FCatList))]
ECThreshold_Seasonal = ECThreshold_Seasonal.reset_index().drop(columns=["index"])

# Inspect data
ECThreshold_Seasonal.head()

Unnamed: 0,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,DayofWeek,HourofDay,ECThreshold_Seasonal
0,OCCCMS,32,CIV,0,0,0.0
1,OCCCMS,32,CIV,0,1,0.0
2,OCCCMS,32,CIV,0,2,0.0
3,OCCCMS,32,CIV,0,3,0.0
4,OCCCMS,32,CIV,1,0,0.0


In [65]:
# Remove redundant permutations
ECThreshold_SeasonalAlarm = ECThreshold_SeasonalAlarm[(ECThreshold_SeasonalAlarm["AssetClass"].isin(AssetClassList2)) & 
                                                      (ECThreshold_SeasonalAlarm["FUNCTIONAL_CATEGORY"].isin(FCatList2) &
                                                      (ECThreshold_SeasonalAlarm["isAlarm"] == True))]
ECThreshold_SeasonalAlarm = ECThreshold_SeasonalAlarm.reset_index().drop(columns=["index", "isAlarm"])

# Inspect data
ECThreshold_SeasonalAlarm.head()

Unnamed: 0,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,DayofWeek,HourofDay,ECThreshold_SeasonalAlarm
0,OCCCMS,32,COM,0,0,0.0
1,OCCCMS,32,COM,0,1,0.0
2,OCCCMS,32,COM,0,2,0.0
3,OCCCMS,32,COM,0,3,0.0
4,OCCCMS,32,COM,1,0,0.0


In [66]:
# Inspect data
ECThreshold_Flat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   ENVIRONMENT          25 non-null     object  
 1   FUNCTIONAL_CATEGORY  25 non-null     category
 2   ECThreshold_Flat     25 non-null     float64 
dtypes: category(1), float64(1), object(1)
memory usage: 1.3+ KB


In [67]:
# Inspect data
ECThreshold_Seasonal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   ENVIRONMENT           700 non-null    object  
 1   FUNCTIONAL_CATEGORY   700 non-null    category
 2   AssetClass            700 non-null    category
 3   DayofWeek             700 non-null    int64   
 4   HourofDay             700 non-null    int64   
 5   ECThreshold_Seasonal  700 non-null    float64 
dtypes: category(2), float64(1), int64(2), object(1)
memory usage: 24.8+ KB


In [68]:
# Inspect data
ECThreshold_SeasonalAlarm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336 entries, 0 to 335
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   ENVIRONMENT                336 non-null    object  
 1   FUNCTIONAL_CATEGORY        336 non-null    category
 2   AssetClass                 336 non-null    category
 3   DayofWeek                  336 non-null    int64   
 4   HourofDay                  336 non-null    int64   
 5   ECThreshold_SeasonalAlarm  336 non-null    float64 
dtypes: category(2), float64(1), int64(2), object(1)
memory usage: 12.7+ KB


#### Merge Baseline Figures Into Main Dataframe

In [69]:
# Import Baseline Values from Lookup tables
df = df.merge(ECThreshold_Flat, on =["ENVIRONMENT", 
                                     "FUNCTIONAL_CATEGORY"], how='left')
df = df.merge(ECThreshold_Seasonal, on =["ENVIRONMENT", 
                                         "FUNCTIONAL_CATEGORY", 
                                         "AssetClass", 
                                         "DayofWeek", 
                                         "HourofDay"], how='left')
df = df.merge(ECThreshold_SeasonalAlarm, on =["ENVIRONMENT", 
                                              "FUNCTIONAL_CATEGORY", 
                                              "AssetClass", 
                                              "DayofWeek", 
                                              "HourofDay"], how='left')

# Delete redundant variables
del df_temp, eventCountSummary

# Inspect data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312160 entries, 0 to 312159
Data columns (total 33 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   ENTRY_CODE_SUFFIX          312160 non-null  object        
 1   EQUIPMENT_NAME             0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED   312160 non-null  bool          
 3   SCS_TIME                   312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY        312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY      312160 non-null  category      
 6   ENVIRONMENT                312160 non-null  object        
 7   ASSET_ID_RAW               312160 non-null  object        
 8   EVENT_STATUS               312157 non-null  category      
 9   ASSET_DESC_CAT             311610 non-null  category      
 10  EVENT_DESC_CAT             312160 non-null  object        
 11  AssetClass                 312160 non-null  category

### Export Computed Baselines as CSV

In [70]:
# Check current directory
cwd

'C:\\Users\\schdadmin\\Documents\\IAMS Analytics\\alarm-event-logs'

In [71]:
# Define Save Location
saveLoc = '\\taggedOutput\\'
#os.chdir(cwd + alarmLoc)
os.chdir(cwd + saveLoc)
# Check directory location
print(os.getcwd())

C:\Users\schdadmin\Documents\IAMS Analytics\alarm-event-logs\taggedOutput


In [72]:
# Export Baselines as CSV for archival
#eventCountSummary.to_csv("BaselineResults/" + SrcEnv + BatchCode + "eventCountSummary" + ".csv",  index=False)
ECThreshold_Flat.to_csv("BaselineResults/" + SrcEnv + BatchCode + "ECThreshold_Flat" + "-" + FuncCatCluster[targetFuncCatCluster] + ".csv",  index=False)
ECThreshold_Seasonal.to_csv("BaselineResults/" + SrcEnv + BatchCode + "ECThreshold_Seasonal" + "-" + FuncCatCluster[targetFuncCatCluster] + ".csv", index=False)
ECThreshold_SeasonalAlarm.to_csv("BaselineResults/" + SrcEnv + BatchCode + "ECThreshold_SeasonalAlarm" + "-" + FuncCatCluster[targetFuncCatCluster] + ".csv",  index=False)
print("Files Saved")

# Delete redundant variables
del ECThreshold_Flat, ECThreshold_Seasonal, ECThreshold_SeasonalAlarm

Files Saved


### Perform Rolling Event Count

In [73]:
# Inspect data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312160 entries, 0 to 312159
Data columns (total 33 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   ENTRY_CODE_SUFFIX          312160 non-null  object        
 1   EQUIPMENT_NAME             0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED   312160 non-null  bool          
 3   SCS_TIME                   312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY        312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY      312160 non-null  category      
 6   ENVIRONMENT                312160 non-null  object        
 7   ASSET_ID_RAW               312160 non-null  object        
 8   EVENT_STATUS               312157 non-null  category      
 9   ASSET_DESC_CAT             311610 non-null  category      
 10  EVENT_DESC_CAT             312160 non-null  object        
 11  AssetClass                 312160 non-null  category

In [74]:
# Inspect data
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,ECThreshold_Flat,ECThreshold_Seasonal,ECThreshold_SeasonalAlarm
0,+,,True,2021-01-01 00:30:03.941113856,48,11,OCCCMS,COM/BNK/B1/PABX01,NORMAL,PABX,Fan Status,COM,PABX,2021-01-01 00:30:04.503241984,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True,1176.9,778.7,
1,+,,True,2021-01-01 00:30:04.953128192,51,17,OCCCMS,SCS/BGK/B1/PLC01,FAULT,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:05.504389888,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True,970.8,483.2,351.9
2,+,,True,2021-01-01 00:30:04.953128192,51,17,OCCCMS,SCS/BGK/B1/PLC01,NORMAL,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:05.504389888,False,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True,970.8,483.2,351.9
3,+,,True,2021-01-01 00:30:04.953128192,48,11,OCCCMS,COM/BNK/B1/PABX01,FAILURE,PABX,Fan Status,COM,PABX,2021-01-01 00:30:05.504389888,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True,1176.9,778.7,
4,+,,True,2021-01-01 00:30:07.483118080,48,11,OCCCMS,COM/BNK/B1/PABX01,NORMAL,PABX,Fan Status,COM,PABX,2021-01-01 00:30:07.503307008,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True,1176.9,778.7,


In [75]:
# Reindex Data by Time
df_rollingWindow = df.copy()
df_rollingWindow = df_rollingWindow[["SCS_TIME", 
                                     "ENVIRONMENT", 
                                     "FUNCTIONAL_CATEGORY",
                                     "AssetClass",
                                     "isAlarm",
                                     "ENTRY_CODE_SUFFIX"]]

# Create copy of time sequence field (SCS_TIME) as a proto index column
#df_rollingWindow["datetimeIndex"] = df_rollingWindow["SCS_TIME"].copy()

# Set Date-Time to be the index
#df_rollingWindow = df_rollingWindow.set_index(df_rollingWindow["SCS_TIME"])

# Drop proto index column
#del df_rollingWindow["SCS_TIME"]


# Inspect data sample
df_rollingWindow.head()

Unnamed: 0,SCS_TIME,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,isAlarm,ENTRY_CODE_SUFFIX
0,2021-01-01 00:30:03.941113856,OCCCMS,48,COM,False,+
1,2021-01-01 00:30:04.953128192,OCCCMS,51,SCS,True,+
2,2021-01-01 00:30:04.953128192,OCCCMS,51,SCS,False,+
3,2021-01-01 00:30:04.953128192,OCCCMS,48,COM,False,+
4,2021-01-01 00:30:07.483118080,OCCCMS,48,COM,False,+


In [76]:
# Rename Summary Column for Ease of Lookup later
df_rollingWindow.rename(columns = {'ENTRY_CODE_SUFFIX':'Counter'}, inplace = True)

# Inspect data sample
df_rollingWindow.head()

Unnamed: 0,SCS_TIME,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,isAlarm,Counter
0,2021-01-01 00:30:03.941113856,OCCCMS,48,COM,False,+
1,2021-01-01 00:30:04.953128192,OCCCMS,51,SCS,True,+
2,2021-01-01 00:30:04.953128192,OCCCMS,51,SCS,False,+
3,2021-01-01 00:30:04.953128192,OCCCMS,48,COM,False,+
4,2021-01-01 00:30:07.483118080,OCCCMS,48,COM,False,+


In [77]:
# Inspect data sample
df_rollingWindow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312160 entries, 0 to 312159
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   SCS_TIME             312160 non-null  datetime64[ns]
 1   ENVIRONMENT          312160 non-null  object        
 2   FUNCTIONAL_CATEGORY  312160 non-null  category      
 3   AssetClass           312160 non-null  category      
 4   isAlarm              312160 non-null  bool          
 5   Counter              312160 non-null  object        
dtypes: bool(1), category(2), datetime64[ns](1), object(2)
memory usage: 10.4+ MB


#### Rolling Window Count - Flat (ENVIRONMENT + FUNCTIONAL_CATEGORY)

In [78]:
# Set Date-Time to be the index
df_rollingWindow_flat = df_rollingWindow.set_index(df_rollingWindow["SCS_TIME"])

# Drop proto index column & redundant cols
del df_rollingWindow_flat["SCS_TIME"], df_rollingWindow_flat["AssetClass"], df_rollingWindow_flat["isAlarm"]

# Compute Rolling Window Count
df_rollingWindow_flat = df_rollingWindow_flat.groupby(["ENVIRONMENT", 
                                                       "FUNCTIONAL_CATEGORY"]).rolling('3600s', min_periods=1).count()
#flatten every level of MultiIndex 
df_rollingWindow_flat.reset_index(inplace=True, level = ["SCS_TIME","ENVIRONMENT", "FUNCTIONAL_CATEGORY"])


# Inspect data
df_rollingWindow_flat.head()

Unnamed: 0,ENVIRONMENT,FUNCTIONAL_CATEGORY,SCS_TIME,Counter
0,OCCCMS,1,2021-01-01 01:28:58.632349952,1.0
1,OCCCMS,1,2021-01-01 01:29:04.116326144,2.0
2,OCCCMS,1,2021-01-01 01:30:20.889762048,3.0
3,OCCCMS,1,2021-01-01 01:30:32.305819904,4.0
4,OCCCMS,1,2021-01-01 01:30:43.809310976,5.0


In [79]:
# Drop Duplicates, whilst keeping the last value
df_rollingWindow_flat = df_rollingWindow_flat.drop_duplicates(subset = ["ENVIRONMENT", 
                                                                        "FUNCTIONAL_CATEGORY", 
                                                                        "SCS_TIME"], ignore_index = True, keep = 'last')

# Inspect Data
df_rollingWindow_flat.head()

Unnamed: 0,ENVIRONMENT,FUNCTIONAL_CATEGORY,SCS_TIME,Counter
0,OCCCMS,1,2021-01-01 01:28:58.632349952,1.0
1,OCCCMS,1,2021-01-01 01:29:04.116326144,2.0
2,OCCCMS,1,2021-01-01 01:30:20.889762048,3.0
3,OCCCMS,1,2021-01-01 01:30:32.305819904,4.0
4,OCCCMS,1,2021-01-01 01:30:43.809310976,5.0


In [80]:
# Inspect Data
df_rollingWindow_flat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267628 entries, 0 to 267627
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ENVIRONMENT          267628 non-null  object        
 1   FUNCTIONAL_CATEGORY  267628 non-null  category      
 2   SCS_TIME             267628 non-null  datetime64[ns]
 3   Counter              267628 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(1), object(1)
memory usage: 6.4+ MB


In [81]:
# Compare Baseline Values with Rolling Window Values

# Rename Columns
df_rollingWindow_flat.rename(columns = {'Counter':'RWEC_Flat'}, inplace = True)

# Merge Data
df = df.merge(df_rollingWindow_flat, "left", on =["ENVIRONMENT", 
                                                  "FUNCTIONAL_CATEGORY", 
                                                  "SCS_TIME"])
# Delete redundant variables
del df_rollingWindow_flat

# Compute the Ratio Between Actual Rolling Window and Baseline
# Compute Log10 of the Ratio Between Actual Rolling Window and Baseline (deprecated)
df["RWEC_Ratio_Flat"] = (df["RWEC_Flat"] / df["ECThreshold_Flat"])

# Drop Redundant Columns
df = df.drop(columns = [
                        "ECThreshold_Flat",
                        "RWEC_Flat"
                        ])

# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312160 entries, 0 to 312159
Data columns (total 33 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   ENTRY_CODE_SUFFIX          312160 non-null  object        
 1   EQUIPMENT_NAME             0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED   312160 non-null  bool          
 3   SCS_TIME                   312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY        312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY      312160 non-null  category      
 6   ENVIRONMENT                312160 non-null  object        
 7   ASSET_ID_RAW               312160 non-null  object        
 8   EVENT_STATUS               312157 non-null  category      
 9   ASSET_DESC_CAT             311610 non-null  category      
 10  EVENT_DESC_CAT             312160 non-null  object        
 11  AssetClass                 312160 non-null  category

#### Rolling Window Count - Seasonal (ENVIRONMENT + FUNCTIONAL_CATEGORY + AssetClass)

In [82]:
# Filter Data
#df_rollingWindow_Seasonal
df_rollingWindow_seasonal = df_rollingWindow.loc[(df["AssetClass"].isin(AssetClassList)) & 
                                                 (df_rollingWindow["FUNCTIONAL_CATEGORY"].isin(FCatList))].reset_index().drop(columns=["index"])

del df_rollingWindow_seasonal["isAlarm"]

# Set Date-Time to be the index
df_rollingWindow_seasonal = df_rollingWindow_seasonal.set_index(df_rollingWindow_seasonal["SCS_TIME"])

# Drop proto index column
del df_rollingWindow_seasonal["SCS_TIME"]

# Inspect Data
df_rollingWindow_seasonal.head()

Unnamed: 0_level_0,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,Counter
SCS_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01 00:30:03.941113856,OCCCMS,48,COM,+
2021-01-01 00:30:04.953128192,OCCCMS,51,SCS,+
2021-01-01 00:30:04.953128192,OCCCMS,51,SCS,+
2021-01-01 00:30:04.953128192,OCCCMS,48,COM,+
2021-01-01 00:30:07.483118080,OCCCMS,48,COM,+


In [83]:
# Compute Rolling Window Count
df_rollingWindow_seasonal = df_rollingWindow_seasonal.groupby(["ENVIRONMENT", 
                                                               "FUNCTIONAL_CATEGORY", 
                                                               "AssetClass"]).rolling('3600s', min_periods=1).count()

# Reset Index for ease of look up
df_rollingWindow_seasonal.reset_index(inplace = True, level = ["SCS_TIME",
                                                               "ENVIRONMENT", 
                                                               "FUNCTIONAL_CATEGORY", 
                                                               "AssetClass"])

# Inspect data
df_rollingWindow_seasonal.head()

Unnamed: 0,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,SCS_TIME,Counter
0,OCCCMS,32,FPS,2021-01-22 01:14:06.545007872,1.0
1,OCCCMS,32,FPS,2021-01-22 01:14:32.787326976,2.0
2,OCCCMS,32,FPS,2021-01-22 03:02:28.375931904,1.0
3,OCCCMS,32,FPS,2021-01-22 03:02:36.946053120,2.0
4,OCCCMS,33,E,2021-01-01 01:15:06.395309824,1.0


In [84]:
# Drop Duplicates, whilst keeping the last value
df_rollingWindow_seasonal = df_rollingWindow_seasonal.drop_duplicates(subset = ["ENVIRONMENT", 
                                                                                "FUNCTIONAL_CATEGORY", 
                                                                                "AssetClass", 
                                                                                "SCS_TIME"], ignore_index = True, keep = 'last')

# Inspect Data
df_rollingWindow_seasonal.head()

Unnamed: 0,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,SCS_TIME,Counter
0,OCCCMS,32,FPS,2021-01-22 01:14:06.545007872,1.0
1,OCCCMS,32,FPS,2021-01-22 01:14:32.787326976,2.0
2,OCCCMS,32,FPS,2021-01-22 03:02:28.375931904,1.0
3,OCCCMS,32,FPS,2021-01-22 03:02:36.946053120,2.0
4,OCCCMS,33,E,2021-01-01 01:15:06.395309824,1.0


In [85]:
# Inspect Data
df_rollingWindow_seasonal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239976 entries, 0 to 239975
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ENVIRONMENT          239976 non-null  object        
 1   FUNCTIONAL_CATEGORY  239976 non-null  category      
 2   AssetClass           239976 non-null  category      
 3   SCS_TIME             239976 non-null  datetime64[ns]
 4   Counter              239976 non-null  float64       
dtypes: category(2), datetime64[ns](1), float64(1), object(1)
memory usage: 6.0+ MB


In [86]:
# Compare Baseline Values with Rolling Window Values

# Rename Columns
df_rollingWindow_seasonal.rename(columns = {'Counter':'RWEC_Seasonal'}, inplace = True)

# Merge Data
df = df.merge(df_rollingWindow_seasonal, "left", on =["ENVIRONMENT", 
                                                      "FUNCTIONAL_CATEGORY", 
                                                      "AssetClass", 
                                                      "SCS_TIME"])
# Delete redundant variables
del df_rollingWindow_seasonal

# Compute the Ratio Between Actual Rolling Window and Baseline
# Compute Log10 of the Ratio Between Actual Rolling Window and Baseline (deprecated)
df["RWEC_Ratio_Seasonal"] = (df["RWEC_Seasonal"] / df["ECThreshold_Seasonal"])

# Drop Redundant Columns
df = df.drop(columns = [
                        "ECThreshold_Seasonal",
                        "RWEC_Seasonal"
                        ])

# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312160 entries, 0 to 312159
Data columns (total 33 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   ENTRY_CODE_SUFFIX          312160 non-null  object        
 1   EQUIPMENT_NAME             0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED   312160 non-null  bool          
 3   SCS_TIME                   312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY        312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY      312160 non-null  category      
 6   ENVIRONMENT                312160 non-null  object        
 7   ASSET_ID_RAW               312160 non-null  object        
 8   EVENT_STATUS               312157 non-null  category      
 9   ASSET_DESC_CAT             311610 non-null  category      
 10  EVENT_DESC_CAT             312160 non-null  object        
 11  AssetClass                 312160 non-null  category

#### Rolling Window Count - SeasonalAlarm (ENVIRONMENT + FUNCTIONAL_CATEGORY + AssetClass + isAlarm)

In [87]:
# Filter Data
#df_rollingWindow_Seasonal
df_rollingWindow_seasonalAlarm = df_rollingWindow.loc[(df["AssetClass"].isin(AssetClassList2)) & 
                                                      (df_rollingWindow["FUNCTIONAL_CATEGORY"].isin(FCatList2)) &
                                                      (df_rollingWindow["isAlarm"] == True)
                                                     ].reset_index().drop(columns=["index"])

# Set Date-Time to be the index
df_rollingWindow_seasonalAlarm = df_rollingWindow_seasonalAlarm.set_index(df_rollingWindow_seasonalAlarm["SCS_TIME"])

# Drop proto index column
del df_rollingWindow_seasonalAlarm["SCS_TIME"]

# Inspect Data
df_rollingWindow_seasonalAlarm.head()

Unnamed: 0_level_0,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,isAlarm,Counter
SCS_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-01 00:30:04.953128192,OCCCMS,51,SCS,True,+
2021-01-01 00:30:25.637425920,OCCCMS,51,SCS,True,+
2021-01-01 00:30:26.649394944,OCCCMS,51,SCS,True,+
2021-01-01 00:30:38.263488000,OCCCMS,51,SCS,True,+
2021-01-01 00:30:40.279556096,OCCCMS,51,SCS,True,+


In [88]:
# Compute Rolling Window Count
df_rollingWindow_seasonalAlarm = df_rollingWindow_seasonalAlarm.groupby(["ENVIRONMENT", 
                                                                         "FUNCTIONAL_CATEGORY", 
                                                                         "AssetClass", 
                                                                         "isAlarm"]).rolling('3600s', min_periods=1).count()

# Drop redundant variables
df_rollingWindow_seasonalAlarm.reset_index(inplace = True, level = ["SCS_TIME",
                                                                    "ENVIRONMENT", 
                                                                    "FUNCTIONAL_CATEGORY", 
                                                                    "AssetClass", 
                                                                    "isAlarm"])

# Inspect data
df_rollingWindow_seasonalAlarm.head()

Unnamed: 0,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,isAlarm,SCS_TIME,Counter
0,OCCCMS,32,FPS,True,2021-01-22 01:14:06.545007872,1.0
1,OCCCMS,32,FPS,True,2021-01-22 03:02:28.375931904,1.0
2,OCCCMS,33,E,True,2021-01-01 01:15:06.395309824,1.0
3,OCCCMS,33,E,True,2021-01-01 01:15:13.601731072,2.0
4,OCCCMS,33,E,True,2021-01-01 01:23:39.180551936,3.0


In [89]:
# Drop Duplicates, whilst keeping the last value
df_rollingWindow_seasonalAlarm = df_rollingWindow_seasonalAlarm.drop_duplicates(subset = ["ENVIRONMENT", 
                                                                                          "FUNCTIONAL_CATEGORY", 
                                                                                          "AssetClass", 
                                                                                          "isAlarm", 
                                                                                          "SCS_TIME"], ignore_index = True, keep = 'last')

# Inspect Data
df_rollingWindow_seasonalAlarm.head()

Unnamed: 0,ENVIRONMENT,FUNCTIONAL_CATEGORY,AssetClass,isAlarm,SCS_TIME,Counter
0,OCCCMS,32,FPS,True,2021-01-22 01:14:06.545007872,1.0
1,OCCCMS,32,FPS,True,2021-01-22 03:02:28.375931904,1.0
2,OCCCMS,33,E,True,2021-01-01 01:15:06.395309824,1.0
3,OCCCMS,33,E,True,2021-01-01 01:15:13.601731072,2.0
4,OCCCMS,33,E,True,2021-01-01 01:23:39.180551936,3.0


In [90]:
# Inspect Data
df_rollingWindow_seasonalAlarm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70052 entries, 0 to 70051
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ENVIRONMENT          70052 non-null  object        
 1   FUNCTIONAL_CATEGORY  70052 non-null  category      
 2   AssetClass           70052 non-null  category      
 3   isAlarm              70052 non-null  bool          
 4   SCS_TIME             70052 non-null  datetime64[ns]
 5   Counter              70052 non-null  float64       
dtypes: bool(1), category(2), datetime64[ns](1), float64(1), object(1)
memory usage: 1.8+ MB


In [91]:
# Compare Baseline Values with Rolling Window Values

# Rename Columns
df_rollingWindow_seasonalAlarm.rename(columns = {'Counter':'RWEC_SeasonalAlarm'}, inplace = True)

# Merge Data
df = df.merge(df_rollingWindow_seasonalAlarm, "left", on =["ENVIRONMENT", 
                                                           "FUNCTIONAL_CATEGORY", 
                                                           "AssetClass",
                                                           "isAlarm",
                                                           "SCS_TIME"])
# Delete redundant variables
del df_rollingWindow_seasonalAlarm

# Compute the Ratio Between Actual Rolling Window and Baseline
# Compute Log10 of the Ratio Between Actual Rolling Window and Baseline (deprecated)
df["RWEC_Ratio_SeasonalAlarm"] = (df["RWEC_SeasonalAlarm"] / df["ECThreshold_SeasonalAlarm"])

# Drop Redundant Columns
df = df.drop(columns = [
                        "ECThreshold_SeasonalAlarm",
                        "RWEC_SeasonalAlarm"
                        ])

# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312160 entries, 0 to 312159
Data columns (total 33 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   EQUIPMENT_NAME            0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 3   SCS_TIME                  312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY       312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY     312160 non-null  category      
 6   ENVIRONMENT               312160 non-null  object        
 7   ASSET_ID_RAW              312160 non-null  object        
 8   EVENT_STATUS              312157 non-null  category      
 9   ASSET_DESC_CAT            311610 non-null  category      
 10  EVENT_DESC_CAT            312160 non-null  object        
 11  AssetClass                312160 non-null  category      
 12  As

#### Initiate a New Set of Rolling Window for SeverityRank Mean
Not for baseline comparison. The underlying principle is that the Severity Category serves as a likert scale ranking, whereby events of higher severity, are more likely to be anomalies of interest. This is moderated by the isAlarm and NuisanceAlarm status of the event, whereby alarm events are more likely to be an event of interest; and nuisance events are more likely to be noise to be ignored. Moderation weightage is based on the analysis that isAlarm value is a stronger predictor of an event of interest than NuisanceAlarm value, since some nuisance events can be caused by legitimate faults. The exact weightage given is just a back-of-envelope calculation.

**Importance Ranking**

1. Others < Low < Urgent < Critical
2. Maintenance < Operational
3. isAlarm False < isAlarm True
4. NuisanceAlarm True < NuisanceAlarm False
5. NuisanceAlarm False < isAlarm True

In [92]:
# Initiate Dateframe of computation
df_rollingWindow = df.copy()
df_rollingWindow = df_rollingWindow[["SCS_TIME", 
                                     "ENVIRONMENT", 
                                     "FUNCTIONAL_CATEGORY",
                                     "GEOGRAPHICAL_CATEGORY",
                                     "AssetClass",
                                     "AssetSubClass",
                                     "isAlarm",
                                     "NuisanceAlarm",
                                     "SeverityRank",
                                     "EQUIPMENT_NAME",
                                     "ASSET_ID_RAW"
                                    ]]

# Inspect data
df_rollingWindow.head()


Unnamed: 0,SCS_TIME,ENVIRONMENT,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,AssetClass,AssetSubClass,isAlarm,NuisanceAlarm,SeverityRank,EQUIPMENT_NAME,ASSET_ID_RAW
0,2021-01-01 00:30:03.941113856,OCCCMS,48,11,COM,PABX,False,True,2,,COM/BNK/B1/PABX01
1,2021-01-01 00:30:04.953128192,OCCCMS,51,17,SCS,PLC,True,True,2,,SCS/BGK/B1/PLC01
2,2021-01-01 00:30:04.953128192,OCCCMS,51,17,SCS,PLC,False,True,2,,SCS/BGK/B1/PLC01
3,2021-01-01 00:30:04.953128192,OCCCMS,48,11,COM,PABX,False,True,2,,COM/BNK/B1/PABX01
4,2021-01-01 00:30:07.483118080,OCCCMS,48,11,COM,PABX,False,True,2,,COM/BNK/B1/PABX01


In [93]:
# Compute Moderated Event Severity Score
moderation_baseline = 0.1
moderation_isAlarm = 0.55
moderation_nonNuisance = 1 - moderation_isAlarm - moderation_baseline
df_rollingWindow["SeverityScore"] = df_rollingWindow["SeverityRank"] * ((df_rollingWindow["isAlarm"] * moderation_isAlarm) + 
                                                                        ((df_rollingWindow["NuisanceAlarm"] ==  False) * 
                                                                         moderation_nonNuisance) + moderation_baseline)

# Inspect data
df_rollingWindow.head()

Unnamed: 0,SCS_TIME,ENVIRONMENT,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,AssetClass,AssetSubClass,isAlarm,NuisanceAlarm,SeverityRank,EQUIPMENT_NAME,ASSET_ID_RAW,SeverityScore
0,2021-01-01 00:30:03.941113856,OCCCMS,48,11,COM,PABX,False,True,2,,COM/BNK/B1/PABX01,0.2
1,2021-01-01 00:30:04.953128192,OCCCMS,51,17,SCS,PLC,True,True,2,,SCS/BGK/B1/PLC01,1.3
2,2021-01-01 00:30:04.953128192,OCCCMS,51,17,SCS,PLC,False,True,2,,SCS/BGK/B1/PLC01,0.2
3,2021-01-01 00:30:04.953128192,OCCCMS,48,11,COM,PABX,False,True,2,,COM/BNK/B1/PABX01,0.2
4,2021-01-01 00:30:07.483118080,OCCCMS,48,11,COM,PABX,False,True,2,,COM/BNK/B1/PABX01,0.2


In [94]:
# Delete redundant variables
df_rollingWindow = df_rollingWindow.drop(columns = ["SeverityRank", "isAlarm", "NuisanceAlarm"])

# Inspect data
df_rollingWindow.head()

Unnamed: 0,SCS_TIME,ENVIRONMENT,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,AssetClass,AssetSubClass,EQUIPMENT_NAME,ASSET_ID_RAW,SeverityScore
0,2021-01-01 00:30:03.941113856,OCCCMS,48,11,COM,PABX,,COM/BNK/B1/PABX01,0.2
1,2021-01-01 00:30:04.953128192,OCCCMS,51,17,SCS,PLC,,SCS/BGK/B1/PLC01,1.3
2,2021-01-01 00:30:04.953128192,OCCCMS,51,17,SCS,PLC,,SCS/BGK/B1/PLC01,0.2
3,2021-01-01 00:30:04.953128192,OCCCMS,48,11,COM,PABX,,COM/BNK/B1/PABX01,0.2
4,2021-01-01 00:30:07.483118080,OCCCMS,48,11,COM,PABX,,COM/BNK/B1/PABX01,0.2


In [95]:
# Set Date-Time to be the index
df_rollingWindow = df_rollingWindow.set_index(df_rollingWindow["SCS_TIME"])

# Drop proto index column
del df_rollingWindow["SCS_TIME"]

# Inspect Data
df_rollingWindow.head()

Unnamed: 0_level_0,ENVIRONMENT,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,AssetClass,AssetSubClass,EQUIPMENT_NAME,ASSET_ID_RAW,SeverityScore
SCS_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-01 00:30:03.941113856,OCCCMS,48,11,COM,PABX,,COM/BNK/B1/PABX01,0.2
2021-01-01 00:30:04.953128192,OCCCMS,51,17,SCS,PLC,,SCS/BGK/B1/PLC01,1.3
2021-01-01 00:30:04.953128192,OCCCMS,51,17,SCS,PLC,,SCS/BGK/B1/PLC01,0.2
2021-01-01 00:30:04.953128192,OCCCMS,48,11,COM,PABX,,COM/BNK/B1/PABX01,0.2
2021-01-01 00:30:07.483118080,OCCCMS,48,11,COM,PABX,,COM/BNK/B1/PABX01,0.2


#### Rolling Window Count (10min Window) - SeverityRank X isAlarm Mean by ENVIRONMENT + GEOGRAPHICAL_CATEGORY + EQUIPMENT_NAME + ASSET_ID_RAW
Not for baseline comparison

In [96]:
df_rollingWindow.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 312160 entries, 2021-01-01 00:30:03.941113856 to 2021-01-28 03:59:58.957999872
Data columns (total 8 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   ENVIRONMENT            312160 non-null  object  
 1   FUNCTIONAL_CATEGORY    312160 non-null  category
 2   GEOGRAPHICAL_CATEGORY  312160 non-null  category
 3   AssetClass             312160 non-null  category
 4   AssetSubClass          307757 non-null  category
 5   EQUIPMENT_NAME         0 non-null       float64 
 6   ASSET_ID_RAW           312160 non-null  object  
 7   SeverityScore          312160 non-null  float64 
dtypes: category(4), float64(2), object(2)
memory usage: 13.1+ MB


In [97]:
# Compute Rolling Window Mean
try:
    df_SeverityRank_LocAsset = df_rollingWindow.groupby(["ENVIRONMENT", 
                                                         "GEOGRAPHICAL_CATEGORY",
                                                         "EQUIPMENT_NAME", 
                                                         "ASSET_ID_RAW"], dropna = True).rolling('600s', min_periods=1).mean()
    # Drop redundant variables
    df_SeverityRank_LocAsset = df_SeverityRank_LocAsset.drop(columns = ["FUNCTIONAL_CATEGORY"])
    
    # Reset Index for ease of look up
    df_SeverityRank_LocAsset = df_SeverityRank_LocAsset.reset_index()
    
except:
    # Create copy
    df_SeverityRank_LocAsset = df_rollingWindow.copy()

    # Drop redundant variables
    df_SeverityRank_LocAsset = df_SeverityRank_LocAsset.drop(columns = ["FUNCTIONAL_CATEGORY", 
                                                                        "AssetClass", 
                                                                        "AssetSubClass"])
    # Reset Index for ease of look up
    df_SeverityRank_LocAsset = df_SeverityRank_LocAsset.reset_index()
    
# Inspect data
df_SeverityRank_LocAsset.head()

Unnamed: 0,SCS_TIME,ENVIRONMENT,GEOGRAPHICAL_CATEGORY,EQUIPMENT_NAME,ASSET_ID_RAW,SeverityScore
0,2021-01-01 00:30:03.941113856,OCCCMS,11,,COM/BNK/B1/PABX01,0.2
1,2021-01-01 00:30:04.953128192,OCCCMS,17,,SCS/BGK/B1/PLC01,1.3
2,2021-01-01 00:30:04.953128192,OCCCMS,17,,SCS/BGK/B1/PLC01,0.2
3,2021-01-01 00:30:04.953128192,OCCCMS,11,,COM/BNK/B1/PABX01,0.2
4,2021-01-01 00:30:07.483118080,OCCCMS,11,,COM/BNK/B1/PABX01,0.2


In [98]:
df_SeverityRank_LocAsset.shape

(312160, 6)

In [99]:
# Drop Duplicates, whilst keeping the last value
df_SeverityRank_LocAsset = df_SeverityRank_LocAsset.drop_duplicates(subset = ["ENVIRONMENT", 
                                                                              "EQUIPMENT_NAME",
                                                                              "GEOGRAPHICAL_CATEGORY",
                                                                              "ASSET_ID_RAW", 
                                                                              "SCS_TIME"], ignore_index = True, keep = 'last')

# Inspect Data
df_SeverityRank_LocAsset.head()

Unnamed: 0,SCS_TIME,ENVIRONMENT,GEOGRAPHICAL_CATEGORY,EQUIPMENT_NAME,ASSET_ID_RAW,SeverityScore
0,2021-01-01 00:30:03.941113856,OCCCMS,11,,COM/BNK/B1/PABX01,0.2
1,2021-01-01 00:30:04.953128192,OCCCMS,17,,SCS/BGK/B1/PLC01,0.2
2,2021-01-01 00:30:04.953128192,OCCCMS,11,,COM/BNK/B1/PABX01,0.2
3,2021-01-01 00:30:07.483118080,OCCCMS,11,,COM/BNK/B1/PABX01,0.2
4,2021-01-01 00:30:09.508243200,OCCCMS,11,,COM/BNK/B1/PABX01,0.2


In [100]:
# Inspect Data
df_SeverityRank_LocAsset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277475 entries, 0 to 277474
Data columns (total 6 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   SCS_TIME               277475 non-null  datetime64[ns]
 1   ENVIRONMENT            277475 non-null  object        
 2   GEOGRAPHICAL_CATEGORY  277475 non-null  category      
 3   EQUIPMENT_NAME         0 non-null       float64       
 4   ASSET_ID_RAW           277475 non-null  object        
 5   SeverityScore          277475 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(2), object(2)
memory usage: 10.9+ MB


In [101]:
# Rename Columns
df_SeverityRank_LocAsset.rename(columns = {'SeverityScore':'RWSS_LocAsset'}, inplace = True)

# Merge Data
df = df.merge(df_SeverityRank_LocAsset, "left", on = ["ENVIRONMENT", 
                                                      "GEOGRAPHICAL_CATEGORY", 
                                                      "EQUIPMENT_NAME",
                                                      "ASSET_ID_RAW",
                                                      "SCS_TIME"])

# Delete redundant variables
del df_SeverityRank_LocAsset

# Inspect data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312160 entries, 0 to 312159
Data columns (total 34 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   EQUIPMENT_NAME            0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 3   SCS_TIME                  312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY       312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY     312160 non-null  category      
 6   ENVIRONMENT               312160 non-null  object        
 7   ASSET_ID_RAW              312160 non-null  object        
 8   EVENT_STATUS              312157 non-null  category      
 9   ASSET_DESC_CAT            311610 non-null  category      
 10  EVENT_DESC_CAT            312160 non-null  object        
 11  AssetClass                312160 non-null  category      
 12  As

#### Rolling Window Count (10min Window) - SeverityRank X isAlarm Mean by ENVIRONMENT + GEOGRAPHICAL_CATEGORY + AssetClass + AssetSubClass
Not for baseline comparison

In [102]:
# Compute Rolling Window Count
df_SeverityRank_LocAssetClass = df_rollingWindow.groupby(["ENVIRONMENT", 
                                                          "GEOGRAPHICAL_CATEGORY",
                                                          "AssetClass", 
                                                          "AssetSubClass"]).rolling('600s', min_periods=1).mean()

# Drop redundant variables
df_SeverityRank_LocAssetClass = df_SeverityRank_LocAssetClass.drop(columns = ["FUNCTIONAL_CATEGORY", "EQUIPMENT_NAME"])

# Inspect data
df_SeverityRank_LocAssetClass.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,SeverityScore
ENVIRONMENT,GEOGRAPHICAL_CATEGORY,AssetClass,AssetSubClass,SCS_TIME,Unnamed: 5_level_1
OCCCMS,1,COM,PABX,2021-01-08 02:55:49.303392768,0.9
OCCCMS,1,COM,PABX,2021-01-16 02:37:27.319163136,2.0
OCCCMS,1,COM,PABX,2021-01-16 02:38:05.551353088,1.65
OCCCMS,1,COM,PABX,2021-01-16 03:56:36.993594880,0.9
OCCCMS,1,COM,PABX,2021-01-17 03:12:59.218327040,2.0


In [103]:
# Reset Index for ease of look up
df_SeverityRank_LocAssetClass = df_SeverityRank_LocAssetClass.reset_index()

# Inspect Data
df_SeverityRank_LocAssetClass.head()

Unnamed: 0,ENVIRONMENT,GEOGRAPHICAL_CATEGORY,AssetClass,AssetSubClass,SCS_TIME,SeverityScore
0,OCCCMS,1,COM,PABX,2021-01-08 02:55:49.303392768,0.9
1,OCCCMS,1,COM,PABX,2021-01-16 02:37:27.319163136,2.0
2,OCCCMS,1,COM,PABX,2021-01-16 02:38:05.551353088,1.65
3,OCCCMS,1,COM,PABX,2021-01-16 03:56:36.993594880,0.9
4,OCCCMS,1,COM,PABX,2021-01-17 03:12:59.218327040,2.0


In [104]:
df_SeverityRank_LocAssetClass.shape

(307757, 6)

In [105]:
# Drop Duplicates, whilst keeping the last value
df_SeverityRank_LocAssetClass = df_SeverityRank_LocAssetClass.drop_duplicates(subset = ["ENVIRONMENT", 
                                                                                        "GEOGRAPHICAL_CATEGORY",
                                                                                        "AssetClass", 
                                                                                        "AssetSubClass",
                                                                                        "SCS_TIME"
                                                                                       ], ignore_index = True, keep = 'last')

# Inspect Data
df_SeverityRank_LocAssetClass.head()

Unnamed: 0,ENVIRONMENT,GEOGRAPHICAL_CATEGORY,AssetClass,AssetSubClass,SCS_TIME,SeverityScore
0,OCCCMS,1,COM,PABX,2021-01-08 02:55:49.303392768,0.9
1,OCCCMS,1,COM,PABX,2021-01-16 02:37:27.319163136,2.0
2,OCCCMS,1,COM,PABX,2021-01-16 02:38:05.551353088,1.65
3,OCCCMS,1,COM,PABX,2021-01-16 03:56:36.993594880,0.9
4,OCCCMS,1,COM,PABX,2021-01-17 03:12:59.218327040,2.0


In [106]:
# Inspect Data
df_SeverityRank_LocAssetClass.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271805 entries, 0 to 271804
Data columns (total 6 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   ENVIRONMENT            271805 non-null  object        
 1   GEOGRAPHICAL_CATEGORY  271805 non-null  category      
 2   AssetClass             271805 non-null  category      
 3   AssetSubClass          271805 non-null  category      
 4   SCS_TIME               271805 non-null  datetime64[ns]
 5   SeverityScore          271805 non-null  float64       
dtypes: category(3), datetime64[ns](1), float64(1), object(1)
memory usage: 7.0+ MB


In [107]:
# Rename Columns
df_SeverityRank_LocAssetClass.rename(columns = {'SeverityScore':'RWSS_LocAssetClass'}, inplace = True)

# Merge Data
df = df.merge(df_SeverityRank_LocAssetClass, "left", on = ["ENVIRONMENT", 
                                                           "GEOGRAPHICAL_CATEGORY", 
                                                           "AssetClass",
                                                           "AssetSubClass",
                                                           "SCS_TIME"])

# Delete redundant variables
del df_SeverityRank_LocAssetClass

# Inspect data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312160 entries, 0 to 312159
Data columns (total 35 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   EQUIPMENT_NAME            0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 3   SCS_TIME                  312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY       312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY     312160 non-null  category      
 6   ENVIRONMENT               312160 non-null  object        
 7   ASSET_ID_RAW              312160 non-null  object        
 8   EVENT_STATUS              312157 non-null  category      
 9   ASSET_DESC_CAT            311610 non-null  category      
 10  EVENT_DESC_CAT            312160 non-null  object        
 11  AssetClass                312160 non-null  category      
 12  As

#### Rolling Window Count (10min) - SeverityRank X isAlarm Mean by ENVIRONMENT + GEOGRAPHICAL_CATEGORY
Not for baseline comparison

In [108]:
# Compute Rolling Window Count
df_SeverityRank_Loc = df_rollingWindow.groupby(["ENVIRONMENT", 
                                                "GEOGRAPHICAL_CATEGORY"]).rolling('600s', min_periods=1).mean()

# Drop redundant variables
df_SeverityRank_Loc = df_SeverityRank_Loc.drop(columns = ["FUNCTIONAL_CATEGORY", "EQUIPMENT_NAME"])

# Inspect data
df_SeverityRank_Loc.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SeverityScore
ENVIRONMENT,GEOGRAPHICAL_CATEGORY,SCS_TIME,Unnamed: 3_level_1
OCCCMS,1,2021-01-01 00:52:56.905100032,0.2
OCCCMS,1,2021-01-01 00:52:58.423104000,0.2
OCCCMS,1,2021-01-01 01:06:57.448450048,0.2
OCCCMS,1,2021-01-01 01:06:58.568749824,0.2
OCCCMS,1,2021-01-01 01:07:56.595595008,0.2


In [109]:
# Reset Index for ease of look up
df_SeverityRank_Loc = df_SeverityRank_Loc.reset_index()

# Inspect Data
df_SeverityRank_Loc.head()

Unnamed: 0,ENVIRONMENT,GEOGRAPHICAL_CATEGORY,SCS_TIME,SeverityScore
0,OCCCMS,1,2021-01-01 00:52:56.905100032,0.2
1,OCCCMS,1,2021-01-01 00:52:58.423104000,0.2
2,OCCCMS,1,2021-01-01 01:06:57.448450048,0.2
3,OCCCMS,1,2021-01-01 01:06:58.568749824,0.2
4,OCCCMS,1,2021-01-01 01:07:56.595595008,0.2


In [110]:
df_SeverityRank_Loc.shape

(312160, 4)

In [111]:
# Drop Duplicates, whilst keeping the last value
df_SeverityRank_Loc = df_SeverityRank_Loc.drop_duplicates(subset = ["ENVIRONMENT", 
                                                                    "GEOGRAPHICAL_CATEGORY",
                                                                    "SCS_TIME"
                                                                   ], ignore_index = True, keep = 'last')

# Inspect Data
df_SeverityRank_Loc.head()

Unnamed: 0,ENVIRONMENT,GEOGRAPHICAL_CATEGORY,SCS_TIME,SeverityScore
0,OCCCMS,1,2021-01-01 00:52:56.905100032,0.2
1,OCCCMS,1,2021-01-01 00:52:58.423104000,0.2
2,OCCCMS,1,2021-01-01 01:06:57.448450048,0.2
3,OCCCMS,1,2021-01-01 01:06:58.568749824,0.2
4,OCCCMS,1,2021-01-01 01:07:56.595595008,0.2


In [112]:
# Inspect Data
df_SeverityRank_Loc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270650 entries, 0 to 270649
Data columns (total 4 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   ENVIRONMENT            270650 non-null  object        
 1   GEOGRAPHICAL_CATEGORY  270650 non-null  category      
 2   SCS_TIME               270650 non-null  datetime64[ns]
 3   SeverityScore          270650 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(1), object(1)
memory usage: 6.5+ MB


In [113]:
# Rename Columns
df_SeverityRank_Loc.rename(columns = {'SeverityScore':'RWSS_Loc'}, inplace = True)

# Merge Data
df = df.merge(df_SeverityRank_Loc, "left", on = ["ENVIRONMENT", 
                                                 "GEOGRAPHICAL_CATEGORY", 
                                                 "SCS_TIME"])

# Delete redundant variables
del df_SeverityRank_Loc, df_rollingWindow

# Inspect data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312160 entries, 0 to 312159
Data columns (total 36 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   EQUIPMENT_NAME            0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 3   SCS_TIME                  312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY       312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY     312160 non-null  category      
 6   ENVIRONMENT               312160 non-null  object        
 7   ASSET_ID_RAW              312160 non-null  object        
 8   EVENT_STATUS              312157 non-null  category      
 9   ASSET_DESC_CAT            311610 non-null  category      
 10  EVENT_DESC_CAT            312160 non-null  object        
 11  AssetClass                312160 non-null  category      
 12  As

#### Tidy Up Values in Main Dataframe

In [114]:
# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312160 entries, 0 to 312159
Data columns (total 36 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   EQUIPMENT_NAME            0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 3   SCS_TIME                  312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY       312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY     312160 non-null  category      
 6   ENVIRONMENT               312160 non-null  object        
 7   ASSET_ID_RAW              312160 non-null  object        
 8   EVENT_STATUS              312157 non-null  category      
 9   ASSET_DESC_CAT            311610 non-null  category      
 10  EVENT_DESC_CAT            312160 non-null  object        
 11  AssetClass                312160 non-null  category      
 12  As

In [115]:
# Inspect data
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc
0,+,,True,2021-01-01 00:30:03.941113856,48,11,OCCCMS,COM/BNK/B1/PABX01,NORMAL,PABX,Fan Status,COM,PABX,2021-01-01 00:30:04.503241984,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True,0.00085,0.001284,,0.2,0.2,0.2
1,+,,True,2021-01-01 00:30:04.953128192,51,17,OCCCMS,SCS/BGK/B1/PLC01,FAULT,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:05.504389888,True,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True,0.00206,0.004139,0.002842,0.2,0.75,0.75
2,+,,True,2021-01-01 00:30:04.953128192,51,17,OCCCMS,SCS/BGK/B1/PLC01,NORMAL,ISCS PLC,Mux Selection,SCS,PLC,2021-01-01 00:30:05.504389888,False,True,False,True,True,BGK,Buangkok station,6,Maintenance-Low,2,False,2021-01-01,4,False,0,True,0.00206,0.004139,,0.2,0.75,0.75
3,+,,True,2021-01-01 00:30:04.953128192,48,11,OCCCMS,COM/BNK/B1/PABX01,FAILURE,PABX,Fan Status,COM,PABX,2021-01-01 00:30:05.504389888,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True,0.001699,0.002568,,0.2,0.2,0.2
4,+,,True,2021-01-01 00:30:07.483118080,48,11,OCCCMS,COM/BNK/B1/PABX01,NORMAL,PABX,Fan Status,COM,PABX,2021-01-01 00:30:07.503307008,False,True,False,True,True,BNK,Boon Keng station,4,Maintenance-Low,2,False,2021-01-01,4,False,0,True,0.002549,0.003853,,0.2,0.2,0.2


In [116]:
# Delete redundant variables
del AssetClassList, AssetClassList2, FCatList, FCatList2

# Sort Values By Time
df = df.sort_index().sort_values(["ENVIRONMENT", 
                                  "FUNCTIONAL_CATEGORY", 
                                  "AssetClass", 
                                  "AssetSubClass", 
                                  "EQUIPMENT_NAME", 
                                  "ASSET_ID_RAW", 
                                  "SCS_TIME"]).reset_index()
del df["index"]

# Replacing infinite with nan
try:
    df.replace([np.inf, -np.inf], 0, inplace = True)
except:
    df.replace([np.inf], 0, inplace = True)
    df.replace([-np.inf], 0, inplace = True)
    
# Inspect data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 36 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   EQUIPMENT_NAME            0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 3   SCS_TIME                  312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY       312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY     312160 non-null  category      
 6   ENVIRONMENT               312160 non-null  object        
 7   ASSET_ID_RAW              312160 non-null  object        
 8   EVENT_STATUS              312157 non-null  category      
 9   ASSET_DESC_CAT            311610 non-null  category      
 10  EVENT_DESC_CAT            312160 non-null  object        
 11  AssetClass                312160 non-null  category      
 12  As

In [117]:
# Inspect data
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc
0,+,,True,2021-01-24 03:36:05.203432192,1,1,OCCCMS,TUNNEL_LIGHT_OFF,REQUESTED,Tunnel LTG,Turn OFF NB and SB and CS Tunnel Light,LIGHT,OFF,2021-01-24 03:36:05.257608960,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3
1,+,,True,2021-01-24 03:39:40.253353984,1,1,OCCCMS,TUNNEL_LIGHT_OFF,TERMINATED,Tunnel LTG,Turn OFF NB and SB and CS Tunnel Light,LIGHT,OFF,2021-01-24 03:39:40.259689984,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3
2,+,,True,2021-01-24 03:36:51.309040128,1,1,OCCCMS,TUNNEL_LIGHT_ON,REQUESTED,Tunnel LTG,Turn ON NB and SB and CS Tunnel Light,LIGHT,ON,2021-01-24 03:36:52.259663104,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3
3,+,,True,2021-01-24 03:40:54.255806976,1,1,OCCCMS,TUNNEL_LIGHT_ON,TERMINATED,Tunnel LTG,Turn ON NB and SB and CS Tunnel Light,LIGHT,ON,2021-01-24 03:40:54.260729088,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3
4,+,,True,2021-01-25 02:14:40.259355136,1,3,OCCCMS,POW/NED/13/DCES05,REMOTE,Earthing Switch (Depot),Local/Remote Status,POW,DCES,2021-01-25 02:14:41.254756864,False,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-25,0,False,2,True,0.0,,,1.35,1.14,0.979412


In [118]:
# Get summary description of Envent Attribute sentiment
df[["SeverityRank", "RWSS_LocAsset", "RWSS_LocAssetClass", "RWSS_Loc"]].describe()

Unnamed: 0,SeverityRank,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc
count,312160.0,312160.0,307757.0,312160.0
mean,2.123219,0.720215,0.720784,0.72559
std,0.456532,0.743901,0.641165,0.580235
min,2.0,0.2,0.2,0.2
25%,2.0,0.2,0.231461,0.237151
50%,2.0,0.2,0.26183,0.401025
75%,2.0,1.3,1.11338,1.1275
max,11.0,7.15,7.15,7.15


### Review the Latest Event Combos for Useful Features for Anomaly Detection

Basic
1. "FUNCTIONAL_CATEGORY
2. "EVENT_DESC_CAT"
3. "EVENT_STATUS"
4. "ACKNOWLEDGEMENT_REQUIRED"
5. "SEVERITY"
6. "isAlarm"
7. "NuisanceAlarm"
8. "SCS_TIME"
9. "RWEC_ratio_Flat"
10. "RWEC_ratio_Seasonal"
11. "RWEC_ratio_SeasonalAlarm"

For Asset ID Extraction
1. "EQUIPMENT_NAME"
2. "ASSET_ID_RAW"

For Geographical Category Extraction
1. "GEOGRAPHICAL_CATEGORY"
2. "AssetClass"
3. "AssetSubClass"

In [119]:
# Subset / Filter Data by Date Period to better manage the data volume
df = df[(df['Date'] >= windowStart1) & (df['Date'] < windowEnd1)].reset_index()
del df["index"]

# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 36 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   EQUIPMENT_NAME            0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 3   SCS_TIME                  312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY       312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY     312160 non-null  category      
 6   ENVIRONMENT               312160 non-null  object        
 7   ASSET_ID_RAW              312160 non-null  object        
 8   EVENT_STATUS              312157 non-null  category      
 9   ASSET_DESC_CAT            311610 non-null  category      
 10  EVENT_DESC_CAT            312160 non-null  object        
 11  AssetClass                312160 non-null  category      
 12  As

In [120]:
# Condense Core Event Attributes
# "ENVIRONMENT"
# "AssetClass"
# "FUNCTIONAL_CATEGORY"
# "EVENT_DESC_CAT"
# "EVENT_STATUS"

df["EventAttr"] = df["FUNCTIONAL_CATEGORY"].astype(str) + " - " + df["EVENT_DESC_CAT"].astype(str)  + " - " + df["EVENT_STATUS"].astype(str)  

# Inspect Data
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 37 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   ENTRY_CODE_SUFFIX         312160 non-null  object        
 1   EQUIPMENT_NAME            0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED  312160 non-null  bool          
 3   SCS_TIME                  312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY       312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY     312160 non-null  category      
 6   ENVIRONMENT               312160 non-null  object        
 7   ASSET_ID_RAW              312160 non-null  object        
 8   EVENT_STATUS              312157 non-null  category      
 9   ASSET_DESC_CAT            311610 non-null  category      
 10  EVENT_DESC_CAT            312160 non-null  object        
 11  AssetClass                312160 non-null  category      
 12  As

In [121]:
# Inspect Data
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,EventAttr
0,+,,True,2021-01-24 03:36:05.203432192,1,1,OCCCMS,TUNNEL_LIGHT_OFF,REQUESTED,Tunnel LTG,Turn OFF NB and SB and CS Tunnel Light,LIGHT,OFF,2021-01-24 03:36:05.257608960,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn OFF NB and SB and CS Tunnel Light - R...
1,+,,True,2021-01-24 03:39:40.253353984,1,1,OCCCMS,TUNNEL_LIGHT_OFF,TERMINATED,Tunnel LTG,Turn OFF NB and SB and CS Tunnel Light,LIGHT,OFF,2021-01-24 03:39:40.259689984,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn OFF NB and SB and CS Tunnel Light - T...
2,+,,True,2021-01-24 03:36:51.309040128,1,1,OCCCMS,TUNNEL_LIGHT_ON,REQUESTED,Tunnel LTG,Turn ON NB and SB and CS Tunnel Light,LIGHT,ON,2021-01-24 03:36:52.259663104,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn ON NB and SB and CS Tunnel Light - RE...
3,+,,True,2021-01-24 03:40:54.255806976,1,1,OCCCMS,TUNNEL_LIGHT_ON,TERMINATED,Tunnel LTG,Turn ON NB and SB and CS Tunnel Light,LIGHT,ON,2021-01-24 03:40:54.260729088,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn ON NB and SB and CS Tunnel Light - TE...
4,+,,True,2021-01-25 02:14:40.259355136,1,3,OCCCMS,POW/NED/13/DCES05,REMOTE,Earthing Switch (Depot),Local/Remote Status,POW,DCES,2021-01-25 02:14:41.254756864,False,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-25,0,False,2,True,0.0,,,1.35,1.14,0.979412,1 - Local/Remote Status - REMOTE


#### Perform Sentiment Scoring on Event Attr

In [122]:
# Create Function to get polarity
def getSentimentScore(text):
    return TextBlob(str(text)).sentiment.polarity

# Create columns with subjectivity and polarity scores
# Note: swifter is not faster for this operation so the pandas default is used
%time df["sentimentScore"] = df["EventAttr"].apply(getSentimentScore)

# Inspect data
df.head()

CPU times: total: 48.9 s
Wall time: 49 s


Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,EventAttr,sentimentScore
0,+,,True,2021-01-24 03:36:05.203432192,1,1,OCCCMS,TUNNEL_LIGHT_OFF,REQUESTED,Tunnel LTG,Turn OFF NB and SB and CS Tunnel Light,LIGHT,OFF,2021-01-24 03:36:05.257608960,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn OFF NB and SB and CS Tunnel Light - R...,0.4
1,+,,True,2021-01-24 03:39:40.253353984,1,1,OCCCMS,TUNNEL_LIGHT_OFF,TERMINATED,Tunnel LTG,Turn OFF NB and SB and CS Tunnel Light,LIGHT,OFF,2021-01-24 03:39:40.259689984,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn OFF NB and SB and CS Tunnel Light - T...,0.4
2,+,,True,2021-01-24 03:36:51.309040128,1,1,OCCCMS,TUNNEL_LIGHT_ON,REQUESTED,Tunnel LTG,Turn ON NB and SB and CS Tunnel Light,LIGHT,ON,2021-01-24 03:36:52.259663104,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn ON NB and SB and CS Tunnel Light - RE...,0.4
3,+,,True,2021-01-24 03:40:54.255806976,1,1,OCCCMS,TUNNEL_LIGHT_ON,TERMINATED,Tunnel LTG,Turn ON NB and SB and CS Tunnel Light,LIGHT,ON,2021-01-24 03:40:54.260729088,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn ON NB and SB and CS Tunnel Light - TE...,0.4
4,+,,True,2021-01-25 02:14:40.259355136,1,3,OCCCMS,POW/NED/13/DCES05,REMOTE,Earthing Switch (Depot),Local/Remote Status,POW,DCES,2021-01-25 02:14:41.254756864,False,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-25,0,False,2,True,0.0,,,1.35,1.14,0.979412,1 - Local/Remote Status - REMOTE,-0.1


In [123]:
# Get summary description of Event Attribute sentiment
df["sentimentScore"].describe()

count    312160.000000
mean         -0.011891
std           0.193981
min          -0.500000
25%          -0.316667
50%           0.000000
75%           0.150000
max           0.500000
Name: sentimentScore, dtype: float64

In [124]:
# Create Function to classif negative, neutral and positive sentiment
def getSentimentClass(x):
    return np.where((x > 0), 1, np.where((x < 0), -1, 0))

# Gen Sentiment Class
%time df["SentimentClass"] = df["sentimentScore"].swifter.apply(getSentimentClass)

CPU times: total: 15.6 ms
Wall time: 5 ms


In [125]:
# Get summary description of Envent Attribute sentiment
df["SentimentClass"].describe()

count    312160.000000
mean          0.183992
std           0.833522
min          -1.000000
25%          -1.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: SentimentClass, dtype: float64

In [126]:
# Inspect data
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,EventAttr,sentimentScore,SentimentClass
0,+,,True,2021-01-24 03:36:05.203432192,1,1,OCCCMS,TUNNEL_LIGHT_OFF,REQUESTED,Tunnel LTG,Turn OFF NB and SB and CS Tunnel Light,LIGHT,OFF,2021-01-24 03:36:05.257608960,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn OFF NB and SB and CS Tunnel Light - R...,0.4,1
1,+,,True,2021-01-24 03:39:40.253353984,1,1,OCCCMS,TUNNEL_LIGHT_OFF,TERMINATED,Tunnel LTG,Turn OFF NB and SB and CS Tunnel Light,LIGHT,OFF,2021-01-24 03:39:40.259689984,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn OFF NB and SB and CS Tunnel Light - T...,0.4,1
2,+,,True,2021-01-24 03:36:51.309040128,1,1,OCCCMS,TUNNEL_LIGHT_ON,REQUESTED,Tunnel LTG,Turn ON NB and SB and CS Tunnel Light,LIGHT,ON,2021-01-24 03:36:52.259663104,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn ON NB and SB and CS Tunnel Light - RE...,0.4,1
3,+,,True,2021-01-24 03:40:54.255806976,1,1,OCCCMS,TUNNEL_LIGHT_ON,TERMINATED,Tunnel LTG,Turn ON NB and SB and CS Tunnel Light,LIGHT,ON,2021-01-24 03:40:54.260729088,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-24,6,True,3,True,0.0,,,0.3,0.3,0.3,1 - Turn ON NB and SB and CS Tunnel Light - TE...,0.4,1
4,+,,True,2021-01-25 02:14:40.259355136,1,3,OCCCMS,POW/NED/13/DCES05,REMOTE,Earthing Switch (Depot),Local/Remote Status,POW,DCES,2021-01-25 02:14:41.254756864,False,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-25,0,False,2,True,0.0,,,1.35,1.14,0.979412,1 - Local/Remote Status - REMOTE,-0.1,-1


#### Extract Information from Past 5 Events of the Same Asset (Suffix: _Asset_N-1)

In [127]:
# Sort Data
df = df.sort_values(by=['ENVIRONMENT', 
                        'FUNCTIONAL_CATEGORY', 
                        'EQUIPMENT_NAME', 
                        'ASSET_ID_RAW', 
                        'SCS_TIME'], ignore_index = True)
# Inspect data
df.head(10)


Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,EventAttr,sentimentScore,SentimentClass
0,+,,True,2021-01-01 01:28:58.632349952,1,3,OCCCMS,POW/NED/1122/HSCB1,OPEN,DC 1500 V Backup HSCB,Open/Close Status,POW,HSC,2021-01-01 01:28:58.502737152,False,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-01,4,False,1,True,0.0,,,1.35,1.35,1.35,1 - Open/Close Status - OPEN,0.0,0
1,+,,True,2021-01-01 03:48:00.267560960,1,3,OCCCMS,POW/NED/1122/HSCB1,CLOSE,DC 1500 V Backup HSCB,Open/Close Status,POW,HSC,2021-01-01 03:48:00.503747072,True,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-01,4,False,3,True,0.0,,,3.0,3.0,3.0,1 - Open/Close Status - CLOSE,0.0,0
2,+,,True,2021-01-01 03:50:21.154497024,1,3,OCCCMS,POW/NED/1122/HSCB1,CLOSE,DC 1500 V Backup HSCB,Open/Close Status,POW,HSC,2021-01-01 03:50:21.505383936,True,True,True,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-01,4,False,3,True,0.0,,,1.95,2.475,2.475,1 - Open/Close Status - CLOSE,0.0,0
3,+,,True,2021-01-02 03:41:19.416737024,1,3,OCCCMS,POW/NED/1122/HSCB1,CLOSE,DC 1500 V Backup HSCB,Open/Close Status,POW,HSC,2021-01-02 03:41:19.503020032,True,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-02,5,True,3,True,0.0,,,3.0,3.0,3.0,1 - Open/Close Status - CLOSE,0.0,0
4,+,,True,2021-01-02 03:41:32.166320128,1,3,OCCCMS,POW/NED/1122/HSCB1,CLOSE,DC 1500 V Backup HSCB,Open/Close Status,POW,HSC,2021-01-02 03:41:32.502966016,True,True,True,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-02,5,True,3,True,0.0,,,1.95,2.65,2.65,1 - Open/Close Status - CLOSE,0.0,0
5,+,,True,2021-01-03 00:43:15.578206976,1,3,OCCCMS,POW/NED/1122/HSCB1,OPEN,DC 1500 V Backup HSCB,Open/Close Status,POW,HSC,2021-01-03 00:43:16.503361024,False,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-03,6,True,0,True,0.0,,,1.35,1.35,1.35,1 - Open/Close Status - OPEN,0.0,0
6,+,,True,2021-01-03 01:01:34.666444032,1,3,OCCCMS,POW/NED/1122/HSCB1,LOCKOUT,DC 1500 V Backup HSCB,Lockout/Release Status,POW,HSC,2021-01-03 01:01:35.503618048,True,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-03,6,True,1,True,0.0,,,3.0,3.0,3.0,1 - Lockout/Release Status - LOCKOUT,0.0,0
7,+,,True,2021-01-03 01:01:43.746215936,1,3,OCCCMS,POW/NED/1122/HSCB1,TEST,DC 1500 V Backup HSCB,Disconnect/Test/Service Position,POW,HSC,2021-01-03 01:01:44.506316032,True,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-03,6,True,1,True,0.0,,,3.0,3.0,3.0,1 - Disconnect/Test/Service Position - TEST,0.0,0
8,+,,True,2021-01-03 01:01:47.527746816,1,3,OCCCMS,POW/NED/1122/HSCB1,RELEASE,DC 1500 V Backup HSCB,Lockout/Release Status,POW,HSC,2021-01-03 01:01:48.505334016,True,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-03,6,True,1,True,0.0,,,3.0,3.0,3.0,1 - Lockout/Release Status - RELEASE,0.0,0
9,+,,True,2021-01-03 01:03:33.342024192,1,3,OCCCMS,POW/NED/1122/HSCB1,TEST,DC 1500 V Backup HSCB,Disconnect/Test/Service Position,POW,HSC,2021-01-03 01:03:33.505008896,True,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-03,6,True,1,True,0.0,,,0.3,2.46,2.359091,1 - Disconnect/Test/Service Position - TEST,0.0,0


In [128]:
# Extract Data Points from N-1
df["SEVERITY_N-1"] = df.groupby(["ENVIRONMENT", 
                                 "FUNCTIONAL_CATEGORY", 
                                 "EQUIPMENT_NAME", 
                                 "ASSET_ID_RAW"])["SeverityRank"].shift(periods = 1, fill_value = 0)
df["SEVERITY_Worsen"] = (df["SEVERITY_N-1"] - df["SeverityRank"]) < 0
df["isAlarm_N-1"] = df.groupby(["ENVIRONMENT", 
                                "FUNCTIONAL_CATEGORY", 
                                "EQUIPMENT_NAME", 
                                "ASSET_ID_RAW"])["isAlarm"].shift(periods = 1, fill_value = False)
df["NuisanceAlarm_N-1"] = df.groupby(["ENVIRONMENT", 
                                      "FUNCTIONAL_CATEGORY", 
                                      "EQUIPMENT_NAME", 
                                      "ASSET_ID_RAW"])["NuisanceAlarm"].shift(periods = 1, fill_value = False)
df["CrashWarning_N-1"] = df.groupby(["ENVIRONMENT", 
                                     "FUNCTIONAL_CATEGORY", 
                                     "EQUIPMENT_NAME", 
                                     "ASSET_ID_RAW"])["CrashWarning"].shift(periods = 1, fill_value = False)
df["sentimentScore_N-1"] = df.groupby(["ENVIRONMENT", 
                                       "FUNCTIONAL_CATEGORY", 
                                       "EQUIPMENT_NAME", 
                                       "ASSET_ID_RAW"])["sentimentScore"].shift(periods = 1, fill_value = 0) # New update
df["SentimentClass_N-1"] = df.groupby(["ENVIRONMENT", 
                                       "FUNCTIONAL_CATEGORY", 
                                       "EQUIPMENT_NAME", 
                                       "ASSET_ID_RAW"])["SentimentClass"].shift(periods = 1, fill_value = 0) # New update
df["sentimentScoreDelta"] = df["sentimentScore"] - df["sentimentScore_N-1"] # New update
df["SentimentClassImprove_N-1"] = (df["SentimentClass"] - df["SentimentClass_N-1"]) > 0 # New update
df["SCS_TIME_logDelta_N-1"] = df.groupby(["ENVIRONMENT", 
                                          "FUNCTIONAL_CATEGORY", 
                                          "EQUIPMENT_NAME", 
                                          "ASSET_ID_RAW"])["SCS_TIME"].shift(1)
df["SCS_TIME_logDelta_N-1"] = np.log10((df["SCS_TIME"] - df["SCS_TIME_logDelta_N-1"]).dt.seconds)
df = df.replace({"SCS_TIME_logDelta_N-1": {np.inf: 0, -np.inf: 0, np.nan: 0}})

# Clean up of some values prone to error for the default fill_value (UPDATE)
df["isAlarm_N-1"] = df["isAlarm_N-1"] == True
df["NuisanceAlarm_N-1"] = df["NuisanceAlarm_N-1"] == True
df["CrashWarning_N-1"] = df["CrashWarning_N-1"] == True

# Additional Field Extraction for Non-CMS Data
# Non-CMS data have less permutations and more forgiving memory requirements
if (DataReductionOn == False):
    # Extract Data Points from N-1
    df["EventAttr_N-1"] = df.groupby(["ENVIRONMENT", 
                                      "FUNCTIONAL_CATEGORY", 
                                      "EQUIPMENT_NAME", 
                                      "ASSET_ID_RAW"])["EventAttr"].shift(periods = 1)

else:
    pass

# Inspect data
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,EventAttr,sentimentScore,SentimentClass,SEVERITY_N-1,SEVERITY_Worsen,isAlarm_N-1,NuisanceAlarm_N-1,CrashWarning_N-1,sentimentScore_N-1,SentimentClass_N-1,sentimentScoreDelta,SentimentClassImprove_N-1,SCS_TIME_logDelta_N-1
0,+,,True,2021-01-01 01:28:58.632349952,1,3,OCCCMS,POW/NED/1122/HSCB1,OPEN,DC 1500 V Backup HSCB,Open/Close Status,POW,HSC,2021-01-01 01:28:58.502737152,False,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-01,4,False,1,True,0.0,,,1.35,1.35,1.35,1 - Open/Close Status - OPEN,0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
1,+,,True,2021-01-01 03:48:00.267560960,1,3,OCCCMS,POW/NED/1122/HSCB1,CLOSE,DC 1500 V Backup HSCB,Open/Close Status,POW,HSC,2021-01-01 03:48:00.503747072,True,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-01,4,False,3,True,0.0,,,3.0,3.0,3.0,1 - Open/Close Status - CLOSE,0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
2,+,,True,2021-01-01 03:50:21.154497024,1,3,OCCCMS,POW/NED/1122/HSCB1,CLOSE,DC 1500 V Backup HSCB,Open/Close Status,POW,HSC,2021-01-01 03:50:21.505383936,True,True,True,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-01,4,False,3,True,0.0,,,1.95,2.475,2.475,1 - Open/Close Status - CLOSE,0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
3,+,,True,2021-01-02 03:41:19.416737024,1,3,OCCCMS,POW/NED/1122/HSCB1,CLOSE,DC 1500 V Backup HSCB,Open/Close Status,POW,HSC,2021-01-02 03:41:19.503020032,True,False,False,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-02,5,True,3,True,0.0,,,3.0,3.0,3.0,1 - Open/Close Status - CLOSE,0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
4,+,,True,2021-01-02 03:41:32.166320128,1,3,OCCCMS,POW/NED/1122/HSCB1,CLOSE,DC 1500 V Backup HSCB,Open/Close Status,POW,HSC,2021-01-02 03:41:32.502966016,True,True,True,False,False,NED,NEL Depot,12,Operational-Low,3,False,2021-01-02,5,True,3,True,0.0,,,1.95,2.65,2.65,1 - Open/Close Status - CLOSE,0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0


In [129]:
# Inspect data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 49 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   ENTRY_CODE_SUFFIX          312160 non-null  object        
 1   EQUIPMENT_NAME             0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED   312160 non-null  bool          
 3   SCS_TIME                   312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY        312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY      312160 non-null  category      
 6   ENVIRONMENT                312160 non-null  object        
 7   ASSET_ID_RAW               312160 non-null  object        
 8   EVENT_STATUS               312157 non-null  category      
 9   ASSET_DESC_CAT             311610 non-null  category      
 10  EVENT_DESC_CAT             312160 non-null  object        
 11  AssetClass                 312160 non-null  category

In [130]:
# Inspect data
df.tail()

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,EventAttr,sentimentScore,SentimentClass,SEVERITY_N-1,SEVERITY_Worsen,isAlarm_N-1,NuisanceAlarm_N-1,CrashWarning_N-1,sentimentScore_N-1,SentimentClass_N-1,sentimentScoreDelta,SentimentClassImprove_N-1,SCS_TIME_logDelta_N-1
312155,+,,True,2021-01-03 01:46:21.702497024,74,1,OCCCMS,SCS/NED/1212/GWS04,SUCCEEDED,NelVisu,Operator Logged In/Out of NelVisu,SCS,GWS,2021-01-03 01:46:22.505698048,False,False,False,False,False,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-03,6,True,1,True,0.0,,,1.35,1.582143,1.354,74 - Operator Logged In/Out of NelVisu - SUCCE...,0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
312156,+,,True,2021-01-11 03:25:26.227955200,74,1,OCCCMS,SCS/NED/1212/GWS04,SUCCEEDED,NelVisu,Operator Logged In/Out of NelVisu,SCS,GWS,2021-01-11 03:25:26.504785152,False,False,False,False,False,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-11,0,False,3,True,0.0,,,1.35,1.905882,1.403571,74 - Operator Logged In/Out of NelVisu - SUCCE...,0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
312157,+,,True,2021-01-11 03:25:26.734913024,74,1,OCCCMS,SCS/NED/1212/GWS04,SUCCEEDED,NelVisu,Operator Logged In/Out of NelVisu,SCS,GWS,2021-01-11 03:25:27.506278144,False,False,False,False,False,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-11,0,False,3,True,0.0,,,1.35,1.875,1.401724,74 - Operator Logged In/Out of NelVisu - SUCCE...,0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
312158,+,,True,2021-01-11 03:25:27.347742976,74,1,OCCCMS,SCS/NED/1212/GWS04,SUCCEEDED,NelVisu,Operator Logged In/Out of NelVisu,SCS,GWS,2021-01-11 03:25:27.506278144,False,False,False,False,False,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-11,0,False,3,True,0.0,,,1.35,1.847368,1.4,74 - Operator Logged In/Out of NelVisu - SUCCE...,0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
312159,+,,True,2021-01-13 01:19:49.153079040,74,1,OCCCMS,SCS/NED/1212/GWS04,SUCCEEDED,NelVisu,Operator Logged In/Out of NelVisu,SCS,GWS,2021-01-13 01:19:49.506489088,False,False,False,False,False,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-13,2,False,1,True,0.0,,,1.35,1.35,1.35,74 - Operator Logged In/Out of NelVisu - SUCCE...,0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0


#### Extract Information from Past 5 Events of the Same Geographical Category (Suffix: _GEO_N-1)

In [131]:
# Sort Data
df = df.sort_values(by=['ENVIRONMENT', 
                        'FUNCTIONAL_CATEGORY', 
                        'GEOGRAPHICAL_CATEGORY',
                        'EQUIPMENT_NAME', 
                        'ASSET_ID_RAW', 
                        'SCS_TIME'], ignore_index = True)

# Inspect Data
df.head(10)

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,EventAttr,sentimentScore,SentimentClass,SEVERITY_N-1,SEVERITY_Worsen,isAlarm_N-1,NuisanceAlarm_N-1,CrashWarning_N-1,sentimentScore_N-1,SentimentClass_N-1,sentimentScoreDelta,SentimentClassImprove_N-1,SCS_TIME_logDelta_N-1
0,+,,True,2021-01-01 01:48:02.819180032,1,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-01 01:48:03.506892800,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-01,4,False,1,True,0.0,,,0.3,0.3,1.215217,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
1,+,,True,2021-01-01 01:50:14.752684032,1,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-01 01:50:15.503557888,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-01,4,False,1,True,0.0,,,0.3,0.3,0.994118,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
2,+,,True,2021-01-02 00:37:56.958477056,1,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-02 00:37:57.507499008,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-02,5,True,0,True,0.0,,,0.3,0.3,0.313333,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
3,+,,True,2021-01-02 00:40:23.753911040,1,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-02 00:40:24.502917888,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-02,5,True,0,True,0.0,,,0.3,0.3,0.31,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
4,+,,True,2021-01-03 00:52:01.285080064,1,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-03 00:52:01.506671872,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-03,6,True,0,True,0.0,,,0.3,0.3,0.263636,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
5,+,,True,2021-01-03 00:53:40.752843008,1,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-03 00:53:41.502823168,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-03,6,True,0,True,0.0,,,0.3,0.3,0.283333,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
6,+,,True,2021-01-04 00:47:12.922439168,1,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-04 00:47:13.507666944,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-04,0,False,0,True,0.0,,,0.3,0.3,0.233333,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
7,+,,True,2021-01-04 00:49:36.755499008,1,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-04 00:49:37.504237824,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-04,0,False,0,True,0.0,,,0.3,0.3,0.253846,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
8,+,,True,2021-01-05 00:49:59.398459904,1,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-05 00:49:59.506172928,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-05,1,False,0,True,0.0,,,0.3,0.3,0.225,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0
9,+,,True,2021-01-05 00:51:45.754960128,1,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-05 00:51:46.508548864,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-05,1,False,0,True,0.0,,,0.3,0.3,0.24,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0


In [132]:
# Extract Data Points from N-1
df["SEVERITY_GEO_N-1"] = df.groupby(["ENVIRONMENT", 
                                     "GEOGRAPHICAL_CATEGORY",
                                     "FUNCTIONAL_CATEGORY"])["SeverityRank"].shift(periods = 1, fill_value = 0)
df["isAlarm_GEO_N-1"] = df.groupby(["ENVIRONMENT", 
                                    "GEOGRAPHICAL_CATEGORY",
                                    "FUNCTIONAL_CATEGORY"])["isAlarm"].shift(periods = 1, fill_value = False)
df["NuisanceAlarm_GEO_N-1"] = df.groupby(["ENVIRONMENT", 
                                          "GEOGRAPHICAL_CATEGORY",
                                          "FUNCTIONAL_CATEGORY"])["NuisanceAlarm"].shift(periods = 1, fill_value = False)
df["CrashWarning_GEO_N-1"] = df.groupby(["ENVIRONMENT", 
                                          "GEOGRAPHICAL_CATEGORY",
                                          "FUNCTIONAL_CATEGORY"])["CrashWarning"].shift(periods = 1, fill_value = False)
df["sentimentScore_GEO_N-1"] = df.groupby(["ENVIRONMENT", 
                                           "GEOGRAPHICAL_CATEGORY",
                                           "FUNCTIONAL_CATEGORY"])["sentimentScore"].shift(periods = 1, fill_value = 0) # New update
df["SCS_TIME_logDelta_GEO_N-1"] = df.groupby(["ENVIRONMENT", 
                                              "GEOGRAPHICAL_CATEGORY",
                                              "FUNCTIONAL_CATEGORY"])["SCS_TIME"].shift(periods = 1)
df["SCS_TIME_logDelta_GEO_N-1"] = np.log10((df["SCS_TIME"] - df["SCS_TIME_logDelta_GEO_N-1"]).dt.seconds)
df = df.replace({"SCS_TIME_logDelta_GEO_N-1": {np.inf: 0, -np.inf: 0, np.nan: 0}})
df["RWEC_Ratio_Flat_GEO_N-1"] = df.groupby(["ENVIRONMENT", 
                                            "GEOGRAPHICAL_CATEGORY",
                                            "FUNCTIONAL_CATEGORY"])["RWEC_Ratio_Flat"].shift(periods = 1, fill_value = 0)
df["RWEC_Ratio_Seasonal_GEO_N-1"] = df.groupby(["ENVIRONMENT", 
                                                "GEOGRAPHICAL_CATEGORY",
                                                "FUNCTIONAL_CATEGORY"])["RWEC_Ratio_Seasonal"].shift(periods = 1, fill_value = 0)
df["RWEC_Ratio_SeasonalAlarm_GEO_N-1"] = df.groupby(["ENVIRONMENT", 
                                                     "GEOGRAPHICAL_CATEGORY",
                                                     "FUNCTIONAL_CATEGORY"])["RWEC_Ratio_SeasonalAlarm"].shift(periods = 1, fill_value = 0)

# Clean up of some values prone to error for the default fill_value (UPDATE)
df["isAlarm_GEO_N-1"] = df["isAlarm_GEO_N-1"] == True
df["NuisanceAlarm_GEO_N-1"] = df["NuisanceAlarm_GEO_N-1"] == True
df["CrashWarning_GEO_N-1"] = df["CrashWarning_GEO_N-1"] == True

# Additional Field Extraction for Non-CMS Data
# Non-CMS data have less permutations and more forgiving memory requirements
if (DataReductionOn == False):
    # Extract Data Points from N-1
    df["EventAttr_GEO_N-1"] = df.groupby(["ENVIRONMENT", 
                                          "GEOGRAPHICAL_CATEGORY",
                                          "FUNCTIONAL_CATEGORY"])["EventAttr"].shift(periods = 1)
else:
    pass

# Inspect data
df.head()


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,EventAttr,sentimentScore,SentimentClass,SEVERITY_N-1,SEVERITY_Worsen,isAlarm_N-1,NuisanceAlarm_N-1,CrashWarning_N-1,sentimentScore_N-1,SentimentClass_N-1,sentimentScoreDelta,SentimentClassImprove_N-1,SCS_TIME_logDelta_N-1,SEVERITY_GEO_N-1,isAlarm_GEO_N-1,NuisanceAlarm_GEO_N-1,CrashWarning_GEO_N-1,sentimentScore_GEO_N-1,SCS_TIME_logDelta_GEO_N-1,RWEC_Ratio_Flat_GEO_N-1,RWEC_Ratio_Seasonal_GEO_N-1,RWEC_Ratio_SeasonalAlarm_GEO_N-1
0,+,,True,2021-01-01 01:48:02.819180032,1,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-01 01:48:03.506892800,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-01,4,False,1,True,0.0,,,0.3,0.3,1.215217,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,0,False,False,False,0.0,0.0,0.0,0.0,0.0
1,+,,True,2021-01-01 01:50:14.752684032,1,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-01 01:50:15.503557888,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-01,4,False,1,True,0.0,,,0.3,0.3,0.994118,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.117271,0.0,,
2,+,,True,2021-01-02 00:37:56.958477056,1,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-02 00:37:57.507499008,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-02,5,True,0,True,0.0,,,0.3,0.3,0.313333,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,4.914142,0.0,,
3,+,,True,2021-01-02 00:40:23.753911040,1,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-02 00:40:24.502917888,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-02,5,True,0,True,0.0,,,0.3,0.3,0.31,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.164353,0.0,,
4,+,,True,2021-01-03 00:52:01.285080064,1,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-03 00:52:01.506671872,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-03,6,True,0,True,0.0,,,0.3,0.3,0.263636,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.843233,0.0,,


In [133]:
# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 58 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   ENTRY_CODE_SUFFIX                 312160 non-null  object        
 1   EQUIPMENT_NAME                    0 non-null       float64       
 2   ACKNOWLEDGEMENT_REQUIRED          312160 non-null  bool          
 3   SCS_TIME                          312160 non-null  datetime64[ns]
 4   FUNCTIONAL_CATEGORY               312160 non-null  category      
 5   GEOGRAPHICAL_CATEGORY             312160 non-null  category      
 6   ENVIRONMENT                       312160 non-null  object        
 7   ASSET_ID_RAW                      312160 non-null  object        
 8   EVENT_STATUS                      312157 non-null  category      
 9   ASSET_DESC_CAT                    311610 non-null  category      
 10  EVENT_DESC_CAT                  

### Prepare Data for Anomaly Detection

In [134]:
# Subset / Filter Data by Date Period to better manage the data volume
#df = df[(df['Date'] >= windowStart2) & (df['Date'] < windowEnd2)].reset_index()
#del df["index"]


# Subset / Filter Data for Revenue Hours Only
#df = df[(df['EngHours'] == EngHrFilter)].reset_index()
#del df["index"]

# Inspect Data
#df.info()

In [135]:
# Inspect Data
df.head()

Unnamed: 0,ENTRY_CODE_SUFFIX,EQUIPMENT_NAME,ACKNOWLEDGEMENT_REQUIRED,SCS_TIME,FUNCTIONAL_CATEGORY,GEOGRAPHICAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,TIME_CODE,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeographicalCat,GeoSector,Severity_Class,SeverityRank,CrashWarning,Date,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,EventAttr,sentimentScore,SentimentClass,SEVERITY_N-1,SEVERITY_Worsen,isAlarm_N-1,NuisanceAlarm_N-1,CrashWarning_N-1,sentimentScore_N-1,SentimentClass_N-1,sentimentScoreDelta,SentimentClassImprove_N-1,SCS_TIME_logDelta_N-1,SEVERITY_GEO_N-1,isAlarm_GEO_N-1,NuisanceAlarm_GEO_N-1,CrashWarning_GEO_N-1,sentimentScore_GEO_N-1,SCS_TIME_logDelta_GEO_N-1,RWEC_Ratio_Flat_GEO_N-1,RWEC_Ratio_Seasonal_GEO_N-1,RWEC_Ratio_SeasonalAlarm_GEO_N-1
0,+,,True,2021-01-01 01:48:02.819180032,1,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-01 01:48:03.506892800,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-01,4,False,1,True,0.0,,,0.3,0.3,1.215217,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,0,False,False,False,0.0,0.0,0.0,0.0,0.0
1,+,,True,2021-01-01 01:50:14.752684032,1,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-01 01:50:15.503557888,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-01,4,False,1,True,0.0,,,0.3,0.3,0.994118,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.117271,0.0,,
2,+,,True,2021-01-02 00:37:56.958477056,1,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-02 00:37:57.507499008,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-02,5,True,0,True,0.0,,,0.3,0.3,0.313333,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,4.914142,0.0,,
3,+,,True,2021-01-02 00:40:23.753911040,1,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-02 00:40:24.502917888,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-02,5,True,0,True,0.0,,,0.3,0.3,0.31,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.164353,0.0,,
4,+,,True,2021-01-03 00:52:01.285080064,1,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,2021-01-03 00:52:01.506671872,False,True,False,True,True,OCC,Operation Control Centre,10,Operational-Low,3,False,2021-01-03,6,True,0,True,0.0,,,0.3,0.3,0.263636,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.843233,0.0,,


In [136]:
# Auto terminate script if resulting dataframe is empty
if (df.shape[0] == 0):
    sys.exit("Empty Dataframe")
else:
    pass

In [137]:
# Drop redundant variables
df = df.drop(columns = [
                        "ENTRY_CODE_SUFFIX",
                        "ACKNOWLEDGEMENT_REQUIRED", # NEW UPDATE
                        #"ENTRY_CODE",
                        #"ALARM_ID",
                        #"USER_ID",
                        #"VALUE",
                        #"VALUE_STATE",
                        #"HIDDEN",
                        #"THEME",
                        #"EQUIPMENT_DATE",
                        #"ACQUISITION_DATE",
                        #"ASSET_DESCRIPTION",
                        #"EVENT_DESCRIPTION",
                        #"OPERATOR_INITIALS",
                        #"TrainID",
                        #"CarID",
                        #"ServiceID",
                        #"DATETIME_SENT",
                        #"DATETIME_RECEIVED",
                        "TIME_CODE",
                        "GeographicalCat",
                        "GEOGRAPHICAL_CATEGORY",
                        #"FUNCTIONAL_CATEGORY",
                        "Date", 
                        #"USER1"
                        ])

# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 52 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   EQUIPMENT_NAME                    0 non-null       float64       
 1   SCS_TIME                          312160 non-null  datetime64[ns]
 2   FUNCTIONAL_CATEGORY               312160 non-null  category      
 3   ENVIRONMENT                       312160 non-null  object        
 4   ASSET_ID_RAW                      312160 non-null  object        
 5   EVENT_STATUS                      312157 non-null  category      
 6   ASSET_DESC_CAT                    311610 non-null  category      
 7   EVENT_DESC_CAT                    312160 non-null  object        
 8   AssetClass                        312160 non-null  category      
 9   AssetSubClass                     307757 non-null  category      
 10  isAlarm                         

In [138]:
# Inspect Data
df.head()

Unnamed: 0,EQUIPMENT_NAME,SCS_TIME,FUNCTIONAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeoSector,Severity_Class,SeverityRank,CrashWarning,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,EventAttr,sentimentScore,SentimentClass,SEVERITY_N-1,SEVERITY_Worsen,isAlarm_N-1,NuisanceAlarm_N-1,CrashWarning_N-1,sentimentScore_N-1,SentimentClass_N-1,sentimentScoreDelta,SentimentClassImprove_N-1,SCS_TIME_logDelta_N-1,SEVERITY_GEO_N-1,isAlarm_GEO_N-1,NuisanceAlarm_GEO_N-1,CrashWarning_GEO_N-1,sentimentScore_GEO_N-1,SCS_TIME_logDelta_GEO_N-1,RWEC_Ratio_Flat_GEO_N-1,RWEC_Ratio_Seasonal_GEO_N-1,RWEC_Ratio_SeasonalAlarm_GEO_N-1
0,,2021-01-01 01:48:02.819180032,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,False,True,False,True,True,OCC,10,Operational-Low,3,False,4,False,1,True,0.0,,,0.3,0.3,1.215217,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,0,False,False,False,0.0,0.0,0.0,0.0,0.0
1,,2021-01-01 01:50:14.752684032,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,False,True,False,True,True,OCC,10,Operational-Low,3,False,4,False,1,True,0.0,,,0.3,0.3,0.994118,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.117271,0.0,,
2,,2021-01-02 00:37:56.958477056,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,False,True,False,True,True,OCC,10,Operational-Low,3,False,5,True,0,True,0.0,,,0.3,0.3,0.313333,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,4.914142,0.0,,
3,,2021-01-02 00:40:23.753911040,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,False,True,False,True,True,OCC,10,Operational-Low,3,False,5,True,0,True,0.0,,,0.3,0.3,0.31,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.164353,0.0,,
4,,2021-01-03 00:52:01.285080064,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,False,True,False,True,True,OCC,10,Operational-Low,3,False,6,True,0,True,0.0,,,0.3,0.3,0.263636,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.843233,0.0,,


In [139]:
# Convert All Null Values with 0
#df = df.fillna(0)
df = df.replace(np.nan, 0)

# Inspect Data
df.head()

Unnamed: 0,EQUIPMENT_NAME,SCS_TIME,FUNCTIONAL_CATEGORY,ENVIRONMENT,ASSET_ID_RAW,EVENT_STATUS,ASSET_DESC_CAT,EVENT_DESC_CAT,AssetClass,AssetSubClass,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,GeoCode,GeoSector,Severity_Class,SeverityRank,CrashWarning,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,EventAttr,sentimentScore,SentimentClass,SEVERITY_N-1,SEVERITY_Worsen,isAlarm_N-1,NuisanceAlarm_N-1,CrashWarning_N-1,sentimentScore_N-1,SentimentClass_N-1,sentimentScoreDelta,SentimentClassImprove_N-1,SCS_TIME_logDelta_N-1,SEVERITY_GEO_N-1,isAlarm_GEO_N-1,NuisanceAlarm_GEO_N-1,CrashWarning_GEO_N-1,sentimentScore_GEO_N-1,SCS_TIME_logDelta_GEO_N-1,RWEC_Ratio_Flat_GEO_N-1,RWEC_Ratio_Seasonal_GEO_N-1,RWEC_Ratio_SeasonalAlarm_GEO_N-1
0,0.0,2021-01-01 01:48:02.819180032,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,False,True,False,True,True,OCC,10,Operational-Low,3,False,4,False,1,True,0.0,0.0,0.0,0.3,0.3,1.215217,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,0,False,False,False,0.0,0.0,0.0,0.0,0.0
1,0.0,2021-01-01 01:50:14.752684032,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,False,True,False,True,True,OCC,10,Operational-Low,3,False,4,False,1,True,0.0,0.0,0.0,0.3,0.3,0.994118,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.117271,0.0,0.0,0.0
2,0.0,2021-01-02 00:37:56.958477056,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,False,True,False,True,True,OCC,10,Operational-Low,3,False,5,True,0,True,0.0,0.0,0.0,0.3,0.3,0.313333,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,4.914142,0.0,0.0,0.0
3,0.0,2021-01-02 00:40:23.753911040,1,OCCCMS,TRACTION_BGK_OFF,TERMINATED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,False,True,False,True,True,OCC,10,Operational-Low,3,False,5,True,0,True,0.0,0.0,0.0,0.3,0.3,0.31,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.164353,0.0,0.0,0.0
4,0.0,2021-01-03 00:52:01.285080064,1,OCCCMS,TRACTION_BGK_OFF,REQUESTED,,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TRACTION,TRACTION,False,True,False,True,True,OCC,10,Operational-Low,3,False,6,True,0,True,0.0,0.0,0.0,0.3,0.3,0.263636,"1 - DM, DM, DI, DFS, DFN, DFN, DFS - Open Cont...",0.0,0,0,True,False,False,False,0.0,0,0.0,False,0.0,3,False,True,False,0.0,2.843233,0.0,0.0,0.0


In [140]:
# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 52 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   EQUIPMENT_NAME                    312160 non-null  float64       
 1   SCS_TIME                          312160 non-null  datetime64[ns]
 2   FUNCTIONAL_CATEGORY               312160 non-null  category      
 3   ENVIRONMENT                       312160 non-null  object        
 4   ASSET_ID_RAW                      312160 non-null  object        
 5   EVENT_STATUS                      312157 non-null  category      
 6   ASSET_DESC_CAT                    311610 non-null  category      
 7   EVENT_DESC_CAT                    312160 non-null  object        
 8   AssetClass                        312160 non-null  category      
 9   AssetSubClass                     307757 non-null  category      
 10  isAlarm                         

In [141]:
# Check current directory
# For exporting raw imputs for manual labelling and inspection
cwd

'C:\\Users\\schdadmin\\Documents\\IAMS Analytics\\alarm-event-logs'

In [142]:
# Define File Save Parameters
# For exporting raw imputs for manual labelling and inspection

#FileName = "CMS"
#Run = "-B0001"
# True for single file output; # False for multiple file output; "both" for both Single & Multiple File Output
singleSave = "both"

# Get length of dataframe
df_len = len(df)
# Inspect data
print(df_len)

# Define Size of Partitioned Dataframes
partionSize = 500000

# Define Number of Partitions (Always Round Up to Nearest Interger)
if (df_len == partionSize): 
    partitions = 1
else:
    partitions = df_len // partionSize + 1

# Inspect data
print(partitions)     


312160
1


In [143]:
# Double check saving details
FuncCatCluster[targetFuncCatCluster] 

'AltRun'

In [144]:
# Double check current directory
os.getcwd()

'C:\\Users\\schdadmin\\Documents\\IAMS Analytics\\alarm-event-logs\\taggedOutput'

In [145]:
# Export file based on above settings
# For exporting raw imputs for manual labelling and inspection
if singleSave == True:
    fileNameN = "AnomalyTaggingRaw/" + SrcEnv + BatchCode + "AnomalyTagging_RAW" + EngHrTag + "-" + FuncCatCluster[targetFuncCatCluster] + ".csv"
    df.to_csv(fileNameN, index=False)
    print(fileNameN + " SAVED")
elif singleSave == "both":
    # Single File Save
    fileNameN = "AnomalyTaggingRaw/" + SrcEnv + BatchCode + "AnomalyTagging_RAW" + EngHrTag + "-" + FuncCatCluster[targetFuncCatCluster] + ".csv"
    df.to_csv(fileNameN, index=False)
    print(fileNameN + " SAVED")
    
    # Split Dataframe into batches of 500K rows
    for counter in range(partitions):
        startPoint = counter * partionSize
        df_subset = df.iloc[startPoint : (startPoint + partionSize)]
        #print(df_subset.info())

        # Save File
        fileNameN = "AnomalyTaggingRaw_Subset/" + SrcEnv + BatchCode + "AnomalyTagging_RAW" + EngHrTag + '-' + str(counter).zfill(3) + "-" + FuncCatCluster[targetFuncCatCluster] + ".csv"
        df_subset.to_csv(fileNameN, index=False)
        print(fileNameN + " SAVED")
else:
    # Split Dataframe into batches of 500K rows
    for counter in range(partitions):
        startPoint = counter * partionSize
        df_subset = df.iloc[startPoint : (startPoint + partionSize)]
        #print(df_subset.info())

        # Save File
        fileNameN = "AnomalyTaggingRaw_Subset/" + SrcEnv + BatchCode + "AnomalyTagging_RAW" + EngHrTag + '-' + str(counter).zfill(3) + "-" + FuncCatCluster[targetFuncCatCluster] + ".csv"
        df_subset.to_csv(fileNameN, index=False)
        print(fileNameN + " SAVED")

AnomalyTaggingRaw/CMS-05-AnomalyTagging_RAW-EngHr-AltRun.csv SAVED
AnomalyTaggingRaw_Subset/CMS-05-AnomalyTagging_RAW-EngHr-000-AltRun.csv SAVED


In [146]:
# Force EQUIPMENT_NAME to be a string
df["EQUIPMENT_NAME"] = df["EQUIPMENT_NAME"].astype("object")

# Select Variables for Hash Encoding
df0 = df.select_dtypes(include = ["category", 
                                  "object"]).drop(columns = ["EQUIPMENT_NAME", 
                                                             "ASSET_ID_RAW",
                                                             "FUNCTIONAL_CATEGORY", # Updated from "FUNCTIONAL_CATEGORY DESC"
                                                             "EVENT_DESC_CAT",
                                                             "ENVIRONMENT",
                                                             "EVENT_STATUS",
                                                             "Severity_Class"
                                                            ])



# Inspect data
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   ASSET_DESC_CAT  311610 non-null  category
 1   AssetClass      312160 non-null  category
 2   AssetSubClass   307757 non-null  category
 3   GeoCode         312160 non-null  object  
 4   GeoSector       312160 non-null  category
 5   EventAttr       312160 non-null  object  
dtypes: category(4), object(2)
memory usage: 6.0+ MB


In [147]:
# Select Variables Not Required for Hashing + Event Identifiers
df["GeoSector"].astype(str)
df = pd.concat([df["EQUIPMENT_NAME"], 
                df["ASSET_ID_RAW"],
                df["ENVIRONMENT"],
                df["GeoCode"],
                df["FUNCTIONAL_CATEGORY"], # Updated from "FUNCTIONAL_CATEGORY_DESC"
                df["AssetClass"],
                df["AssetSubClass"],
                df["EVENT_DESC_CAT"],
                df["EVENT_STATUS"],
                df["Severity_Class"],
                df.select_dtypes(include = ["bool", 
                                            "datetime64[ns]", 
                                            "float64", 
                                            "int64"]), 
               ], axis=1) 

# Inspect Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 47 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   EQUIPMENT_NAME                    312160 non-null  object        
 1   ASSET_ID_RAW                      312160 non-null  object        
 2   ENVIRONMENT                       312160 non-null  object        
 3   GeoCode                           312160 non-null  object        
 4   FUNCTIONAL_CATEGORY               312160 non-null  category      
 5   AssetClass                        312160 non-null  category      
 6   AssetSubClass                     307757 non-null  category      
 7   EVENT_DESC_CAT                    312160 non-null  object        
 8   EVENT_STATUS                      312157 non-null  category      
 9   Severity_Class                    312160 non-null  category      
 10  SCS_TIME                        

In [148]:
# Convert all columns to string to make it compatible with hashing algorithm
df0 = df0.astype(str)

# Inspect Data
df0.dtypes

ASSET_DESC_CAT    object
AssetClass        object
AssetSubClass     object
GeoCode           object
GeoSector         object
EventAttr         object
dtype: object

In [149]:
# Get Cardinality Estimator of each variable as a list
%time df0_K = df0.swifter.apply(pd.Series.nunique).tolist() #updated with swifter which is faster for this operation
#%time df0_K = df0.apply(pd.Series.nunique).tolist() # deprecated as swifter is faster
df0_K = [(np.ceil(np.log10(i) / np.log10(2)) + 1).astype(int)  for i in df0_K]

# Get List of Variable Names and Combine it with Cardinality Count as a Dictionary
#df0_K = dict(zip(df0.columns.values.tolist(), df0_K))

# Inspect data
print(df0_K)

Pandas Apply:   0%|          | 0/6 [00:00<?, ?it/s]

CPU times: total: 172 ms
Wall time: 173 ms
[8, 5, 8, 6, 5, 10]


In [150]:
# Load Intel patch for SKlearn for speeding it up
# PYOD runs sklearn in the background so it can benefit from the optimisation boost
#!pip install scikit-learn-intelex
from sklearnex import patch_sklearn 
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [151]:
# Import Library
from sklearn.feature_extraction import FeatureHasher
from sklearn.compose import ColumnTransformer

In [152]:
# Get Total Number of Variables / Features
n_orig_features = df0.shape[1]
n_orig_features

6

In [153]:
# Hash Encode Data & Merge it Back
hashedExport_df = df.copy()

for i in range(n_orig_features):
    hashed = FeatureHasher(n_features = df0_K[i], input_type='string')  
    hashed_df = hashed.fit_transform(df0[df0.columns[i]].astype(str))
    hashed_df = pd.DataFrame(data=hashed_df.toarray())
    hashedExport_df = pd.concat([hashedExport_df, hashed_df], axis = 1)
    del hashed, hashed_df

# Inspect Data
hashedExport_df.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,SCS_TIME,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,SeverityRank,CrashWarning,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,sentimentScore,SEVERITY_N-1,SEVERITY_Worsen,isAlarm_N-1,NuisanceAlarm_N-1,CrashWarning_N-1,sentimentScore_N-1,sentimentScoreDelta,SentimentClassImprove_N-1,SCS_TIME_logDelta_N-1,SEVERITY_GEO_N-1,isAlarm_GEO_N-1,NuisanceAlarm_GEO_N-1,CrashWarning_GEO_N-1,sentimentScore_GEO_N-1,SCS_TIME_logDelta_GEO_N-1,RWEC_Ratio_Flat_GEO_N-1,RWEC_Ratio_Seasonal_GEO_N-1,RWEC_Ratio_SeasonalAlarm_GEO_N-1,0,1,2,3,4,5,6,7,0.1,1.1,2.1,3.1,4.1,0.2,1.2,2.2,3.2,4.2,5.1,6.1,7.1,0.3,1.3,2.3,3.3,4.3,5.2,0.4,1.4,2.4,3.4,4.4,0.5,1.5,2.5,3.5,4.5,5.3,6.2,7.2,8,9
0,0.0,TRACTION_BGK_OFF,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,2021-01-01 01:48:02.819180032,False,True,False,True,True,3,False,4,False,1,True,0.0,0.0,0.0,0.3,0.3,1.215217,0.0,0,True,False,False,False,0.0,0.0,False,0.0,0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,-3.0,2.0,-1.0,0.0,0.0,0.0,-2.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,1.0,-8.0,15.0,3.0,-1.0,1.0,7.0,0.0,-7.0,-2.0
1,0.0,TRACTION_BGK_OFF,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,2021-01-01 01:50:14.752684032,False,True,False,True,True,3,False,4,False,1,True,0.0,0.0,0.0,0.3,0.3,0.994118,0.0,0,True,False,False,False,0.0,0.0,False,0.0,3,False,True,False,0.0,2.117271,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,-3.0,2.0,-1.0,0.0,0.0,0.0,-2.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,3.0,-9.0,16.0,2.0,0.0,1.0,7.0,-2.0,-6.0,-2.0
2,0.0,TRACTION_BGK_OFF,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,2021-01-02 00:37:56.958477056,False,True,False,True,True,3,False,5,True,0,True,0.0,0.0,0.0,0.3,0.3,0.313333,0.0,0,True,False,False,False,0.0,0.0,False,0.0,3,False,True,False,0.0,4.914142,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,-3.0,2.0,-1.0,0.0,0.0,0.0,-2.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,1.0,-8.0,15.0,3.0,-1.0,1.0,7.0,0.0,-7.0,-2.0
3,0.0,TRACTION_BGK_OFF,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,2021-01-02 00:40:23.753911040,False,True,False,True,True,3,False,5,True,0,True,0.0,0.0,0.0,0.3,0.3,0.31,0.0,0,True,False,False,False,0.0,0.0,False,0.0,3,False,True,False,0.0,2.164353,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,-3.0,2.0,-1.0,0.0,0.0,0.0,-2.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,3.0,-9.0,16.0,2.0,0.0,1.0,7.0,-2.0,-6.0,-2.0
4,0.0,TRACTION_BGK_OFF,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,2021-01-03 00:52:01.285080064,False,True,False,True,True,3,False,6,True,0,True,0.0,0.0,0.0,0.3,0.3,0.263636,0.0,0,True,False,False,False,0.0,0.0,False,0.0,3,False,True,False,0.0,2.843233,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,-3.0,2.0,-1.0,0.0,0.0,0.0,-2.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,1.0,-8.0,15.0,3.0,-1.0,1.0,7.0,0.0,-7.0,-2.0


In [154]:
# Inspect Data
hashedExport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 89 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   EQUIPMENT_NAME                    312160 non-null  object        
 1   ASSET_ID_RAW                      312160 non-null  object        
 2   ENVIRONMENT                       312160 non-null  object        
 3   GeoCode                           312160 non-null  object        
 4   FUNCTIONAL_CATEGORY               312160 non-null  category      
 5   AssetClass                        312160 non-null  category      
 6   AssetSubClass                     307757 non-null  category      
 7   EVENT_DESC_CAT                    312160 non-null  object        
 8   EVENT_STATUS                      312157 non-null  category      
 9   Severity_Class                    312160 non-null  category      
 10  SCS_TIME                        

In [155]:
# Extract identifiers
hashedExportID = df[["EQUIPMENT_NAME", 
                     "ASSET_ID_RAW", 
                     "SCS_TIME", 
                     "ENVIRONMENT", 
                     "GeoCode",
                     "FUNCTIONAL_CATEGORY", # Updated from "FUNCTIONAL_CATEGORY_DESC"
                     "AssetClass",
                     "AssetSubClass",
                     "EVENT_DESC_CAT",
                     "EVENT_STATUS",
                     "Severity_Class"
                    ]].copy()

# Drop Redundant Cols
# Select only numerical data for PCA later on
hashedExport_df = hashedExport_df.select_dtypes(include = ["bool", 
                                                           "float64", 
                                                           "int64"])
#hashedExport_df = hashedExport_df.drop(columns = ["ACKNOWLEDGEMENT_REQUIRED"]) # updated: deprecated

# Delete Redundant variables
del df, df0

# Replacing infinite with nan
hashedExport_df.replace([np.inf, -np.inf], 0, inplace = True)

# Inspect Data
hashedExport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 78 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   isAlarm                           312160 non-null  bool   
 1   NuisanceAlarm                     312160 non-null  bool   
 2   RepeatAlarm                       312160 non-null  bool   
 3   AltAlarm2                         312160 non-null  bool   
 4   AltAlarm3                         312160 non-null  bool   
 5   SeverityRank                      312160 non-null  int64  
 7   DayofWeek                         312160 non-null  int64  
 8   Weekend                           312160 non-null  bool   
 9   HourofDay                         312160 non-null  int64  
 10  EngHours                          312160 non-null  bool   
 11  RWEC_Ratio_Flat                   312160 non-null  float64
 12  RWEC_Ratio_Seasonal               312160 non-null  f

In [156]:
# Inspect Data
hashedExportID.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   EQUIPMENT_NAME       312160 non-null  object        
 1   ASSET_ID_RAW         312160 non-null  object        
 2   SCS_TIME             312160 non-null  datetime64[ns]
 3   ENVIRONMENT          312160 non-null  object        
 4   GeoCode              312160 non-null  object        
 5   FUNCTIONAL_CATEGORY  312160 non-null  category      
 6   AssetClass           312160 non-null  category      
 7   AssetSubClass        307757 non-null  category      
 8   EVENT_DESC_CAT       312160 non-null  object        
 9   EVENT_STATUS         312157 non-null  category      
 10  Severity_Class       312160 non-null  category      
dtypes: category(5), datetime64[ns](1), object(5)
memory usage: 15.8+ MB


In [157]:
# Inspect Data
hashedExport_df.head()

Unnamed: 0,isAlarm,NuisanceAlarm,RepeatAlarm,AltAlarm2,AltAlarm3,SeverityRank,CrashWarning,DayofWeek,Weekend,HourofDay,EngHours,RWEC_Ratio_Flat,RWEC_Ratio_Seasonal,RWEC_Ratio_SeasonalAlarm,RWSS_LocAsset,RWSS_LocAssetClass,RWSS_Loc,sentimentScore,SEVERITY_N-1,SEVERITY_Worsen,isAlarm_N-1,NuisanceAlarm_N-1,CrashWarning_N-1,sentimentScore_N-1,sentimentScoreDelta,SentimentClassImprove_N-1,SCS_TIME_logDelta_N-1,SEVERITY_GEO_N-1,isAlarm_GEO_N-1,NuisanceAlarm_GEO_N-1,CrashWarning_GEO_N-1,sentimentScore_GEO_N-1,SCS_TIME_logDelta_GEO_N-1,RWEC_Ratio_Flat_GEO_N-1,RWEC_Ratio_Seasonal_GEO_N-1,RWEC_Ratio_SeasonalAlarm_GEO_N-1,0,1,2,3,4,5,6,7,0.1,1.1,2.1,3.1,4.1,0.2,1.2,2.2,3.2,4.2,5.1,6.1,7.1,0.3,1.3,2.3,3.3,4.3,5.2,0.4,1.4,2.4,3.4,4.4,0.5,1.5,2.5,3.5,4.5,5.3,6.2,7.2,8,9
0,False,True,False,True,True,3,False,4,False,1,True,0.0,0.0,0.0,0.3,0.3,1.215217,0.0,0,True,False,False,False,0.0,0.0,False,0.0,0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,-3.0,2.0,-1.0,0.0,0.0,0.0,-2.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,1.0,-8.0,15.0,3.0,-1.0,1.0,7.0,0.0,-7.0,-2.0
1,False,True,False,True,True,3,False,4,False,1,True,0.0,0.0,0.0,0.3,0.3,0.994118,0.0,0,True,False,False,False,0.0,0.0,False,0.0,3,False,True,False,0.0,2.117271,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,-3.0,2.0,-1.0,0.0,0.0,0.0,-2.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,3.0,-9.0,16.0,2.0,0.0,1.0,7.0,-2.0,-6.0,-2.0
2,False,True,False,True,True,3,False,5,True,0,True,0.0,0.0,0.0,0.3,0.3,0.313333,0.0,0,True,False,False,False,0.0,0.0,False,0.0,3,False,True,False,0.0,4.914142,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,-3.0,2.0,-1.0,0.0,0.0,0.0,-2.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,1.0,-8.0,15.0,3.0,-1.0,1.0,7.0,0.0,-7.0,-2.0
3,False,True,False,True,True,3,False,5,True,0,True,0.0,0.0,0.0,0.3,0.3,0.31,0.0,0,True,False,False,False,0.0,0.0,False,0.0,3,False,True,False,0.0,2.164353,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,-3.0,2.0,-1.0,0.0,0.0,0.0,-2.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,3.0,-9.0,16.0,2.0,0.0,1.0,7.0,-2.0,-6.0,-2.0
4,False,True,False,True,True,3,False,6,True,0,True,0.0,0.0,0.0,0.3,0.3,0.263636,0.0,0,True,False,False,False,0.0,0.0,False,0.0,3,False,True,False,0.0,2.843233,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,-3.0,2.0,-1.0,0.0,0.0,0.0,-2.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,1.0,-8.0,15.0,3.0,-1.0,1.0,7.0,0.0,-7.0,-2.0


In [158]:
# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low


In [159]:
hashedExport_df.shape

(312160, 78)

In [160]:
hashedExport_df.shape[0] * hashedExport_df.shape[1]

24348480

In [161]:
# Rescale data for PCA
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
%memit hashedExport_df = scaler.fit_transform(hashedExport_df)

peak memory: 13010.57 MiB, increment: 1074.04 MiB


## Anomaly Detection
Suggested Methods:
1. Isolation forest
2. Histogram-based Outlier Score (HBOS)
3. Cluster-based Local Outlier Factor (CBLOF)
4. Principle Component Analysis (PCA)
5. Lightweight On-line Detector of Anomalies (LODA)
4. Local Outlier factor - dropped due to poor performance and slow performance
5. One Class Support Vector Machine (OC-SVM) - dropped due to slow performance
6. K-Nearest Neighbout (KNN) - dropped due to slow performance
7. (Fast) Angle Based Outlier Detection (ABOD) - dropped due to slow performance

Implementation:
1. Ensemble voting of anomaly detection algorithms (current)
2. Averaging of Anomaly scoring (KIV)
3. Additional suppression of "normal states" (to be implemented)

References:
1. https://medium.com/southworks/fraud-detection-applying-unsupervised-learning-techniques-4ae6f71b266f#id_token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjQ2Mjk0OTE3NGYxZWVkZjRmOWY5NDM0ODc3YmU0ODNiMzI0MTQwZjUiLCJ0eXAiOiJKV1QifQ.eyJpc3MiOiJodHRwczovL2FjY291bnRzLmdvb2dsZS5jb20iLCJuYmYiOjE2MjkxMDEyNzYsImF1ZCI6IjIxNjI5NjAzNTgzNC1rMWs2cWUwNjBzMnRwMmEyamFtNGxqZGNtczAwc3R0Zy5hcHBzLmdvb2dsZXVzZXJjb250ZW50LmNvbSIsInN1YiI6IjExMzkxMzg3MzMzMzY5MzQwOTI2NSIsImVtYWlsIjoicmFuZGlhbmcyN0BnbWFpbC5jb20iLCJlbWFpbF92ZXJpZmllZCI6dHJ1ZSwiYXpwIjoiMjE2Mjk2MDM1ODM0LWsxazZxZTA2MHMydHAyYTJqYW00bGpkY21zMDBzdHRnLmFwcHMuZ29vZ2xldXNlcmNvbnRlbnQuY29tIiwibmFtZSI6IlJhbmRpIiwicGljdHVyZSI6Imh0dHBzOi8vbGgzLmdvb2dsZXVzZXJjb250ZW50LmNvbS9hL0FBVFhBSnllY1JQUHJ4OG16UFBvQVpSLTRROVNIYU5YME9iOXFCMXZNTWxxPXM5Ni1jIiwiZ2l2ZW5fbmFtZSI6IlJhbmRpIiwiaWF0IjoxNjI5MTAxNTc2LCJleHAiOjE2MjkxMDUxNzYsImp0aSI6IjUyZTAzOWI0NjU5Y2QyYmI5YTgxZDc0OTVmNzA4ZTliMTMzNzUzYTIifQ.BFi1lJxpyJj7_yKeINZBbVHCofCxUoSl2OFbQCRT43dblEsBITzj0dHm2_coy8KgPFF99RkL-wi6--gHoPGO7m1cPP-iFzHmZJC7I8LqkirUH958hwxn2mry5QPvKhImx1cZT99McIFRpdQhm-6MNwlxnc4Bo7mvzzt2K6aO7L19TDasJIHJhM7-j8ItEp0u_lqWsdd4GQ-dQM9ntATdRH1DGESwvDfE5g6Tv_RIT3fB4aUjrFtVeijTHD2DcjwImGLXpYoW7JY-Adi_0mpjaFfxqOvAOOgOztoK6LcXEnwdt1MS0NQz3knOid3ET3JiPy920CkUQg22inC9-LU4IQ
2. https://medium.com/@jamesstradling/unsupervised-machine-learning-with-one-class-support-vector-machines-129579a49d1d
3. https://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection
4. https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0152173#:~:text=The%20idea%20is%20that%20an%20unsupervised%20anomaly%20detection,only%20focuses%20on%20this%20unsupervised%20anomaly%20detection%20setup.
5. https://www.analyticsvidhya.com/blog/2021/06/univariate-anomaly-detection-a-walkthrough-in-python/#:~:text=K-Nearest%20Neighbours%20algorithm%20K-Nearest%20Neighbours%20algorithm%20detects%20anomalies,that%20observation%20is%20considered%20to%20be%20an%20anomaly.
6. https://towardsdatascience.com/anomaly-detection-for-dummies-15f148e559c1
7. https://www.analyticsvidhya.com/blog/2019/02/outlier-detection-python-pyod/
8. https://towardsdatascience.com/anomaly-detection-in-python-part-2-multivariate-unsupervised-methods-and-code-b311a63f298b
9. https://towardsdatascience.com/5-ways-to-detect-outliers-that-every-data-scientist-should-know-python-code-70a54335a623
10. https://pyod.readthedocs.io/en/latest/pyod.html
11. https://link.springer.com/content/pdf/10.1007/s10994-015-5521-0.pdf
12. Novelty Detection: https://scikit-learn.org/stable/auto_examples/neighbors/plot_lof_novelty_detection.html?highlight=local%20outlier%20factor
13. Outlier Detection: https://scikit-learn.org/stable/auto_examples/neighbors/plot_lof_outlier_detection.html?highlight=local%20outlier%20factor


In [162]:
# Get peak memory usage at that instant of time
print(p.memory_info().peak_wset / 1024 ** 2)

24870.19921875


In [163]:
# Define estimated proportion of outliers
outliers_fraction = 0.01

### PCA For Anomaly Detection and Data Reduction

In [164]:
# Import Library
from pyod.models.pca import PCA

# Apply PCA on full data for Anomaly Detection
if (DataReductionOn == False):
    explainedVar = 0.99
else: 
    explainedVar = 0.8

def anomalyAlgo_PCA(df_input, df_id, explainedVariance, contaminationAmt = 0.0002):
    # Create PCA Model
    PCAclf = PCA(n_components = explainedVariance, contamination = contaminationAmt, random_state = 888)
    PCAclf.fit(df_input)

    # Generate predictions
    # Get Data Sample Classifications
    scores_pred = PCAclf.decision_function(df_input) * -1
    PCAOutput  = PCAclf.predict(df_input)
    # Get Data Sample Probability Score
    PCAProbScore  = PCAclf.predict_proba(df_input)

    # Append PCA Outputs new new columns
    df_id["PCA_Class"] = PCAOutput.tolist()
    df_id["PCA_AnomalyProb"] = PCAProbScore.tolist()
    temp = pd.DataFrame(df_id["PCA_AnomalyProb"].values.tolist())[1]
    df_id["PCA_AnomalyProb"] = temp

    # Delete redundant variables
    del PCAOutput, PCAProbScore, temp

    return df_id
    
try:   
    %memit hashedExportID = anomalyAlgo_PCA(hashedExport_df, hashedExportID, explainedVar)

except MemoryError as error:
    hashedExportID["PCA_Class"] = 0
    hashedExportID["PCA_AnomalyProb"] = np.nan
    print("Out of memory error")


peak memory: 12394.75 MiB, increment: 256.36 MiB


In [165]:
# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,PCA_Class,PCA_AnomalyProb
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.316497
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.309214
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.318976
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.311145
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.314038


In [166]:
# Inspect Data
hashedExportID.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   EQUIPMENT_NAME       312160 non-null  object        
 1   ASSET_ID_RAW         312160 non-null  object        
 2   SCS_TIME             312160 non-null  datetime64[ns]
 3   ENVIRONMENT          312160 non-null  object        
 4   GeoCode              312160 non-null  object        
 5   FUNCTIONAL_CATEGORY  312160 non-null  category      
 6   AssetClass           312160 non-null  category      
 7   AssetSubClass        307757 non-null  category      
 8   EVENT_DESC_CAT       312160 non-null  object        
 9   EVENT_STATUS         312157 non-null  category      
 10  Severity_Class       312160 non-null  category      
 11  PCA_Class            312160 non-null  int64         
 12  PCA_AnomalyProb      312160 non-null  float64       
dtypes: category(5)

In [167]:
# Get Count and Ratio of Outliers
baseCount = hashedExportID["PCA_Class"].shape[0]
OutlierCount = hashedExportID[hashedExportID["PCA_Class"] == 1].shape[0]
print("Outlier Count: " + str(OutlierCount) + "/" + str(baseCount))
OutlierRatio = np.round(OutlierCount/baseCount,5)
print("Outlier Ratio (5 DP): " + str(OutlierRatio))
meanProbScore = np.round(np.nanmean(hashedExportID.loc[(hashedExportID["PCA_Class"] == 1), "PCA_AnomalyProb"]), 5)
print("Mean Outlier Probability of Outlier: " + str(meanProbScore))
OutlierCount = hashedExportID[hashedExportID["PCA_AnomalyProb"] >= 0.7].shape[0]
print("Outlier Count with Outlier Probability of 70% or more: " + str(OutlierCount))

Outlier Count: 63/312160
Outlier Ratio (5 DP): 0.0002
Mean Outlier Probability of Outlier: 0.80808
Outlier Count with Outlier Probability of 70% or more: 71


In [168]:
# Variable Reduction with PCA for 95% variance
from sklearn.decomposition import PCA
if (DataReductionOn == False):
    pca_events = PCA(n_components = 0.95)
else: 
    pca_events = PCA(n_components = 0.80)
%memit hashedExport_df = pca_events.fit_transform(hashedExport_df)
%memit hashedExport_df = pd.DataFrame(hashedExport_df)

# Inspect data
hashedExport_df.head()

peak memory: 11976.21 MiB, increment: -172.88 MiB
peak memory: 11969.06 MiB, increment: 0.00 MiB


Unnamed: 0,0,1
0,-0.390502,-0.584023
1,-0.515801,-0.210623
2,-0.508968,-0.273829
3,-0.50321,-0.225835
4,-0.522406,-0.252492


In [169]:
hashedExport_df.shape

(312160, 2)

In [170]:
# Inspect data
hashedExport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       312160 non-null  float64
 1   1       312160 non-null  float64
dtypes: float64(2)
memory usage: 4.8 MB


In [171]:
hashedExport_df.shape[0] * hashedExport_df.shape[1]

624320

#### Isolation Forest
Notes:
1. Isolation Forest Example: https://scikit-learn.org/stable/auto_examples/ensemble/plot_isolation_forest.html#sphx-glr-auto-examples-ensemble-plot-isolation-forest-py
2. Can consider Extended Isolation Forest which is a bit more robust than the vanilla form
3. https://towardsdatascience.com/outlier-detection-with-extended-isolation-forest-1e248a3fe97b
4. https://github.com/sahandha/eif/blob/master/Notebooks/EIF.ipynb

In [172]:
# Load library
#from sklearn.ensemble import IsolationForest (deprecated to switch to PYOD Library for consistency)
from pyod.models.iforest import IForest

In [173]:
def anomalyAlgo_IsoForest(df_input, df_id, contaminationAmt = outliers_fraction):
    
    # Create Isolation Forest Model
    IsoForestclf = IForest(max_samples = df_input.shape[0], contamination = contaminationAmt, random_state = 888, n_jobs = -1)
    IsoForestclf.fit(df_input)
    
    # Generate predictions
    # Get Data Sample Classifications
    scores_pred = IsoForestclf.decision_function(df_input) * -1
    IsoForestOutput  = IsoForestclf.predict(df_input)
    # Get Data Sample Probability Score
    IsoForestProbScore  = IsoForestclf.predict_proba(df_input)
    
    # Append Isolation Forest Outputs new new columns
    df_id["IsoForest_Class"] = IsoForestOutput.tolist()
    df_id["IsoForest_AnomalyProb"] = IsoForestProbScore.tolist()
    temp = pd.DataFrame(df_id["IsoForest_AnomalyProb"].values.tolist())[1]
    df_id["IsoForest_AnomalyProb"] = temp

    # Delete redundant variables
    del IsoForestOutput, IsoForestProbScore, temp
    
    return df_id

try:   
    %memit hashedExportID = anomalyAlgo_IsoForest(hashedExport_df, hashedExportID)
    
except MemoryError as error:
    hashedExportID["IsoForest_Class"] = 0
    hashedExportID["IsoForest_AnomalyProb"] = np.nan
    print("Out of memory error")

peak memory: 12500.92 MiB, increment: 531.86 MiB


In [174]:
# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,PCA_Class,PCA_AnomalyProb,IsoForest_Class,IsoForest_AnomalyProb
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.316497,1,0.643964
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.309214,0,0.362036
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.318976,0,0.425982
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.311145,0,0.359097
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.314038,0,0.364794


In [175]:
# Inspect Data
hashedExportID.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   EQUIPMENT_NAME         312160 non-null  object        
 1   ASSET_ID_RAW           312160 non-null  object        
 2   SCS_TIME               312160 non-null  datetime64[ns]
 3   ENVIRONMENT            312160 non-null  object        
 4   GeoCode                312160 non-null  object        
 5   FUNCTIONAL_CATEGORY    312160 non-null  category      
 6   AssetClass             312160 non-null  category      
 7   AssetSubClass          307757 non-null  category      
 8   EVENT_DESC_CAT         312160 non-null  object        
 9   EVENT_STATUS           312157 non-null  category      
 10  Severity_Class         312160 non-null  category      
 11  PCA_Class              312160 non-null  int64         
 12  PCA_AnomalyProb        312160 non-null  floa

In [176]:
# Get Count and Ratio of Outliers
baseCount = hashedExportID["IsoForest_Class"].shape[0]
OutlierCount = hashedExportID[hashedExportID["IsoForest_Class"] == 1].shape[0]
print("Outlier Count: " + str(OutlierCount) + "/" + str(baseCount))
OutlierRatio = np.round(OutlierCount/baseCount,5)
print("Outlier Ratio (5 DP): " + str(OutlierRatio))
meanProbScore = np.round(np.nanmean(hashedExportID.loc[(hashedExportID["IsoForest_Class"] == 1), "IsoForest_AnomalyProb"]), 5)
print("Mean Outlier Probability of Outlier: " + str(meanProbScore))
OutlierCount = hashedExportID[hashedExportID["IsoForest_AnomalyProb"] >= 0.7].shape[0]
print("Outlier Count with Outlier Probability of 70% or more: " + str(OutlierCount))

Outlier Count: 3122/312160
Outlier Ratio (5 DP): 0.01
Mean Outlier Probability of Outlier: 0.57521
Outlier Count with Outlier Probability of 70% or more: 180


#### Histogram-based Outlier Score (HBOS)

In [177]:
# Import library
from pyod.models.hbos import HBOS

In [178]:
def anomalyAlgo_HBOS(df_input, df_id, contaminationAmt = outliers_fraction):
    
    # Create HBOS model
    HBOSclf = HBOS(contamination = contaminationAmt, n_bins = 10, alpha = 0.1, tol = 0.5)
    HBOSclf.fit(df_input)

    # Generate predictions
    # Get Data Sample Classifications
    scores_pred = HBOSclf.decision_function(df_input) * -1
    HBOSOutput = HBOSclf.predict(df_input)
    # Get Data Sample Probability Score
    HBOSProbScore  = HBOSclf.predict_proba(df_input)

    # Append HBOS Outputs new new columns
    df_id["HBOS_Class"] = HBOSOutput.tolist()
    df_id["HBOS_AnomalyProb"] = HBOSProbScore.tolist()
    temp = pd.DataFrame(df_id["HBOS_AnomalyProb"].values.tolist())[1]
    df_id["HBOS_AnomalyProb"] = temp

    # Delete redundant variables
    del HBOSOutput, HBOSProbScore, temp

    return df_id

try:
    %memit hashedExportID = anomalyAlgo_HBOS(hashedExport_df, hashedExportID)
    
except MemoryError as error:
    hashedExportID["HBOS_Class"] = 0
    hashedExportID["HBOS_AnomalyProb"] = np.nan
    print("Out of memory error")

peak memory: 12066.25 MiB, increment: 92.41 MiB


In [179]:
# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,PCA_Class,PCA_AnomalyProb,IsoForest_Class,IsoForest_AnomalyProb,HBOS_Class,HBOS_AnomalyProb
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.316497,1,0.643964,0,0.837822
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.309214,0,0.362036,0,0.776042
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.318976,0,0.425982,0,0.776042
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.311145,0,0.359097,0,0.776042
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.314038,0,0.364794,0,0.776042


In [180]:
# Inspect Data
hashedExportID.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   EQUIPMENT_NAME         312160 non-null  object        
 1   ASSET_ID_RAW           312160 non-null  object        
 2   SCS_TIME               312160 non-null  datetime64[ns]
 3   ENVIRONMENT            312160 non-null  object        
 4   GeoCode                312160 non-null  object        
 5   FUNCTIONAL_CATEGORY    312160 non-null  category      
 6   AssetClass             312160 non-null  category      
 7   AssetSubClass          307757 non-null  category      
 8   EVENT_DESC_CAT         312160 non-null  object        
 9   EVENT_STATUS           312157 non-null  category      
 10  Severity_Class         312160 non-null  category      
 11  PCA_Class              312160 non-null  int64         
 12  PCA_AnomalyProb        312160 non-null  floa

In [181]:
# Get Count and Ratio of Outliers
baseCount = hashedExportID["HBOS_Class"].shape[0]
OutlierCount = hashedExportID[hashedExportID["HBOS_Class"] == 1].shape[0]
print("Outlier Count: " + str(OutlierCount) + "/" + str(baseCount))
OutlierRatio = np.round(OutlierCount/baseCount,5)
print("Outlier Ratio (5 DP): " + str(OutlierRatio))
meanProbScore = np.round(np.nanmean(hashedExportID.loc[(hashedExportID["HBOS_Class"] == 1), "HBOS_AnomalyProb"]), 5)
print("Mean Outlier Probability of Outlier: " + str(meanProbScore))
OutlierCount = hashedExportID[hashedExportID["HBOS_AnomalyProb"] >= 0.7].shape[0]
print("Outlier Count with Outlier Probability of 70% or more: " + str(OutlierCount))

Outlier Count: 1300/312160
Outlier Ratio (5 DP): 0.00416
Mean Outlier Probability of Outlier: 0.95984
Outlier Count with Outlier Probability of 70% or more: 66754


#### Cluster-based Local Outlier Factor (CBLOF)

In [182]:
# Import library
from pyod.models.cblof import CBLOF

In [183]:
def anomalyAlgo_CBLOF(df_input, df_id, contaminationAmt = outliers_fraction):
    # Create CBLOF model
    CBLOFclf = CBLOF(contamination = contaminationAmt, check_estimator = False, random_state = 888, n_jobs = -1)
    CBLOFclf.fit(df_input)

    # Generate predictions
    # Get Data Sample Classifications
    scores_pred = CBLOFclf.decision_function(df_input) * -1
    CBLOFOutput = CBLOFclf.predict(df_input)
    # Get Data Sample Probability Score
    CBLOFProbScore  = CBLOFclf.predict_proba(df_input)

    # Append CBLOF Outputs new new columns
    df_id["CBLOF_Class"] = CBLOFOutput.tolist()
    df_id["CBLOF_AnomalyProb"] = CBLOFProbScore.tolist()
    temp = pd.DataFrame(df_id["CBLOF_AnomalyProb"].values.tolist())[1]
    df_id["CBLOF_AnomalyProb"] = temp

    # Delete redundant variables
    del CBLOFOutput, temp, CBLOFProbScore
    
    return df_id
    
try:
    %memit hashedExportID = anomalyAlgo_CBLOF(hashedExport_df, hashedExportID)

except MemoryError as error:
    hashedExportID["CBLOF_Class"] = 0
    hashedExportID["CBLOF_AnomalyProb"] = np.nan
    print("Out of memory error")

peak memory: 12095.36 MiB, increment: 72.96 MiB


In [184]:
# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,PCA_Class,PCA_AnomalyProb,IsoForest_Class,IsoForest_AnomalyProb,HBOS_Class,HBOS_AnomalyProb,CBLOF_Class,CBLOF_AnomalyProb
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.316497,1,0.643964,0,0.837822,0,0.772957
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.309214,0,0.362036,0,0.776042,0,0.520069
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.318976,0,0.425982,0,0.776042,0,0.550849
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.311145,0,0.359097,0,0.776042,0,0.534248
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.314038,0,0.364794,0,0.776042,0,0.533225


In [185]:
# Inspect Data
hashedExportID.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   EQUIPMENT_NAME         312160 non-null  object        
 1   ASSET_ID_RAW           312160 non-null  object        
 2   SCS_TIME               312160 non-null  datetime64[ns]
 3   ENVIRONMENT            312160 non-null  object        
 4   GeoCode                312160 non-null  object        
 5   FUNCTIONAL_CATEGORY    312160 non-null  category      
 6   AssetClass             312160 non-null  category      
 7   AssetSubClass          307757 non-null  category      
 8   EVENT_DESC_CAT         312160 non-null  object        
 9   EVENT_STATUS           312157 non-null  category      
 10  Severity_Class         312160 non-null  category      
 11  PCA_Class              312160 non-null  int64         
 12  PCA_AnomalyProb        312160 non-null  floa

In [186]:
# Get Count and Ratio of Outliers
baseCount = hashedExportID["CBLOF_Class"].shape[0]
OutlierCount = hashedExportID[hashedExportID["CBLOF_Class"] == 1].shape[0]
print("Outlier Count: " + str(OutlierCount) + "/" + str(baseCount))
OutlierRatio = np.round(OutlierCount/baseCount,5)
print("Outlier Ratio (5 DP): " + str(OutlierRatio))
meanProbScore = np.round(np.nanmean(hashedExportID.loc[(hashedExportID["CBLOF_Class"] == 1), "CBLOF_AnomalyProb"]), 5)
print("Mean Outlier Probability of Outlier: " + str(meanProbScore))
OutlierCount = hashedExportID[hashedExportID["CBLOF_AnomalyProb"] >= 0.7].shape[0]
print("Outlier Count with Outlier Probability of 70% or more: " + str(OutlierCount))

Outlier Count: 3122/312160
Outlier Ratio (5 DP): 0.01
Mean Outlier Probability of Outlier: 0.88064
Outlier Count with Outlier Probability of 70% or more: 13671


#### Lightweight On-line Detector of Anomalies (LODA)

In [187]:
# Import library
from pyod.models.loda import LODA

In [188]:
def anomalyAlgo_LODA(df_input, df_id, contaminationAmt = outliers_fraction):
    # Create LODA model
    LODAclf = LODA(contamination = contaminationAmt, n_bins = 10, n_random_cuts = 100)
    LODAclf.fit(df_input)
    
    # Generate predictions
    # Get Data Sample Classifications
    scores_pred = LODAclf.decision_function(df_input) * -1
    LODAOutput = LODAclf.predict(df_input)
    # Get Data Sample Probability Score
    LODAProbScore  = LODAclf.predict_proba(df_input)

    # Append LODA Outputs new new columns
    df_id["LODA_Class"] = LODAOutput.tolist()
    df_id["LODA_AnomalyProb"] = LODAProbScore.tolist()
    temp = pd.DataFrame(df_id["LODA_AnomalyProb"].values.tolist())[1]
    df_id["LODA_AnomalyProb"] = temp
    
    # Delete redundant variables
    del LODAOutput, temp, LODAProbScore
    
    return df_id
try:
    %memit hashedExportID = anomalyAlgo_LODA(hashedExport_df, hashedExportID)
        
except MemoryError as error:
    hashedExportID["LODA_Class"] = 0
    hashedExportID["LODA_AnomalyProb"] = np.nan
    print("Out of memory error")

peak memory: 12120.40 MiB, increment: 66.95 MiB


In [189]:
# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,PCA_Class,PCA_AnomalyProb,IsoForest_Class,IsoForest_AnomalyProb,HBOS_Class,HBOS_AnomalyProb,CBLOF_Class,CBLOF_AnomalyProb,LODA_Class,LODA_AnomalyProb
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.316497,1,0.643964,0,0.837822,0,0.772957,0,0.746246
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.309214,0,0.362036,0,0.776042,0,0.520069,0,0.031734
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.318976,0,0.425982,0,0.776042,0,0.550849,0,0.031734
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.311145,0,0.359097,0,0.776042,0,0.534248,0,0.031734
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.314038,0,0.364794,0,0.776042,0,0.533225,0,0.031734


In [190]:
# Inspect Data
hashedExportID.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 21 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   EQUIPMENT_NAME         312160 non-null  object        
 1   ASSET_ID_RAW           312160 non-null  object        
 2   SCS_TIME               312160 non-null  datetime64[ns]
 3   ENVIRONMENT            312160 non-null  object        
 4   GeoCode                312160 non-null  object        
 5   FUNCTIONAL_CATEGORY    312160 non-null  category      
 6   AssetClass             312160 non-null  category      
 7   AssetSubClass          307757 non-null  category      
 8   EVENT_DESC_CAT         312160 non-null  object        
 9   EVENT_STATUS           312157 non-null  category      
 10  Severity_Class         312160 non-null  category      
 11  PCA_Class              312160 non-null  int64         
 12  PCA_AnomalyProb        312160 non-null  floa

In [191]:
# Get Count and Ratio of Outliers
baseCount = hashedExportID["LODA_Class"].shape[0]
OutlierCount = hashedExportID[hashedExportID["LODA_Class"] == 1].shape[0]
print("Outlier Count: " + str(OutlierCount) + "/" + str(baseCount))
OutlierRatio = np.round(OutlierCount/baseCount,5)
print("Outlier Ratio (5 DP): " + str(OutlierRatio))
meanProbScore = np.round(np.nanmean(hashedExportID.loc[(hashedExportID["LODA_Class"] == 1), "LODA_AnomalyProb"]), 5)
print("Mean Outlier Probability of Outlier: " + str(meanProbScore))
OutlierCount = hashedExportID[hashedExportID["LODA_AnomalyProb"] >= 0.7].shape[0]
print("Outlier Count with Outlier Probability of 70% or more: " + str(OutlierCount))

Outlier Count: 1581/312160
Outlier Ratio (5 DP): 0.00506
Mean Outlier Probability of Outlier: 0.98134
Outlier Count with Outlier Probability of 70% or more: 27731


### Get Average Anomaly Probability

In [192]:
# Get Average Anomaly Probaility
hashedExportID["AnomalyProb"] = hashedExportID[["IsoForest_AnomalyProb",
                                                "PCA_AnomalyProb",
                                                "HBOS_AnomalyProb", 
                                                "CBLOF_AnomalyProb", 
                                                "LODA_AnomalyProb"]].mean(axis = 1, skipna = True)

# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,PCA_Class,PCA_AnomalyProb,IsoForest_Class,IsoForest_AnomalyProb,HBOS_Class,HBOS_AnomalyProb,CBLOF_Class,CBLOF_AnomalyProb,LODA_Class,LODA_AnomalyProb,AnomalyProb
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.316497,1,0.643964,0,0.837822,0,0.772957,0,0.746246,0.663497
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.309214,0,0.362036,0,0.776042,0,0.520069,0,0.031734,0.399819
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.318976,0,0.425982,0,0.776042,0,0.550849,0,0.031734,0.420717
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.311145,0,0.359097,0,0.776042,0,0.534248,0,0.031734,0.402453
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.314038,0,0.364794,0,0.776042,0,0.533225,0,0.031734,0.403967


In [193]:
# Get Anomaly Classification based on Average Anomaly Probaility
hashedExportID["AnomalyProbClass"] = hashedExportID["AnomalyProb"] >= 0.7

# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,PCA_Class,PCA_AnomalyProb,IsoForest_Class,IsoForest_AnomalyProb,HBOS_Class,HBOS_AnomalyProb,CBLOF_Class,CBLOF_AnomalyProb,LODA_Class,LODA_AnomalyProb,AnomalyProb,AnomalyProbClass
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.316497,1,0.643964,0,0.837822,0,0.772957,0,0.746246,0.663497,False
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.309214,0,0.362036,0,0.776042,0,0.520069,0,0.031734,0.399819,False
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.318976,0,0.425982,0,0.776042,0,0.550849,0,0.031734,0.420717,False
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.311145,0,0.359097,0,0.776042,0,0.534248,0,0.031734,0.402453,False
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.314038,0,0.364794,0,0.776042,0,0.533225,0,0.031734,0.403967,False


In [194]:
OutlierCount = hashedExportID[hashedExportID["AnomalyProbClass"] == True].shape[0]
print("Outlier Count with Mean Outlier Probability of 70% or more: " + str(OutlierCount))

Outlier Count with Mean Outlier Probability of 70% or more: 4


In [195]:
# Inspect Data
hashedExportID.loc[hashedExportID["AnomalyProbClass"] == True].head(10)

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,PCA_Class,PCA_AnomalyProb,IsoForest_Class,IsoForest_AnomalyProb,HBOS_Class,HBOS_AnomalyProb,CBLOF_Class,CBLOF_AnomalyProb,LODA_Class,LODA_AnomalyProb,AnomalyProb,AnomalyProbClass
199542,0.0,COM/DBG/B5/ACM07,2021-01-06 03:00:22.773919232,OCCCMS,DBG,51,COM,ACM,Rack Fan,NORMAL,Maintenance-Low,1,0.777334,0,0.482115,1,1.0,1,0.882022,0,0.394304,0.707155,True
199556,0.0,COM/DBG/B5/ACM07,2021-01-06 03:50:37.187553792,OCCCMS,DBG,51,COM,ACM,Rack Fan,NORMAL,Maintenance-Low,1,0.816611,0,0.501167,1,1.0,1,0.882544,0,0.394304,0.718925,True
199557,0.0,COM/DBG/B5/ACM07,2021-01-06 03:52:36.273238784,OCCCMS,DBG,51,COM,ACM,Rack Fan,ALARM,Maintenance-Low,1,0.855925,0,0.342261,1,1.0,1,0.919531,0,0.394304,0.702404,True
199558,0.0,COM/DBG/B5/ACM07,2021-01-06 03:52:36.779228928,OCCCMS,DBG,51,COM,ACM,Rack Fan,NORMAL,Maintenance-Low,1,0.895177,1,0.526619,1,1.0,1,0.882568,0,0.394304,0.739734,True


### Clean Up Presentation

In [196]:
# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,PCA_Class,PCA_AnomalyProb,IsoForest_Class,IsoForest_AnomalyProb,HBOS_Class,HBOS_AnomalyProb,CBLOF_Class,CBLOF_AnomalyProb,LODA_Class,LODA_AnomalyProb,AnomalyProb,AnomalyProbClass
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.316497,1,0.643964,0,0.837822,0,0.772957,0,0.746246,0.663497,False
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.309214,0,0.362036,0,0.776042,0,0.520069,0,0.031734,0.399819,False
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.318976,0,0.425982,0,0.776042,0,0.550849,0,0.031734,0.420717,False
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.311145,0,0.359097,0,0.776042,0,0.534248,0,0.031734,0.402453,False
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.314038,0,0.364794,0,0.776042,0,0.533225,0,0.031734,0.403967,False


In [197]:
# Inspect Data
hashedExportID.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   EQUIPMENT_NAME         312160 non-null  object        
 1   ASSET_ID_RAW           312160 non-null  object        
 2   SCS_TIME               312160 non-null  datetime64[ns]
 3   ENVIRONMENT            312160 non-null  object        
 4   GeoCode                312160 non-null  object        
 5   FUNCTIONAL_CATEGORY    312160 non-null  category      
 6   AssetClass             312160 non-null  category      
 7   AssetSubClass          307757 non-null  category      
 8   EVENT_DESC_CAT         312160 non-null  object        
 9   EVENT_STATUS           312157 non-null  category      
 10  Severity_Class         312160 non-null  category      
 11  PCA_Class              312160 non-null  int64         
 12  PCA_AnomalyProb        312160 non-null  floa

In [198]:
# Get Outlier Strength by ensemble voting
# Note that the default value of each classifier output being summed is 0 if it is a null value
hashedExportID["Outlier_Strength"] = hashedExportID[["IsoForest_Class",
                                                     "PCA_Class",
                                                     "HBOS_Class", 
                                                     "CBLOF_Class", 
                                                     "LODA_Class"]].sum(axis = 1, skipna = True) 

# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,PCA_Class,PCA_AnomalyProb,IsoForest_Class,IsoForest_AnomalyProb,HBOS_Class,HBOS_AnomalyProb,CBLOF_Class,CBLOF_AnomalyProb,LODA_Class,LODA_AnomalyProb,AnomalyProb,AnomalyProbClass,Outlier_Strength
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.316497,1,0.643964,0,0.837822,0,0.772957,0,0.746246,0.663497,False,1
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.309214,0,0.362036,0,0.776042,0,0.520069,0,0.031734,0.399819,False,0
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.318976,0,0.425982,0,0.776042,0,0.550849,0,0.031734,0.420717,False,0
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.311145,0,0.359097,0,0.776042,0,0.534248,0,0.031734,0.402453,False,0
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.314038,0,0.364794,0,0.776042,0,0.533225,0,0.031734,0.403967,False,0


In [199]:
# Rename Column
hashedExportID.rename(columns = {
                                 'IsoForest_Class':'Outlier_IsoForest', 
                                 'PCA_Class':'Outlier_PCA', 
                                 'HBOS_Class':'Outlier_HBOS', 
                                 'CBLOF_Class':'Outlier_CBLOF',
                                 'LODA_Class':'Outlier_LODA'
                                }, inplace = True)

In [200]:
# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,Outlier_PCA,PCA_AnomalyProb,Outlier_IsoForest,IsoForest_AnomalyProb,Outlier_HBOS,HBOS_AnomalyProb,Outlier_CBLOF,CBLOF_AnomalyProb,Outlier_LODA,LODA_AnomalyProb,AnomalyProb,AnomalyProbClass,Outlier_Strength
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.316497,1,0.643964,0,0.837822,0,0.772957,0,0.746246,0.663497,False,1
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.309214,0,0.362036,0,0.776042,0,0.520069,0,0.031734,0.399819,False,0
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.318976,0,0.425982,0,0.776042,0,0.550849,0,0.031734,0.420717,False,0
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,0,0.311145,0,0.359097,0,0.776042,0,0.534248,0,0.031734,0.402453,False,0
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,0,0.314038,0,0.364794,0,0.776042,0,0.533225,0,0.031734,0.403967,False,0


In [201]:
# Convert Datatype
hashedExportID = hashedExportID.astype({
                                        'Outlier_IsoForest': 'bool',
                                        'Outlier_PCA': 'bool',                                        'Outlier_HBOS': 'bool',
                                        'Outlier_HBOS': 'bool',                                        'Outlier_HBOS': 'bool',
                                        'Outlier_CBLOF': 'bool',
                                        'Outlier_LODA': 'bool'
                                       })

# Inspect Data
hashedExportID.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312160 entries, 0 to 312159
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   EQUIPMENT_NAME         312160 non-null  object        
 1   ASSET_ID_RAW           312160 non-null  object        
 2   SCS_TIME               312160 non-null  datetime64[ns]
 3   ENVIRONMENT            312160 non-null  object        
 4   GeoCode                312160 non-null  object        
 5   FUNCTIONAL_CATEGORY    312160 non-null  category      
 6   AssetClass             312160 non-null  category      
 7   AssetSubClass          307757 non-null  category      
 8   EVENT_DESC_CAT         312160 non-null  object        
 9   EVENT_STATUS           312157 non-null  category      
 10  Severity_Class         312160 non-null  category      
 11  Outlier_PCA            312160 non-null  bool          
 12  PCA_AnomalyProb        312160 non-null  floa

In [202]:
# Inspect Data
hashedExportID.head()

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,Outlier_PCA,PCA_AnomalyProb,Outlier_IsoForest,IsoForest_AnomalyProb,Outlier_HBOS,HBOS_AnomalyProb,Outlier_CBLOF,CBLOF_AnomalyProb,Outlier_LODA,LODA_AnomalyProb,AnomalyProb,AnomalyProbClass,Outlier_Strength
0,0.0,TRACTION_BGK_OFF,2021-01-01 01:48:02.819180032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,False,0.316497,True,0.643964,False,0.837822,False,0.772957,False,0.746246,0.663497,False,1
1,0.0,TRACTION_BGK_OFF,2021-01-01 01:50:14.752684032,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,False,0.309214,False,0.362036,False,0.776042,False,0.520069,False,0.031734,0.399819,False,0
2,0.0,TRACTION_BGK_OFF,2021-01-02 00:37:56.958477056,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,False,0.318976,False,0.425982,False,0.776042,False,0.550849,False,0.031734,0.420717,False,0
3,0.0,TRACTION_BGK_OFF,2021-01-02 00:40:23.753911040,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",TERMINATED,Operational-Low,False,0.311145,False,0.359097,False,0.776042,False,0.534248,False,0.031734,0.402453,False,0
4,0.0,TRACTION_BGK_OFF,2021-01-03 00:52:01.285080064,OCCCMS,OCC,1,TRACTION,TRACTION,"DM, DM, DI, DFS, DFN, DFN, DFS - Open Control",REQUESTED,Operational-Low,False,0.314038,False,0.364794,False,0.776042,False,0.533225,False,0.031734,0.403967,False,0


In [203]:
# Get Count and Ratio of Outliers
# Max range is the number of outlier detection algorithms used + 1
maxRange = 6
for n in range(maxRange):
    print("Outlier Vote of: " + str(n))
    baseCount = hashedExportID["Outlier_Strength"].shape[0]
    OutlierCount = hashedExportID[hashedExportID["Outlier_Strength"] == n].shape[0]
    print("Outlier Count: " + str(OutlierCount) + "/" + str(baseCount))
    OutlierRatio = np.round(OutlierCount/baseCount,5)
    print("Outlier Ratio (5 DP): " + str(OutlierRatio))
    meanProbScore = np.round(np.nanmean(hashedExportID.loc[(hashedExportID["Outlier_Strength"] == n), "AnomalyProb"]), 5)
    print("Mean Outlier Probability: " + str(meanProbScore))
    print()
    print("--------------------------------")
    print()

Outlier Vote of: 0
Outlier Count: 304776/312160
Outlier Ratio (5 DP): 0.97635
Mean Outlier Probability: 0.2426

--------------------------------

Outlier Vote of: 1
Outlier Count: 5651/312160
Outlier Ratio (5 DP): 0.0181
Mean Outlier Probability: 0.47537

--------------------------------

Outlier Vote of: 2
Outlier Count: 1663/312160
Outlier Ratio (5 DP): 0.00533
Mean Outlier Probability: 0.56856

--------------------------------

Outlier Vote of: 3
Outlier Count: 69/312160
Outlier Ratio (5 DP): 0.00022
Mean Outlier Probability: 0.61054

--------------------------------

Outlier Vote of: 4
Outlier Count: 1/312160
Outlier Ratio (5 DP): 0.0
Mean Outlier Probability: 0.73973

--------------------------------

Outlier Vote of: 5
Outlier Count: 0/312160
Outlier Ratio (5 DP): 0.0
Mean Outlier Probability: nan

--------------------------------



  meanProbScore = np.round(np.nanmean(hashedExportID.loc[(hashedExportID["Outlier_Strength"] == n), "AnomalyProb"]), 5)


In [204]:
# Inspect Data
hashedExportID.loc[hashedExportID["Outlier_Strength"] == 5]

Unnamed: 0,EQUIPMENT_NAME,ASSET_ID_RAW,SCS_TIME,ENVIRONMENT,GeoCode,FUNCTIONAL_CATEGORY,AssetClass,AssetSubClass,EVENT_DESC_CAT,EVENT_STATUS,Severity_Class,Outlier_PCA,PCA_AnomalyProb,Outlier_IsoForest,IsoForest_AnomalyProb,Outlier_HBOS,HBOS_AnomalyProb,Outlier_CBLOF,CBLOF_AnomalyProb,Outlier_LODA,LODA_AnomalyProb,AnomalyProb,AnomalyProbClass,Outlier_Strength


In [205]:
# Get peak memory usage at that instant of time
print(p.memory_info().peak_wset / 1024 ** 2)

24870.19921875


## Export File

In [206]:
# Check current directory
cwd

'C:\\Users\\schdadmin\\Documents\\IAMS Analytics\\alarm-event-logs'

In [207]:
# Define File Save Parameters
#FileName = "CMS"
#Run = "-B0001"
# True for single file output; # False for multiple file output; "both" for both Single & Multiple File Output
singleSave = "both"

# Get length of dataframe
df_len = len(hashedExportID)
# Inspect data
print(df_len)

# Define Size of Partitioned Dataframes
partionSize = 500000

# Define Number of Partitions (Always Round Up to Nearest Interger)
if (df_len == partionSize): 
    partitions = 1
else:
    partitions = df_len // partionSize + 1

# Inspect data
print(partitions)


312160
1


In [208]:
# Export file based on above settings
if singleSave == True:
    fileNameN = "AnomalyTaggingResults/" + SrcEnv + BatchCode + "AnomalyTagging_Output" + EngHrTag + "-" + FuncCatCluster[targetFuncCatCluster] + ".csv"
    hashedExportID.to_csv(fileNameN, index=False)
    print(fileNameN + " SAVED")
elif singleSave == "both":
    # Single File Save
    fileNameN = "AnomalyTaggingResults/" + SrcEnv + BatchCode + "AnomalyTagging_Output" + EngHrTag + "-" + FuncCatCluster[targetFuncCatCluster] + ".csv"
    hashedExportID.to_csv(fileNameN, index=False)
    print(fileNameN + " SAVED")
    
    # Split Dataframe into batches of 500K rows
    for counter in range(partitions):
        startPoint = counter * partionSize
        df_subset = hashedExportID.iloc[startPoint : (startPoint + partionSize)]
        #print(df_subset.info())

        # Save File
        fileNameN = "AnomalyTaggingResults_Subset/" + SrcEnv + BatchCode + "AnomalyTagging_Output" + EngHrTag  + '-' + str(counter).zfill(3) + "-" + FuncCatCluster[targetFuncCatCluster] + ".csv"
        df_subset.to_csv(fileNameN, index=False)
        print(fileNameN + " SAVED")
else:
    # Split Dataframe into batches of 500K rows
    for counter in range(partitions):
        startPoint = counter * partionSize
        df_subset = hashedExportID.iloc[startPoint : (startPoint + partionSize)]
        #print(df_subset.info())

        # Save File
        fileNameN = "AnomalyTaggingResults_Subset/" + SrcEnv + BatchCode + "AnomalyTagging_Output" + EngHrTag  + '-' + str(counter).zfill(3) + "-" + FuncCatCluster[targetFuncCatCluster] + ".csv"
        df_subset.to_csv(fileNameN, index=False)
        print(fileNameN + " SAVED")
        
# Ring Beeper When Complete
import winsound
import time

duration1 = 400  # milliseconds
freq1 = 400  # Hz
duration2 = 600  # milliseconds
freq2 = 300  # Hz
repeatCount = 7
try:
    for n in range(repeatCount):
        winsound.Beep(freq1, duration1)
        winsound.Beep(freq2, duration2)
        time.sleep(1)
    print("File Save Complete")
except:
    print("File Save Complete")

AnomalyTaggingResults/CMS-05-AnomalyTagging_Output-EngHr-AltRun.csv SAVED
AnomalyTaggingResults_Subset/CMS-05-AnomalyTagging_Output-EngHr-000-AltRun.csv SAVED
File Save Complete
