# Documentation

## Objective(s)
1. Preprocessing of HISevent log files for further analysis

    
## Change Log
1. Bug Fix
2. Scrip Optimisation
3. Enhanced cleaning of data
    


# Initialisation

### Adjust Display Theme of Jupyter Notebook
Optional Step

Key Hotkeys:
1. Run cell: ctrl + enter
2. Delete Cell: DD
3. Undo Delete Cell: Z
4. Cut Cell: X
5. Copy Cell: C
6. Insert Cell Above: A
7. Insert Cell Below: B
8. Convert Cell to Code: Y
9. Convert Cell to Markdown: M

In [1]:
# install jupyterthemes
#!pip install jupyterthemes

# upgrade to latest version
#!pip install --upgrade jupyterthemes

In [2]:
#import jupyterthemes

In [3]:
# Adjust to Dark Theme
#jt -t chesterish

## Load Libraries

In [4]:
#pip install --user --upgrade pandas


In [5]:
# Install a pip package in the current Jupyter kernel
import sys
#!{sys.executable} -m pip install schedule

In [6]:
# Import libraries
#import modin.pandas as pd
import pandas as pd
import numpy as np
import os
import pathlib
import datetime as dt
import time
import os
#import re

## Set Options

In [7]:
# Enable display of all columns for dataframes with many variables
pd.set_option('display.max_columns', None)

## Set Up Core Directories

In [8]:
# Check current directory location
cwd = os.getcwd()
cwd

'C:\\Users\\cftfda01\\Documents\\SBST Train IAMS Project\\scripts'

In [9]:
# Define root file directory folder where the files are being stored
#os.chdir(cwd + alarmLoc)
os.chdir(os.path.dirname(os.getcwd()) + '\\alarm-event-logs')

# Check current directory location
cwd = os.getcwd()

# Check directory location
cwd

'C:\\Users\\cftfda01\\Documents\\SBST Train IAMS Project\\alarm-event-logs'

In [10]:
# Create Directory for Output Files Generated
if not os.path.exists('dataCleaned'):
    os.makedirs('dataCleaned')
    
# Inspect files in directory
fileList = os.listdir()
fileList

['dataCleaned',
 'desktop.ini',
 'dummyLog',
 'Original Sample from 27 Oct 2020 (simplified)',
 'Repair Logs',
 'Sample from 27 Oct 2020 (OG).zip',
 'sample_data_ats',
 'sample_data_ats.zip',
 'sample_data_cms',
 'sample_data_cms.zip',
 'sample_data_ecs',
 'sample_data_ecs.zip',
 'sample_data_HISevent',
 'testLog']

In [11]:
# Location of Alarm and Normal Event Files
HISeventLoc = '\\sample_data_HISevent\\B001'

# Set variables
prefix = 'HIS' 
batch = 'B001-'
serverEnv = 'Assorted-'

## Access Files to be Processed

In [12]:
# Define root file directory folder where the files are being stored
os.chdir(cwd + HISeventLoc)
#os.chdir(cwd + eventLoc)
# Check directory location
os.getcwd()

'C:\\Users\\cftfda01\\Documents\\SBST Train IAMS Project\\alarm-event-logs\\sample_data_HISevent\\B001'

In [13]:
# Get the list of all files in directory tree at given path
fileList = list()
for (dirpath, dirnames, filenames) in os.walk(os.getcwd()):
    fileList += [os.path.join(dirpath, file) for file in filenames] # use this if you want to append full URL
    #fileList += filenames
    
# Inspect data
len(fileList)

5

In [14]:
inputFile = fileList[0]
inputFile

'C:\\Users\\cftfda01\\Documents\\SBST Train IAMS Project\\alarm-event-logs\\sample_data_HISevent\\B001\\HISEVENT_2020_08_24_00_00_00'

# Process Log Files

In [15]:
# read in file
file = open(inputFile, 'r')
# Convert file contents to a list
fileContents = list(file)
# Close file
file.close()
                    
# Load File Data as a Dataframe
df = pd.DataFrame(fileContents,columns=['rawData'])

# Drop non-relevant rows and reset index
df = df.drop([0,1], axis=0).reset_index().drop(["index"], axis=1)
#df = pd.read_csv(testFile, sep = ";", skiprows = [1])

df.head()

Unnamed: 0,rawData
0,97496;NEDSMS;0;0;2;2147483647;10;3;24/08/2020 ...
1,97458;NEDSMS;0;0;2;2147483647;12;27;24/08/2020...
2,97472;NEDSMS;1;0;2;2147483647;12;27;24/08/2020...
3,97066;NEDSMS;1;0;2;2147483647;12;27;24/08/2020...
4,97223;NEDSMS;0;0;2;2147483647;12;27;24/08/2020...


In [16]:
# Define Header Names
headerList_core = [
                "ALARMID",
                "ENVIRONMENT",
                "VALUE",
                "ACKREQUIRED",
                "SEVERITY",
                "EQUIPMENTCLASS",
                "FUNCTIONALCAT",
                "GEOGRAPHICALCAT",
                "DATEANDTIME",
                "EQUIPMENTNAME",
                "ASSETNAME",
                "MESSAGE",
                "STATUS",
                "GROUP1",
                "GROUP2",
                "FORMAT",
                "DSSEVENTTYPE",
                "OPER",
                "UnknownVariable"
            ]
df = df["rawData"].str.split(pat=";", n=18, expand=True).rename(columns={0: headerList_core[0],
                                                                          1: headerList_core[1],
                                                                          2: headerList_core[2],
                                                                          3: headerList_core[3],
                                                                          4: headerList_core[4],
                                                                          5: headerList_core[5],
                                                                          6: headerList_core[6],
                                                                          7: headerList_core[7],
                                                                          8: headerList_core[8],
                                                                          9: headerList_core[9],
                                                                         10: headerList_core[10],
                                                                         11: headerList_core[11],
                                                                         12: headerList_core[12],
                                                                         13: headerList_core[13],
                                                                         14: headerList_core[14],
                                                                         15: headerList_core[15],
                                                                         16: headerList_core[16],
                                                                         17: headerList_core[17],
                                                                         18: headerList_core[18]})
del df["UnknownVariable"]

In [17]:
df.head()

Unnamed: 0,ALARMID,ENVIRONMENT,VALUE,ACKREQUIRED,SEVERITY,EQUIPMENTCLASS,FUNCTIONALCAT,GEOGRAPHICALCAT,DATEANDTIME,EQUIPMENTNAME,ASSETNAME,MESSAGE,STATUS,GROUP1,GROUP2,FORMAT,DSSEVENTTYPE,OPER
0,97496,NEDSMS,0,0,2,2147483647,10,3,24/08/2020 00:00:00,:TRAD:DMS__0001:ATC__0010,SIG/NED/1133/ATCS0013,NED ATC Sector 10: Gp A BCU link 3,OK,3211313,3232000,2147483647,,KHK
1,97458,NEDSMS,0,0,2,2147483647,12,27,24/08/2020 00:00:00,:T36:TRTR:HATR2:CADT,EMU/172/CAR/XXXXXXXX,Train 036 Car 072 DT: Emergency Damper Summary...,NORMAL & CLOSED,49,0,2147483647,,KHK
2,97472,NEDSMS,1,0,2,2147483647,12,27,24/08/2020 00:00:00,:T06:TRTR:HATR2,EMU/006/TRN/XXXXXXXX,Train 006 Car 012: DT ATO/ATP Status,NOT APPLICABLE,49,0,2147483647,,KHK
3,97066,NEDSMS,1,0,2,2147483647,12,27,24/08/2020 00:00:00,:T34:TRTR:HATR2,EMU/034/TRN/XXXXXXXX,Train 034 Car 068: DT ATO/ATP Status,NOT APPLICABLE,3211313,3232000,2147483647,,KHK
4,97223,NEDSMS,0,0,2,2147483647,12,27,24/08/2020 00:00:00,:T06:TRTR:HATR2,EMU/006/TRN/XXXXXXXX,Train 006 Car 012: DT External CCTV Camera Status,NORMAL,49,0,2147483647,,KHK


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705074 entries, 0 to 705073
Data columns (total 18 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   ALARMID          705074 non-null  object
 1   ENVIRONMENT      705074 non-null  object
 2   VALUE            705074 non-null  object
 3   ACKREQUIRED      705074 non-null  object
 4   SEVERITY         705074 non-null  object
 5   EQUIPMENTCLASS   705074 non-null  object
 6   FUNCTIONALCAT    705074 non-null  object
 7   GEOGRAPHICALCAT  705074 non-null  object
 8   DATEANDTIME      705074 non-null  object
 9   EQUIPMENTNAME    705074 non-null  object
 10  ASSETNAME        705074 non-null  object
 11  MESSAGE          705074 non-null  object
 12  STATUS           705074 non-null  object
 13  GROUP1           705074 non-null  object
 14  GROUP2           705074 non-null  object
 15  FORMAT           705074 non-null  object
 16  DSSEVENTTYPE     705074 non-null  object
 17  OPER      

In [19]:
# Clean up exception cases
df.loc[df["EQUIPMENTNAME"].str.contains("\n", na = False, regex = False), "EQUIPMENTNAME"] = ""
df["EQUIPMENTNAME"] = df["EQUIPMENTNAME"].str.replace("", "", regex = False)
df["ASSETNAME"] = df["ASSETNAME"].str.replace("", "", regex = False)
df["MESSAGE"] = df["MESSAGE"].str.replace("", "", regex = False)
df["EQUIPMENTNAME"] = df["EQUIPMENTNAME"].str.replace("?", "", regex = False)
df["ASSETNAME"] = df["ASSETNAME"].str.replace("?", "", regex = False)
df["MESSAGE"] = df["MESSAGE"].str.replace("?", "", regex = False)

df.loc[df["DATEANDTIME"] == "SKY", ["DATEANDTIME", "OPER"]] = None, "SKY"
df.loc[df["ACKREQUIRED"] == 1, "ACKREQUIRED"] = True
df.loc[df["ACKREQUIRED"] == 0, "ACKREQUIRED"] = False
df.loc[(df["OPER"] == "\x18\x1a\x11") | 
       (df["OPER"] == "(^+") | 
       (df["OPER"] == "\x18*V") | 
       (df["OPER"] == "\x18\x1aM") |
       (df["OPER"] == "\x18Z\x0c") |
       (df["OPER"] == "\x18JI") |
       (df["OPER"] == "\x18\n") |
       (df["OPER"] == "\x18j\x14") |
       (df["OPER"] == "\x18:H"), "OPER"] = None

# Clean up dataframe of blank values
#df.loc[df["ALARMID"] == "", "ALARMID"] = None
#df.loc[df["GEOGRAPHICALCAT"] == "", "GEOGRAPHICALCAT"] = None
#df.loc[df["EQUIPMENTCLASS"] == "", "EQUIPMENTCLASS"] = None
#df.loc[df["EQUIPMENTNAME"] == "", "EQUIPMENTNAME"] = None
#df.loc[df["STATUS"] == "", "STATUS"] = None
#df.loc[df["DSSEVENTTYPE"] == "", "DSSEVENTTYPE"] = None
#df.loc[df["OPER"] == "", "OPER"] = None

# Delete corrupted rows
df = df.drop(df[df["ENVIRONMENT"].str.contains("TRACTION", na = False, regex = False) | 
                (df["ENVIRONMENT"] == "Executed")].index)
df = df.drop(df[(df["ENVIRONMENT"] == "\n")].index)

# Clean up partially corrupted rows
df.loc[(df["GROUP1"] == "NORMAL"),
      ["EQUIPMENTNAME",
       "ASSETNAME",
       "MESSAGE",
       "STATUS",
       "GROUP1",
       "GROUP2",
       "FORMAT",
       "DSSEVENTTYPE" 
      ]] = df.iloc [::, 9:17].shift(periods=-1, axis="columns")
df.loc[(df["GROUP1"] == "REQUESTED") & 
       (df["MESSAGE"].str.contains("TRACTION", na = False, regex = False)),
      "EQUIPMENTNAME"] = df["EQUIPMENTNAME"] + df["ASSETNAME"]
df.loc[(df["GROUP1"] == "REQUESTED") & 
       (df["MESSAGE"].str.contains("TRACTION", na = False, regex = False)),
      ["ASSETNAME", 
       "MESSAGE",
       "STATUS",
       "GROUP1",
       "GROUP2",
       "FORMAT",
       "DSSEVENTTYPE" 
      ]] = df.iloc [::, 10:17].shift(periods=-1, axis="columns")

# Remove trailing non-printable characters
df["ALARMID"] = df["ALARMID"].str.strip()
df["EQUIPMENTNAME"] = df["EQUIPMENTNAME"].str.strip()
df["MESSAGE"] = df["MESSAGE"].str.strip()
df["ASSETNAME"] = df["ASSETNAME"].str.strip()

In [20]:
locNamesList = {
                'NED', #001
                'FRP', #002
                'SKG', #003
                'HGN', #004
                'KVN', #005
                'SER', #006
                'HBF', #007
                'DBG', #008
                'OTP', #009
                'CNT', #010
                'LTI', #011
                'CQY', #012
                'BGK', #013
                'OCC', #014
                'WLH', #015
                'PTP', #016
                'BNK', #017
                'PGL', #018
                'TUNNEL', #019
                'Sector', #020
                'Concourse', #021
                'Mezzaninne', #022
                'Mid-Landing Entrance', #023
                'AL', #024
                'Dirty Area', #025
                'IAP', #026
                '1st Storey', #027
                '2nd Storey', #028
                '3rd Storey', #029
                'B1', #030
                'B2', #031
                'B3', #032
                'Entrance', #033
                'Mid Landing', #034
                'Mid-Landing', #035
                'Subway', #036
                'Underpass Link', #037
                "Underpass To EXT'G  STN", #038
                "1st", #039 NEW UPDATE
                "2nd", #040 NEW UPDATE
                "SUBLOCATIONN", #041 NEW UPDATE
                "SUBLOCATIONS", #042 NEW UPDATE
                "North End", #043 NEW UPDATE
                "South End", #044 NEW UPDATE
                "South Adjacent", #045 NEW UPDATE
                "North Adjacent", #046 NEW UPDATE
                "Mezzanine", #047 NEW UPDATE
                "Linkway", #048 NEW UPDATE
                "Smoke Free Lobby", #049 NEW UPDATE
                "Storey", #050 NEW UPDATE
                "Underpass to EXT'G STN", #051 NEW UPDATE
                "-SUBLOCATION", #052 NEW UPDATE
                "SUBLOCATION-" #053 NEW UPDATE
                
                }


locNamesVal = [
            '', #001
            '', #002
            '', #003
            '', #004
            '', #005
            '', #006
            '', #007
            '', #008
            '', #009
            '', #010
            '', #011
            '', #012
            '', #013
            '', #014
            '', #015
            '', #016
            '', #017
            '', #018
            '', #019
            '', #020
            'SUBLOCATION', #021
            'SUBLOCATION', #022
            'SUBLOCATION', #023
            'SUBLOCATION', #024
            'SUBLOCATION', #025
            'SUBLOCATION', #026
            'SUBLOCATION', #027
            'SUBLOCATION', #028
            'SUBLOCATION', #029
            '', #030
            '', #031
            '', #032
            'SUBLOCATION', #033
            '', #034
            'SUBLOCATION', #035
            'SUBLOCATION', #036
            'SUBLOCATION', #037
            'SUBLOCATION', #038
            "", #039 NEW UPDATE
            "", #040 NEW UPDATE
            "SUBLOCATION", #041 NEW UPDATE
            "SUBLOCATION", #042 NEW UPDATE
            "", #043 NEW UPDATE
            "", #044 NEW UPDATE
            "", #045 NEW UPDATE
            "", #046 NEW UPDATE
            "SUBLOCATION", #047 NEW UPDATE
            "SUBLOCATION", #048 NEW UPDATE
            "SUBLOCATION", #049 NEW UPDATE
            "SUBLOCATION", #050 NEW UPDATE
            "SUBLOCATION", #051 NEW UPDATE
            "", #052 NEW UPDATE
            "" #053 NEW UPDATE
            ]

# Create fields for "ASSET_DESC_CAT" and "EVENT_DESC_CAT" # NEW UPDATE FIX
df[["ASSET_DESC_CAT", "EVENT_DESC_CAT"]] = df["MESSAGE"].str.split(pat = ": ", expand=True, n = 1)   
df["ASSET_DESC_CAT"] = df["ASSET_DESC_CAT"].str.strip() # Remove leading and trailing whitespaces
df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.strip() # Remove leading and trailing whitespaces
try:
    df.loc[df["EVENT_DESC_CAT"].isna(), "EVENT_DESC_CAT"] = df["ASSET_DESC_CAT"]
    df.loc[df["EVENT_DESC_CAT"] == df["ASSET_DESC_CAT"], "ASSET_DESC_CAT"] = np.nan
except:
    pass

# Remove Location Names    
try: # Error catch if the entire column is empty
    #df= df.replace({"ASSET_DESC_CAT": locNames}, regex = True) # not compatible with modin; slower than list method
    df["ASSET_DESC_CAT"] = df["ASSET_DESC_CAT"].replace(regex = locNamesList, value = locNamesVal)
except:
    pass


###############################################################
# Get Asset Description Category (Remove Numbers)
# Remove Numbers
df['ASSET_DESC_CAT'] = df['ASSET_DESC_CAT'].str.replace(r'\d+', '', regex = True)

# Account for exceptions to interpolate ASSET_DESC_CAT based on ASSET_DESCRIPTION

df['ASSET_DESC_CAT'].str.replace(" kV", "22 kV", regex = False) # NEW UPDATE FIX
df['ASSET_DESC_CAT'].str.replace("at KV SW", "at 22 kV SW", regex = False) # NEW UPDATE FIX
df['ASSET_DESC_CAT'].str.replace("DC  V", "DC 1500 V", regex = False) # NEW UPDATE FIX




###############################################################
# Get Asset Description Category (Remove Redundant White Spaces)
# Remove redundant white spaces    
df["ASSET_DESC_CAT"] = df["ASSET_DESC_CAT"].str.strip().str.replace(r'\s+', ' ', regex = True)  

###############################################################
#Get Asset Description Category (Account for Misc Exceptions)

# Account for exceptions # NEW UPDATE
df['ASSET_DESC_CAT'] = df['ASSET_DESC_CAT'].str.replace('SUBLOCATION SUBLOCATION', 'SUBLOCATION', regex = False)
df['ASSET_DESC_CAT'] = df['ASSET_DESC_CAT'].str.replace('( ', '(', regex = False)
df["ASSET_DESC_CAT"] = df["ASSET_DESC_CAT"].str.replace(r'\A(: )','', regex = True)
df["ASSET_DESC_CAT"] = df["ASSET_DESC_CAT"].str.replace('Cameras','Camera', case = False, regex = False)

try:
    df.loc[df["MESSAGE"].str.contains("CCTV Controller Power Supply", na = False, regex = False), "ASSET_DESC_CAT"] = "CCTV Controller Power Supply"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("CBN Access Multiplexer", na = False, regex = False), "ASSET_DESC_CAT"] = "CBN Access Multiplexer"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("CI Gas Panel", case = False, na = False, regex = False), "ASSET_DESC_CAT"] = "CI Gas Panel"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("RI Gas Panel", case = False, na = False, regex = False), "ASSET_DESC_CAT"] = "CI Gas Panel"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("CROSS-CONNECT ACCESS Multiplexer", na = False, regex = False), "ASSET_DESC_CAT"] = "CROSS-CONNECT ACCESS Multiplexer"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("Electrically Supervised Valve", na = False, regex = False), "ASSET_DESC_CAT"] = "Electrically Supervised Valve"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("Hosereel Pump", na = False, regex = False), "ASSET_DESC_CAT"] = "Hosereel Pump"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("Level Fire Shutter", na = False, regex = False), "ASSET_DESC_CAT"] = "Level Fire Shutter"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("Level Roller Shutter", na = False, regex = False), "ASSET_DESC_CAT"] = "Level Roller Shutter"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("Main Fire Alarm Panel", na = False, regex = False), "ASSET_DESC_CAT"] = "Main Fire Alarm Panel"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("Traffic Direction", case = False, na = False, regex = False), "ASSET_DESC_CAT"] = "Traffic Direction"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("Tunnel LTG Ctrl Panel", case = False, na = False, regex = False), "ASSET_DESC_CAT"] = "Tunnel LTG Ctrl Panel"
except:
    pass
try:
    df.loc[df["ASSET_DESC_CAT"].str.contains("Zone -", case = False, na = False, regex = False), "ASSET_DESC_CAT"] = "ZONE SUBLOCATION"
except:
    pass


# Remove additional locations
df["ASSET_DESC_CAT"] = df["ASSET_DESC_CAT"].str.split(" at ", n = 1, expand = True)[0]
df["ASSET_DESC_CAT"] = df["ASSET_DESC_CAT"].str.split(" for ", n = 1, expand = True)[0]
df['ASSET_DESC_CAT'] = df['ASSET_DESC_CAT'].str.replace('SUBLOCATION-SUBLOCATION', 'SUBLOCATION', regex = False) # New Update
df['ASSET_DESC_CAT'] = df['ASSET_DESC_CAT'].str.replace('SUBLOCATION-', 'SUBLOCATION', regex = False) # New Update
df['ASSET_DESC_CAT'] = df['ASSET_DESC_CAT'].str.replace('-SUBLOCATION', 'SUBLOCATION', regex = False) # New Update
df['ASSET_DESC_CAT'] = df['ASSET_DESC_CAT'].str.replace(r'( at)$', '', case = False, regex = True) # New Update
df['ASSET_DESC_CAT'] = df['ASSET_DESC_CAT'].str.replace(r'( for)$', '', case = False, regex = True) # New Update
df['ASSET_DESC_CAT'] = df['ASSET_DESC_CAT'].str.replace(r'^(:)', '', regex = True) # New Update



In [21]:
df.head()

Unnamed: 0,ALARMID,ENVIRONMENT,VALUE,ACKREQUIRED,SEVERITY,EQUIPMENTCLASS,FUNCTIONALCAT,GEOGRAPHICALCAT,DATEANDTIME,EQUIPMENTNAME,ASSETNAME,MESSAGE,STATUS,GROUP1,GROUP2,FORMAT,DSSEVENTTYPE,OPER,ASSET_DESC_CAT,EVENT_DESC_CAT
0,97496,NEDSMS,0,0,2,2147483647,10,3,24/08/2020 00:00:00,:TRAD:DMS__0001:ATC__0010,SIG/NED/1133/ATCS0013,NED ATC Sector 10: Gp A BCU link 3,OK,3211313,3232000,2147483647,,KHK,SUBLOCATION ATC,Gp A BCU link 3
1,97458,NEDSMS,0,0,2,2147483647,12,27,24/08/2020 00:00:00,:T36:TRTR:HATR2:CADT,EMU/172/CAR/XXXXXXXX,Train 036 Car 072 DT: Emergency Damper Summary...,NORMAL & CLOSED,49,0,2147483647,,KHK,Train Car DT,Emergency Damper Summary Status
2,97472,NEDSMS,1,0,2,2147483647,12,27,24/08/2020 00:00:00,:T06:TRTR:HATR2,EMU/006/TRN/XXXXXXXX,Train 006 Car 012: DT ATO/ATP Status,NOT APPLICABLE,49,0,2147483647,,KHK,Train Car,DT ATO/ATP Status
3,97066,NEDSMS,1,0,2,2147483647,12,27,24/08/2020 00:00:00,:T34:TRTR:HATR2,EMU/034/TRN/XXXXXXXX,Train 034 Car 068: DT ATO/ATP Status,NOT APPLICABLE,3211313,3232000,2147483647,,KHK,Train Car,DT ATO/ATP Status
4,97223,NEDSMS,0,0,2,2147483647,12,27,24/08/2020 00:00:00,:T06:TRTR:HATR2,EMU/006/TRN/XXXXXXXX,Train 006 Car 012: DT External CCTV Camera Status,NORMAL,49,0,2147483647,,KHK,Train Car,DT External CCTV Camera Status


In [22]:
###############################################################

# Create "EVENT_DESCRIPTION" field
#df["EVENT_DESC_CAT"] = df["MESSAGE"].copy()  

# Remove Location Names      
try: # Error catch if the entire column is empty
    #df = df.replace({"EVENT_DESC_CAT": locNames}, regex = True) # Does not work with Modin; slower than list method
    df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].replace(regex = locNamesList, value = locNamesVal)
except:
    pass


###############################################################
# Get Event Description Category (Remove Numbers)
# Remove Numbers
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'\d+', '', regex = True)

# Fix entries which still need to preserve number info
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(" kV", "22 kV", regex = False) # NEW UPDATE FIX
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace("at KV SW", "at 22 kV SW", regex = False) # NEW UPDATE FIX
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace("DC  V", "DC 1500 V", regex = False) # NEW UPDATE FIX

try: # New Update
    df.loc[df["EVENT_DESC_CAT"] == "MP  VDC Status", "EVENT_DESC_CAT"] = "MP 1500 VDC Status"
except:
    pass

###############################################################
# Get Event Description Category (Remove Redundant White Spaces)
# Remove redundant white spaces    
df["EVENT_DESC_CAT"] = df["EVENT_DESC_CAT"].str.strip().str.replace(r'\s+', ' ', regex = True)

In [23]:
df.head()

Unnamed: 0,ALARMID,ENVIRONMENT,VALUE,ACKREQUIRED,SEVERITY,EQUIPMENTCLASS,FUNCTIONALCAT,GEOGRAPHICALCAT,DATEANDTIME,EQUIPMENTNAME,ASSETNAME,MESSAGE,STATUS,GROUP1,GROUP2,FORMAT,DSSEVENTTYPE,OPER,ASSET_DESC_CAT,EVENT_DESC_CAT
0,97496,NEDSMS,0,0,2,2147483647,10,3,24/08/2020 00:00:00,:TRAD:DMS__0001:ATC__0010,SIG/NED/1133/ATCS0013,NED ATC Sector 10: Gp A BCU link 3,OK,3211313,3232000,2147483647,,KHK,SUBLOCATION ATC,Gp A BCU link
1,97458,NEDSMS,0,0,2,2147483647,12,27,24/08/2020 00:00:00,:T36:TRTR:HATR2:CADT,EMU/172/CAR/XXXXXXXX,Train 036 Car 072 DT: Emergency Damper Summary...,NORMAL & CLOSED,49,0,2147483647,,KHK,Train Car DT,Emergency Damper Summary Status
2,97472,NEDSMS,1,0,2,2147483647,12,27,24/08/2020 00:00:00,:T06:TRTR:HATR2,EMU/006/TRN/XXXXXXXX,Train 006 Car 012: DT ATO/ATP Status,NOT APPLICABLE,49,0,2147483647,,KHK,Train Car,DT ATO/ATP Status
3,97066,NEDSMS,1,0,2,2147483647,12,27,24/08/2020 00:00:00,:T34:TRTR:HATR2,EMU/034/TRN/XXXXXXXX,Train 034 Car 068: DT ATO/ATP Status,NOT APPLICABLE,3211313,3232000,2147483647,,KHK,Train Car,DT ATO/ATP Status
4,97223,NEDSMS,0,0,2,2147483647,12,27,24/08/2020 00:00:00,:T06:TRTR:HATR2,EMU/006/TRN/XXXXXXXX,Train 006 Car 012: DT External CCTV Camera Status,NORMAL,49,0,2147483647,,KHK,Train Car,DT External CCTV Camera Status


In [24]:


###############################################################
# Get Event Description Category (Account for Misc Exceptions)
# Account for Exceptions
try:
    df.loc[(df['EVENT_DESC_CAT'].str.contains("logged", na = False, regex = False)) & 
           (df['EVENT_DESC_CAT'].str.contains("Operator", na = False, regex = False)) &
           (df['EVENT_DESC_CAT'].str.contains("NelVisu", na = False, regex = False)),
           "EVENT_DESC_CAT"] = "Operator Logged In/Out of NelVisu"
except:
    pass


df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r' /, /...', '', regex = False)
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'__:', '', regex = False) 
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'_:', '', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'([.]+){2}', '', regex = True) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'(_+){2}', '', regex = True) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'::', '', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r' : ', ': ', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'@n', '', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r' ,', ',', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'< >', '', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'-:', ':', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'^(:)', '', regex = True) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'( -)\S', ' - ', regex = True) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r' )', ')', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'()', '', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r' ump Rm', ' Pump Rm', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r' latform', ' Platform', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r'( )+', ' ', regex = True) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace('SUBLOCATION-SUBLOCATION', 'SUBLOCATION', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace('SUBLOCATION-', 'SUBLOCATION', regex = False) # New Update
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace('-SUBLOCATION', 'SUBLOCATION', regex = False) # New Update
#df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace()

# New Update
try:
    df.loc[(df['EVENT_DESC_CAT'].str.contains(" at ", na = False, regex = False)) & 
           (df['EVENT_DESC_CAT'].str.contains(": ", na = False, regex = False)), "EVENT_DESC_CAT"] = df['EVENT_DESC_CAT'].str.split("at", 1, expand = True)[0] + ": " + df['EVENT_DESC_CAT'].str.split(": ", 1, expand = True)[1]
    df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r' : ', ': ', regex = False) # New Update
except:
    pass

# New Update
try:
    df.loc[(df['EVENT_DESC_CAT'].str.contains("Gws", na = False, regex = False)) & 
           (df['EVENT_DESC_CAT'].str.contains("msg in", na = False, regex = False)), "EVENT_DESC_CAT"] = df['EVENT_DESC_CAT'].str.split("msg in", 1, expand = True)[0] + "msg in SUBLOCATION"
except:
    pass

# New Update
try:
    df.loc[(df['EVENT_DESC_CAT'].str.contains("Gws", na = False, regex = False)) & 
           (df['EVENT_DESC_CAT'].str.contains("bcast in", na = False, regex = False)), "EVENT_DESC_CAT"] = df['EVENT_DESC_CAT'].str.split("bcast in", 1, expand = True)[0] + "bcast in SUBLOCATION"
except:
    pass

# New Update
try:
    df.loc[(df['EVENT_DESC_CAT'].str.contains("Train", na = False, regex = False)) & 
           (df['EVENT_DESC_CAT'].str.contains("Car", na = False, regex = False)) &
           (df['EVENT_DESC_CAT'].str.contains("assigned", na = False, regex = False)) &
           (df['EVENT_DESC_CAT'].str.contains("Manoeuvre", na = False, regex = False)), "EVENT_DESC_CAT"] = "Manoeuvre assigned to Train Car"
except:
    pass

# New Update
try:
    df.loc[(df['EVENT_DESC_CAT'].str.contains("Train", na = False, regex = False)) & 
           (df['EVENT_DESC_CAT'].str.contains("Car", na = False, regex = False)) &
           (df['EVENT_DESC_CAT'].str.contains("abandoned", na = False, regex = False)) &
           (df['EVENT_DESC_CAT'].str.contains("Manoeuvre", na = False, regex = False)), "EVENT_DESC_CAT"] = "Manoeuvre abandoned by Train Car"
except:
    pass

# New Update
try:
    df.loc[(df['MESSAGE'].str.contains("Display of Free-Text", na = False, regex = False)), "EVENT_DESC_CAT"] = "Display of Free-Text"
except:
    pass

# New Update
try:
    df.loc[(df['EVENT_DESC_CAT'].str.contains("DVA version mismatch", na = False, regex = False)), "EVENT_DESC_CAT"] = "DVA version mismatch"
except:
    pass

# New Update
try:
    df.loc[(df['EVENT_DESC_CAT'].str.contains("Automatic hand-over", na = False, regex = False)), "EVENT_DESC_CAT"] = "Automatic hand-over"
except:
    pass

# New Update
try:
    df.loc[(df['EVENT_DESC_CAT'].str.contains("Automatic Hold Applied", na = False, regex = False)), "EVENT_DESC_CAT"] = "Automatic Hold Applied"
except:
    pass

# New Update
try:
    df.loc[(df['EVENT_DESC_CAT'].str.contains("Communication between", na = False, regex = False)), "EVENT_DESC_CAT"] = "Communication between Nodes"
except:
    pass

df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.strip() # New Update

###############################################################
# Extract Train Information
# Get Train ID (# NEW UPDATE)
df[["TrainID", "TrainID_temp"]] = df["MESSAGE"].str.extract(r"TR___(\d+)|Train (\d+)[ :]") 
try:
    df.loc[df["TrainID"].isna(), "TrainID"] =  df["TrainID_temp"]
except:
    pass

# Get CarID and ServiceID (NEW UPDATE)
df[["CarID", "ServiceID", "CarID_temp"]] = df["MESSAGE"].str.extract(r"cars (\d+)/(\d+)|Car (\d+)[ :]")
try:
    df.loc[df["CarID"].isna(), "CarID"] =  df["CarID_temp"]
except:
    pass

del df["TrainID_temp"], df["CarID_temp"] # NEW UPDATE

###############################################################
# Extract Asset Information

df["AssetClass"] = df["ASSETNAME"] 

# Remove Location Names  
try: # Error catch if entire column is empty
    #df = df.replace({"AssetClass": locNames}, regex = True) # Does not work with Modin; slower than list method
    df["AssetClass"] = df["AssetClass"].replace(regex = locNamesList, value = "")
except:
    pass

# Remove Numbers
df['AssetClass'] = df['AssetClass'].str.replace(r'\d+', '', regex = True)

# Remove Exceptions
try:
    df.loc[df['AssetClass'].str.contains("TRACTION", regex = False), 'AssetClass'] = "TRACTION/TRACTION"
except:
    pass

try:
    df.loc[(df['AssetClass'].str.contains("TUNNEL", regex = False)) & 
       (df['AssetClass'].str.contains("LIGHT", regex = False)), 'AssetClass'] = "TUNNEL/LIGHT"
except:
    pass

# Clean up string prior to delimiting
df['AssetClass'] = df['AssetClass'].str.replace(r'\A(_)', '', regex = True)
df['AssetClass'] = df['AssetClass'].str.replace(r'(_)\Z', '', regex = True)
df['AssetClass'] = df['AssetClass'].str.replace(r'_+', '/', regex = True)

# Get AssetSubClass
#df['AssetSubClass'] = df['AssetClass'].str.split("/").str[-1] # this method doesn't work with modin
df['AssetSubClass'] = df['AssetClass'].str.extract(r'/(\w+)$')

# Get AssetClass
#df['AssetClass'] = df['AssetClass'].str.split("/").str[0] # this method doesn't work with modin
df['AssetClass'] = df['AssetClass'].str.extract(r'(\w+)/')[0]

In [25]:
df.head()

Unnamed: 0,ALARMID,ENVIRONMENT,VALUE,ACKREQUIRED,SEVERITY,EQUIPMENTCLASS,FUNCTIONALCAT,GEOGRAPHICALCAT,DATEANDTIME,EQUIPMENTNAME,ASSETNAME,MESSAGE,STATUS,GROUP1,GROUP2,FORMAT,DSSEVENTTYPE,OPER,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass
0,97496,NEDSMS,0,0,2,2147483647,10,3,24/08/2020 00:00:00,:TRAD:DMS__0001:ATC__0010,SIG/NED/1133/ATCS0013,NED ATC Sector 10: Gp A BCU link 3,OK,3211313,3232000,2147483647,,KHK,SUBLOCATION ATC,Gp A BCU link,,,,SIG,ATCS
1,97458,NEDSMS,0,0,2,2147483647,12,27,24/08/2020 00:00:00,:T36:TRTR:HATR2:CADT,EMU/172/CAR/XXXXXXXX,Train 036 Car 072 DT: Emergency Damper Summary...,NORMAL & CLOSED,49,0,2147483647,,KHK,Train Car DT,Emergency Damper Summary Status,36.0,72.0,,EMU,XXXXXXXX
2,97472,NEDSMS,1,0,2,2147483647,12,27,24/08/2020 00:00:00,:T06:TRTR:HATR2,EMU/006/TRN/XXXXXXXX,Train 006 Car 012: DT ATO/ATP Status,NOT APPLICABLE,49,0,2147483647,,KHK,Train Car,DT ATO/ATP Status,6.0,12.0,,EMU,XXXXXXXX
3,97066,NEDSMS,1,0,2,2147483647,12,27,24/08/2020 00:00:00,:T34:TRTR:HATR2,EMU/034/TRN/XXXXXXXX,Train 034 Car 068: DT ATO/ATP Status,NOT APPLICABLE,3211313,3232000,2147483647,,KHK,Train Car,DT ATO/ATP Status,34.0,68.0,,EMU,XXXXXXXX
4,97223,NEDSMS,0,0,2,2147483647,12,27,24/08/2020 00:00:00,:T06:TRTR:HATR2,EMU/006/TRN/XXXXXXXX,Train 006 Car 012: DT External CCTV Camera Status,NORMAL,49,0,2147483647,,KHK,Train Car,DT External CCTV Camera Status,6.0,12.0,,EMU,XXXXXXXX


In [26]:
# Standardise null values
df = df.replace("", np.nan).fillna(value=np.nan) #NEW UPDATE

# Convert data format
df["ALARMID"] = df["ALARMID"].astype("int")
df['DATEANDTIME'] = pd.to_datetime(df['DATEANDTIME'], dayfirst=True)
df["ACKREQUIRED"] = df["ACKREQUIRED"].astype("bool")
df["SEVERITY"] = df["SEVERITY"].astype("int")
df["EQUIPMENTCLASS"] = df["EQUIPMENTCLASS"].astype("int")
df["FUNCTIONALCAT"] = df["FUNCTIONALCAT"].astype("float")
df["GEOGRAPHICALCAT"] = df["GEOGRAPHICALCAT"].astype("float")
df["GROUP1"] = df["GROUP1"].astype("float")
df["GROUP2"] = df["GROUP2"].astype("float")
df["FORMAT"] = df["FORMAT"].astype("float")

# Infer datatypes of columns
df = df.infer_objects()

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 705074 entries, 0 to 705073
Data columns (total 25 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   ALARMID          705074 non-null  int32         
 1   ENVIRONMENT      705074 non-null  object        
 2   VALUE            705074 non-null  object        
 3   ACKREQUIRED      705074 non-null  bool          
 4   SEVERITY         705074 non-null  int32         
 5   EQUIPMENTCLASS   705074 non-null  int32         
 6   FUNCTIONALCAT    705074 non-null  float64       
 7   GEOGRAPHICALCAT  705074 non-null  float64       
 8   DATEANDTIME      705074 non-null  datetime64[ns]
 9   EQUIPMENTNAME    705035 non-null  object        
 10  ASSETNAME        705074 non-null  object        
 11  MESSAGE          705074 non-null  object        
 12  STATUS           682968 non-null  object        
 13  GROUP1           705074 non-null  float64       
 14  GROUP2           705

In [27]:
df.head()




Unnamed: 0,ALARMID,ENVIRONMENT,VALUE,ACKREQUIRED,SEVERITY,EQUIPMENTCLASS,FUNCTIONALCAT,GEOGRAPHICALCAT,DATEANDTIME,EQUIPMENTNAME,ASSETNAME,MESSAGE,STATUS,GROUP1,GROUP2,FORMAT,DSSEVENTTYPE,OPER,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass
0,97496,NEDSMS,0,True,2,2147483647,10.0,3.0,2020-08-24,:TRAD:DMS__0001:ATC__0010,SIG/NED/1133/ATCS0013,NED ATC Sector 10: Gp A BCU link 3,OK,3211313.0,3232000.0,2147484000.0,,KHK,SUBLOCATION ATC,Gp A BCU link,,,,SIG,ATCS
1,97458,NEDSMS,0,True,2,2147483647,12.0,27.0,2020-08-24,:T36:TRTR:HATR2:CADT,EMU/172/CAR/XXXXXXXX,Train 036 Car 072 DT: Emergency Damper Summary...,NORMAL & CLOSED,49.0,0.0,2147484000.0,,KHK,Train Car DT,Emergency Damper Summary Status,36.0,72.0,,EMU,XXXXXXXX
2,97472,NEDSMS,1,True,2,2147483647,12.0,27.0,2020-08-24,:T06:TRTR:HATR2,EMU/006/TRN/XXXXXXXX,Train 006 Car 012: DT ATO/ATP Status,NOT APPLICABLE,49.0,0.0,2147484000.0,,KHK,Train Car,DT ATO/ATP Status,6.0,12.0,,EMU,XXXXXXXX
3,97066,NEDSMS,1,True,2,2147483647,12.0,27.0,2020-08-24,:T34:TRTR:HATR2,EMU/034/TRN/XXXXXXXX,Train 034 Car 068: DT ATO/ATP Status,NOT APPLICABLE,3211313.0,3232000.0,2147484000.0,,KHK,Train Car,DT ATO/ATP Status,34.0,68.0,,EMU,XXXXXXXX
4,97223,NEDSMS,0,True,2,2147483647,12.0,27.0,2020-08-24,:T06:TRTR:HATR2,EMU/006/TRN/XXXXXXXX,Train 006 Car 012: DT External CCTV Camera Status,NORMAL,49.0,0.0,2147484000.0,,KHK,Train Car,DT External CCTV Camera Status,6.0,12.0,,EMU,XXXXXXXX


In [28]:
df.tail()

Unnamed: 0,ALARMID,ENVIRONMENT,VALUE,ACKREQUIRED,SEVERITY,EQUIPMENTCLASS,FUNCTIONALCAT,GEOGRAPHICALCAT,DATEANDTIME,EQUIPMENTNAME,ASSETNAME,MESSAGE,STATUS,GROUP1,GROUP2,FORMAT,DSSEVENTTYPE,OPER,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass
705069,688214,OCCCMS,0,True,3,2147483647,40.0,6.0,2020-08-24 23:59:59,:COMS:CCTS_0001:CAMS_0205,COM/CNT/B1/CAM17,CNT:205 AFG 3: Status,FAILURE,138739761.0,138749696.0,2147484000.0,,,SUBLOCATION: AFG,Status,,,,COM,CAM
705070,688217,OCCCMS,0,True,3,2147483647,40.0,6.0,2020-08-24 23:59:59,:COMS:CCTS_0001:CAMS_0205,COM/CNT/B1/CAM17,CNT:205 AFG 3: Status,NORMAL,49.0,0.0,2147484000.0,,,SUBLOCATION: AFG,Status,,,,COM,CAM
705071,688216,OCCCMS,0,True,3,2147483647,40.0,14.0,2020-08-24 23:59:59,:COMS:CCTS_0001:CAMS_0208,COM/SER/B1/CAM20,SER:208 LWY1: Status,NORMAL,49.0,0.0,2147484000.0,,,LWY,Status,,,,COM,CAM
705072,688213,OCCCMS,0,True,3,2147483647,40.0,18.0,2020-08-24 23:59:59,:COMS:CCTS_0001:CAMS_0106,COM/SKG/B1/CAM06,SKG:106 SB HWC: Status,FAILURE,49.0,0.0,2147484000.0,,,SUBLOCATION: SB HWC,Status,,,,COM,CAM
705073,688215,OCCCMS,0,True,3,2147483647,40.0,18.0,2020-08-24 23:59:59,:COMS:CCTS_0001:CAMS_0106,COM/SKG/B1/CAM06,SKG:106 SB HWC: Status,NORMAL,138739761.0,138749440.0,2147484000.0,,,SUBLOCATION: SB HWC,Status,,,,COM,CAM


In [29]:
cwd

'C:\\Users\\cftfda01\\Documents\\SBST Train IAMS Project\\alarm-event-logs'

In [30]:
os.path.basename(inputFile)

'HISEVENT_2020_08_24_00_00_00'

In [31]:
# Save File
df.to_csv('../CleanedOutput/' + prefix + '-cleaned-' + batch + serverEnv + os.path.basename(inputFile) + '.csv', index=False)
print(os.path.basename(inputFile) + " saved")


HISEVENT_2020_08_24_00_00_00 saved
