# Documentation

## Objective(s)
1. Ingest a partially processed message log and clean it up

# Initialisation

## Load Libraries

In [1]:
pip install --user --upgrade pandas


Requirement already up-to-date: pandas in c:\users\cftfda01\appdata\roaming\python\python38\site-packages (1.2.1)
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Install a pip package in the current Jupyter kernel
import sys
#!{sys.executable} -m pip install schedule

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import os
import pathlib
import datetime as dt
import time

## Set Options

In [4]:
# Enable display of all columns for dataframes with many variables
pd.set_option('display.max_columns', None)

## Set Up Core Directories

In [5]:
# Check current directory location
cwd = os.getcwd()
cwd

'C:\\Users\\cftfda01\\Documents\\SBST Train IAMS Project\\scripts'

In [6]:
# Define root file directory folder where the files are being stored
# The sibling 'alarm-event-logs' folder is located relative to the parent of the
# current working directory. os.path.join supplies the platform's separator
# instead of a hard-coded backslash.
os.chdir(os.path.join(os.path.dirname(os.getcwd()), 'alarm-event-logs'))

# Check current directory location
cwd = os.getcwd()

# Check directory location
cwd

'C:\\Users\\cftfda01\\Documents\\SBST Train IAMS Project\\alarm-event-logs'

In [7]:
# Create Directory for Output Files Generated
# exist_ok=True makes this a no-op when the folder is already present, so no
# separate existence check is needed before creating it.
os.makedirs('dataCleaned', exist_ok=True)

# Inspect files in directory
fileList = os.listdir()
fileList

['alarmLog',
 'dataCleaned',
 'dummyLog',
 'dummyLog - Holding',
 'eventLog',
 'Original Sample from 27 Oct 2020 (simplified)',
 'Sample from 27 Oct 2020 (OG).zip',
 'testLog']

In [8]:
# Location of Alarm and Normal Event Files
testLoc = '\\testLog'

## Access Files to be Processed

In [9]:
# Switch into the folder holding the files to be processed
# (the previously captured working directory followed by the testLoc suffix)
targetDir = cwd + testLoc
os.chdir(targetDir)


# Confirm where we ended up
os.getcwd()

'C:\\Users\\cftfda01\\Documents\\SBST Train IAMS Project\\alarm-event-logs\\testLog'

In [10]:
# Collect the bare file names found anywhere under the current working directory.
# Only names are kept (no directory prefix), in the order os.walk yields them.
fileList = [
    name
    for _, _, names in os.walk(os.getcwd())
    for name in names
]

# Inspect data
fileList

['cleaningTest.csv', 'Event_Msg_Extract_b-001.csv', 'Messagelog_Test-001b.csv']

# Start File Processing (Single File)

## Define List of Stopwords

In [11]:
# Define Key Location Names
# Each entry is a (text to find, replacement) pair; every code listed here is
# paired with an empty-string replacement.
locNames = tuple(
    (code, '')
    for code in (
        'NED', 'FRP', 'SKG', 'HGN', 'KVN', 'SER', 'HBF', 'DBG', 'OTP', 'CNT',
        'LTI', 'CQY', 'BGK', 'OCC', 'WLH', 'PTP', 'BNK', 'PGL', 'TUNNEL',
    )
)

## Process Alarm Files

### Ingest Partially Processed File

In [12]:
# Define File Name
# Pick the input by name instead of by list position: the listing order is not
# guaranteed, and this notebook also writes its own output (cleaningTest.csv)
# into the same folder, so a positional index can silently select another file.
fileName = 'Messagelog_Test-001b.csv'
if fileName not in fileList:
    raise FileNotFoundError(f"{fileName} not found in {os.getcwd()}")

In [13]:
# Load File
# Reads the CSV named by fileName (resolved against the current working
# directory) into df using the pandas defaults.
df = pd.read_csv(fileName)

# Check Output
# Prints the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3216 entries, 0 to 3215
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ASSET_ID_RAW       3216 non-null   object
 1   ASSET_DESCRIPTION  2683 non-null   object
 2   EVENT_DESCRIPTION  3216 non-null   object
 3   EVENT_STATUS       3215 non-null   object
 4   OPERATOR_INITIALS  845 non-null    object
dtypes: object(5)
memory usage: 125.8+ KB


In [14]:
# Check Output
df.head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
0,OCC_LENV_CMS_,CMS SCS Server - Environment OCCCMS,Environment 1 Status,HOT,
1,COM/SER/B1/CAM20,SER:208 LWY1,Status,FAILURE,
2,COM/BNK/B1/PABX01,PABX,Fan 1 Status,FAILURE,
3,COM/DBG/B2/CAM57,DBG:424 AFG 3,Status,FAILURE,
4,COM/SER/B1/CAM20,SER:208 LWY1,Status,NORMAL,


### Clean Up Data

#### GWS Data

In [15]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("msg")) & (df["EVENT_DESCRIPTION"].str.contains("Gws"))].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
1890,OCC_ANNO_0001,,Gws 5 starts DVA bcast msg 599 in cars 69/301,Executed,NBC
1891,OCC_ANNO_0001,,Gws 0 suspends DVA bcast msg 599 in cars 69/301,Executed,
1893,OCC_ANNO_0001,,Gws 5 stops DVA bcast msg 599 in cars 69/301,Executed,NBC
1894,OCC_ANNO_0001,,Gws 5 starts DVA bcast msg 599 in cars 51/330,Executed,NBC
1895,OCC_ANNO_0001,,Gws 0 suspends DVA bcast msg 599 in cars 51/330,Executed,


In [16]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("msg")) & (df["EVENT_DESCRIPTION"].str.contains("Gws"))].shape

(369, 5)

In [17]:
# Update for "GWS Broadcast"
# Rows whose event text contains both "msg" and "Gws" get the asset description
# "GWS Broadcast". The search terms are plain text (regex=False) and na=False
# keeps the mask strictly boolean even if a description is missing, so no
# blanket try/except is needed and real errors (e.g. a missing column) surface
# here rather than being silently skipped.
gwsMask = (
    df["EVENT_DESCRIPTION"].str.contains("msg", regex=False, na=False)
    & df["EVENT_DESCRIPTION"].str.contains("Gws", regex=False, na=False)
)
df.loc[gwsMask, "ASSET_DESCRIPTION"] = "GWS Broadcast"

# Inspect data
df.loc[gwsMask].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
1890,OCC_ANNO_0001,GWS Broadcast,Gws 5 starts DVA bcast msg 599 in cars 69/301,Executed,NBC
1891,OCC_ANNO_0001,GWS Broadcast,Gws 0 suspends DVA bcast msg 599 in cars 69/301,Executed,
1893,OCC_ANNO_0001,GWS Broadcast,Gws 5 stops DVA bcast msg 599 in cars 69/301,Executed,NBC
1894,OCC_ANNO_0001,GWS Broadcast,Gws 5 starts DVA bcast msg 599 in cars 51/330,Executed,NBC
1895,OCC_ANNO_0001,GWS Broadcast,Gws 0 suspends DVA bcast msg 599 in cars 51/330,Executed,


In [18]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("msg")) & (df["EVENT_DESCRIPTION"].str.contains("Gws"))].shape

(369, 5)

#### NelVisu Data

In [19]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("NelVisu"))].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
365,SCS/NED/1212/GWS07,,Operator 755 logged in as TSR1 on NelVisu1,SUCCEEDED,755
371,SCS/NED/1212/GWS07,,Operator 755 logged in as TSR1 on NelVisu0,SUCCEEDED,755
376,SCS/NED/1212/GWS07,,Operator 755 logged in as TSR1 on NelVisu2,SUCCEEDED,755
1115,SCS/NED/1212/GWS07,,Operator 755 logged out as TSR1 on NelVisu0,SUCCEEDED,755
1116,SCS/NED/1212/GWS07,,Operator 755 logged out as TSR1 on NelVisu2,SUCCEEDED,755


In [20]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("NelVisu"))].shape

(79, 5)

In [21]:
# Update for "NelVisu"
# Rows whose event text mentions "NelVisu" get the asset description "NelVisu".
# Plain-text matching with na=False yields a strictly boolean mask, so the
# assignment no longer needs a catch-all try/except that would hide failures.
nelVisuMask = df["EVENT_DESCRIPTION"].str.contains("NelVisu", regex=False, na=False)
df.loc[nelVisuMask, "ASSET_DESCRIPTION"] = "NelVisu"

# Inspect data
df.loc[nelVisuMask].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
365,SCS/NED/1212/GWS07,NelVisu,Operator 755 logged in as TSR1 on NelVisu1,SUCCEEDED,755
371,SCS/NED/1212/GWS07,NelVisu,Operator 755 logged in as TSR1 on NelVisu0,SUCCEEDED,755
376,SCS/NED/1212/GWS07,NelVisu,Operator 755 logged in as TSR1 on NelVisu2,SUCCEEDED,755
1115,SCS/NED/1212/GWS07,NelVisu,Operator 755 logged out as TSR1 on NelVisu0,SUCCEEDED,755
1116,SCS/NED/1212/GWS07,NelVisu,Operator 755 logged out as TSR1 on NelVisu2,SUCCEEDED,755


In [22]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("NelVisu"))].shape

(79, 5)

#### Train Radio

In [23]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("TR___")) &
       (df["EVENT_DESCRIPTION"].str.contains("radio"))].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
2459,OCC_CCTS_0001,,Free radio path for TR___0034,Executed,NBC
2460,OCC_CCTS_0001,,Free radio path for TR___0022,Executed,NBC
2461,OCC_CCTS_0001,,Free radio path for TR___0018,Executed,NBC
2462,OCC_CCTS_0001,,Free radio path for TR___0006,Executed,NBC
2551,OCC_CCTS_0001,,Free radio path for TR___0008,Executed,CCH


In [24]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("TR___")) &
       (df["EVENT_DESCRIPTION"].str.contains("radio"))].shape

(21, 5)

In [25]:
# Update for Train Radio
# Rows whose event text contains both "TR___" and "radio" get the asset
# description "Train Radio". Plain-text matching with na=False keeps the mask
# boolean, so errors are no longer swallowed by a bare except.
trainRadioMask = (
    df["EVENT_DESCRIPTION"].str.contains("TR___", regex=False, na=False)
    & df["EVENT_DESCRIPTION"].str.contains("radio", regex=False, na=False)
)
df.loc[trainRadioMask, "ASSET_DESCRIPTION"] = "Train Radio"

# Inspect data
df.loc[trainRadioMask].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
2459,OCC_CCTS_0001,Train Radio,Free radio path for TR___0034,Executed,NBC
2460,OCC_CCTS_0001,Train Radio,Free radio path for TR___0022,Executed,NBC
2461,OCC_CCTS_0001,Train Radio,Free radio path for TR___0018,Executed,NBC
2462,OCC_CCTS_0001,Train Radio,Free radio path for TR___0006,Executed,NBC
2551,OCC_CCTS_0001,Train Radio,Free radio path for TR___0008,Executed,CCH


In [26]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("TR___")) &
       (df["EVENT_DESCRIPTION"].str.contains("radio"))].shape

(21, 5)

#### Trainborne Camera

In [27]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Trainborne Camera"))].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
2320,OCC_CCTS_0001,,Selected Trainborne Camera <13> for <TR___0011>,EXECUTED,MHM
2324,OCC_CCTS_0001,,Selected Trainborne Camera <13> for <TR___0011>,EXECUTED,NBC
2328,OCC_CCTS_0001,,Selected Trainborne Camera <13> for <TR___0030>,EXECUTED,NBC
2343,OCC_CCTS_0001,,Selected Trainborne Camera <13> for <TR___0018>,EXECUTED,NBC
2350,OCC_CCTS_0001,,Selected Trainborne Camera <13> for <TR___0006>,EXECUTED,NBC


In [28]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Trainborne Camera"))].shape

(10, 5)

In [29]:
# Update for "Trainborne Camera"
# Rows whose event text mentions "Trainborne Camera" get that phrase as their
# asset description. Plain-text matching with na=False keeps the mask boolean,
# so no catch-all try/except is required.
cameraMask = df["EVENT_DESCRIPTION"].str.contains("Trainborne Camera", regex=False, na=False)
df.loc[cameraMask, "ASSET_DESCRIPTION"] = "Trainborne Camera"

# Inspect data
df.loc[cameraMask].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
2320,OCC_CCTS_0001,Trainborne Camera,Selected Trainborne Camera <13> for <TR___0011>,EXECUTED,MHM
2324,OCC_CCTS_0001,Trainborne Camera,Selected Trainborne Camera <13> for <TR___0011>,EXECUTED,NBC
2328,OCC_CCTS_0001,Trainborne Camera,Selected Trainborne Camera <13> for <TR___0030>,EXECUTED,NBC
2343,OCC_CCTS_0001,Trainborne Camera,Selected Trainborne Camera <13> for <TR___0018>,EXECUTED,NBC
2350,OCC_CCTS_0001,Trainborne Camera,Selected Trainborne Camera <13> for <TR___0006>,EXECUTED,NBC


In [30]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Trainborne Camera"))].shape

(10, 5)

#### Trainborne Quad

In [31]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Trainborne Quad"))].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
2321,OCC_CCTS_0001,,Selected Trainborne Quad <5 6 7 8> for TR___0011,EXECUTED,MHM
2322,OCC_CCTS_0001,,Selected Trainborne Quad <9 10 11 12> for TR__...,EXECUTED,MHM
2325,OCC_CCTS_0001,,Selected Trainborne Quad <5 6 7 8> for TR___0011,EXECUTED,NBC
2326,OCC_CCTS_0001,,Selected Trainborne Quad <9 10 11 12> for TR__...,EXECUTED,NBC
2329,OCC_CCTS_0001,,Selected Trainborne Quad <5 6 7 8> for TR___0030,EXECUTED,NBC


In [32]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Trainborne Quad"))].shape

(18, 5)

In [33]:
# Update for "Trainborne Quad"
# Rows whose event text mentions "Trainborne Quad" get that phrase as their
# asset description. Plain-text matching with na=False keeps the mask boolean,
# so no catch-all try/except is required.
quadMask = df["EVENT_DESCRIPTION"].str.contains("Trainborne Quad", regex=False, na=False)
df.loc[quadMask, "ASSET_DESCRIPTION"] = "Trainborne Quad"

# Inspect data
df.loc[quadMask].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
2321,OCC_CCTS_0001,Trainborne Quad,Selected Trainborne Quad <5 6 7 8> for TR___0011,EXECUTED,MHM
2322,OCC_CCTS_0001,Trainborne Quad,Selected Trainborne Quad <9 10 11 12> for TR__...,EXECUTED,MHM
2325,OCC_CCTS_0001,Trainborne Quad,Selected Trainborne Quad <5 6 7 8> for TR___0011,EXECUTED,NBC
2326,OCC_CCTS_0001,Trainborne Quad,Selected Trainborne Quad <9 10 11 12> for TR__...,EXECUTED,NBC
2329,OCC_CCTS_0001,Trainborne Quad,Selected Trainborne Quad <5 6 7 8> for TR___0030,EXECUTED,NBC


In [34]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Trainborne Quad"))].shape

(18, 5)

#### Tunnel Light

In [35]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Tunnel Light")) & 
        ((df["ASSET_DESCRIPTION"].isnull()) | 
         (df["ASSET_DESCRIPTION"] == None) | 
         (df["ASSET_DESCRIPTION"] == ""))].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
2079,TUNNEL_LIGHT_OFF,,Turn OFF NB and SB and CS Tunnel Light,REQUESTED,CCH
2218,TUNNEL_LIGHT_OFF,,Turn OFF NB and SB and CS Tunnel Light,TERMINATED,CCH
2368,TUNNEL_LIGHT_ON,,Turn ON NB and SB and CS Tunnel Light,REQUESTED,CCH
2458,TUNNEL_LIGHT_ON,,Turn ON NB and SB and CS Tunnel Light,TERMINATED,CCH


In [36]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Tunnel Light")) & 
        ((df["ASSET_DESCRIPTION"].isnull()) | 
         (df["ASSET_DESCRIPTION"] == None) | 
         (df["ASSET_DESCRIPTION"] == ""))].shape

(4, 5)

In [37]:
# Update for "Tunnel Light"
# Only rows that mention "Tunnel Light" AND have no asset description yet
# (missing or empty string) are labelled "Tunnel LTG".
# The former `== None` test was dropped: pandas compares element-wise, and
# missing values are already caught by isnull().
tunnelMask = df["EVENT_DESCRIPTION"].str.contains("Tunnel Light", regex=False, na=False)
blankAsset = df["ASSET_DESCRIPTION"].isnull() | (df["ASSET_DESCRIPTION"] == "")
df.loc[tunnelMask & blankAsset, "ASSET_DESCRIPTION"] = "Tunnel LTG"

# Inspect data
df.loc[tunnelMask & (df["ASSET_DESCRIPTION"] == "Tunnel LTG")].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
2079,TUNNEL_LIGHT_OFF,Tunnel LTG,Turn OFF NB and SB and CS Tunnel Light,REQUESTED,CCH
2218,TUNNEL_LIGHT_OFF,Tunnel LTG,Turn OFF NB and SB and CS Tunnel Light,TERMINATED,CCH
2368,TUNNEL_LIGHT_ON,Tunnel LTG,Turn ON NB and SB and CS Tunnel Light,REQUESTED,CCH
2458,TUNNEL_LIGHT_ON,Tunnel LTG,Turn ON NB and SB and CS Tunnel Light,TERMINATED,CCH


In [38]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Tunnel Light")) & 
        (df["ASSET_DESCRIPTION"] == "Tunnel LTG")].shape

(4, 5)

#### Control Take Over

In [39]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Control Take Over for "))].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
321,OCC_LENV_CMS_,,Control Take Over for All Functions from KVN t...,FORCED,CKY
620,KVN_LENV_SMS_,,Control Take Over for ECS - Smoke Extraction S...,FORCED,
621,KVN_LENV_SMS_,,Control Take Over for PIS - Passenger Informat...,FORCED,


In [40]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Control Take Over for "))].shape

(3, 5)

In [41]:
# Update for "Control Take Over"
# For events of the form "Control Take Over for <word> ...", the first word
# after "for" becomes the asset description.
# The extracted values are held in a local Series rather than a temporary
# DataFrame column, so nothing is left behind in df if a step fails.
takeOverMask = df["EVENT_DESCRIPTION"].str.contains("Control Take Over for", regex=False, na=False)
takeOverTarget = df["EVENT_DESCRIPTION"].str.extract(r"Control Take Over for (\w+) ", expand=False)

# Only overwrite where a word was actually captured, so an existing description
# is not replaced by a missing value when the pattern does not match.
hasTarget = takeOverMask & takeOverTarget.notna()
df.loc[hasTarget, "ASSET_DESCRIPTION"] = takeOverTarget[hasTarget]

# Inspect data
df.loc[takeOverMask].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
321,OCC_LENV_CMS_,All,Control Take Over for All Functions from KVN t...,FORCED,CKY
620,KVN_LENV_SMS_,ECS,Control Take Over for ECS - Smoke Extraction S...,FORCED,
621,KVN_LENV_SMS_,PIS,Control Take Over for PIS - Passenger Informat...,FORCED,


In [42]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Control Take Over for"))].shape

(3, 5)

#### Close Control

In [43]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Close Control")) & 
      ((df["ASSET_DESCRIPTION"].isnull()) | 
       (df["ASSET_DESCRIPTION"] == None) | 
       (df["ASSET_DESCRIPTION"] == "")
      ) & 
      ((df["EVENT_DESCRIPTION"].str.contains("Close Control"))
      )
      ].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
1161,TRACTION_HBF_ON,,"DM101, DM102, DFS01, DFN02, DFN03, DFS04 - Clo...",REQUESTED,TCY
1170,TRACTION_CNT_ON,,"DM101, DM102, DI201, DFS01, DFN02, DFN03, DFS0...",REQUESTED,TCY
1174,TRACTION_LTI_ON,,"DM101, DM102, DFS01, DFN02, DFN03, DFS04 - Clo...",REQUESTED,TCY
1261,TRACTION_HBF_ON,,"DM101, DM102, DFS01, DFN02, DFN03, DFS04 - Clo...",TERMINATED,TCY
1326,TRACTION_OTP_ON,,"DTS01, DTN02, DTN03, DTS04 - Close Control",REQUESTED,TCY


In [44]:
df.loc[(df["EVENT_DESCRIPTION"].str.contains("Close Control")) & 
      ((df["ASSET_DESCRIPTION"].isnull()) | 
       (df["ASSET_DESCRIPTION"] == None) | 
       (df["ASSET_DESCRIPTION"] == "")
      )].shape

(20, 5)

In [45]:
# Update for "Close Control"
# Rows that mention "Close Control" and have no asset description yet (missing
# or empty string) are labelled "Traction Control".
# The former `== None` test was dropped: pandas compares element-wise, and
# missing values are already caught by isnull(). na=False keeps every mask
# boolean, so no catch-all try/except is required.
closeControlMask = df["EVENT_DESCRIPTION"].str.contains("Close Control", regex=False, na=False)
blankAsset = df["ASSET_DESCRIPTION"].isnull() | (df["ASSET_DESCRIPTION"] == "")
df.loc[closeControlMask & blankAsset, "ASSET_DESCRIPTION"] = "Traction Control"

# Inspect data
df.loc[closeControlMask &
       df["ASSET_DESCRIPTION"].str.contains("Traction Control", regex=False, na=False)].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
1161,TRACTION_HBF_ON,Traction Control,"DM101, DM102, DFS01, DFN02, DFN03, DFS04 - Clo...",REQUESTED,TCY
1170,TRACTION_CNT_ON,Traction Control,"DM101, DM102, DI201, DFS01, DFN02, DFN03, DFS0...",REQUESTED,TCY
1174,TRACTION_LTI_ON,Traction Control,"DM101, DM102, DFS01, DFN02, DFN03, DFS04 - Clo...",REQUESTED,TCY
1261,TRACTION_HBF_ON,Traction Control,"DM101, DM102, DFS01, DFN02, DFN03, DFS04 - Clo...",TERMINATED,TCY
1326,TRACTION_OTP_ON,Traction Control,"DTS01, DTN02, DTN03, DTS04 - Close Control",REQUESTED,TCY


In [46]:
df.loc[(df["ASSET_DESCRIPTION"].astype(str).str.contains("Traction Control"))].head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS
1161,TRACTION_HBF_ON,Traction Control,"DM101, DM102, DFS01, DFN02, DFN03, DFS04 - Clo...",REQUESTED,TCY
1170,TRACTION_CNT_ON,Traction Control,"DM101, DM102, DI201, DFS01, DFN02, DFN03, DFS0...",REQUESTED,TCY
1174,TRACTION_LTI_ON,Traction Control,"DM101, DM102, DFS01, DFN02, DFN03, DFS04 - Clo...",REQUESTED,TCY
1261,TRACTION_HBF_ON,Traction Control,"DM101, DM102, DFS01, DFN02, DFN03, DFS04 - Clo...",TERMINATED,TCY
1326,TRACTION_OTP_ON,Traction Control,"DTS01, DTN02, DTN03, DTS04 - Close Control",REQUESTED,TCY


In [47]:
df.loc[(df["ASSET_DESCRIPTION"].astype(str).str.contains("Traction Control"))].shape

(20, 5)

#### Get Asset Description Category (Remove Stopwords)

In [48]:
# Define Key Location Names
# Ordered (text to find, replacement) pairs. The order is significant because
# the replacements are applied one after another (e.g. 'Mid-Landing Entrance'
# is handled before the shorter 'Entrance').
subLocationTag = 'SUBLOCATION'
locNames = (
    # Location codes: removed outright
    tuple((code, '') for code in (
        'NED', 'FRP', 'SKG', 'HGN', 'KVN', 'SER', 'HBF', 'DBG', 'OTP', 'CNT',
        'LTI', 'CQY', 'BGK', 'OCC', 'WLH', 'PTP', 'BNK', 'PGL', 'TUNNEL',
    ))
    # Names swapped for the generic sub-location tag
    + tuple((name, subLocationTag) for name in (
        'Concourse', 'Mezzaninne', 'Mid-Landing Entrance', 'AL', 'Dirty Area',
        'IAP', '1st Storey', '2nd Storey', '3rd Storey',
    ))
    # Level codes: removed outright
    + tuple((level, '') for level in ('B1', 'B2', 'B3'))
    # Remaining entries, kept in their original sequence
    + (
        ('Entrance', subLocationTag),
        ('Mid Landing', ''),
        ('Mid-Landing', subLocationTag),
        ('Subway', subLocationTag),
        ('Underpass Link', subLocationTag),
        ("Underpass To EXT'G  STN", subLocationTag),
    )
)

In [49]:
# Build ASSET_DESC_CAT from a copy of ASSET_DESCRIPTION, applying each
# (find, replacement) pair from locNames in sequence.
assetCat = df["ASSET_DESCRIPTION"].copy()
for pattern, replacement in locNames:
    assetCat = assetCat.str.replace(pattern, replacement, regex = True)

# Trim surrounding whitespace, then swap a leading ":" plus the word characters
# that follow it for a single space.
assetCat = assetCat.str.strip()
df["ASSET_DESC_CAT"] = assetCat.str.replace(r'\A(:)\w+', ' ', regex = True)

#### Get Asset Description Category (Remove Numbers)

In [50]:
# Strip every run of digits from the category text
digitFree = df['ASSET_DESC_CAT'].str.replace(r'\d+', '', regex = True)
df['ASSET_DESC_CAT'] = digitFree

# Descriptions whose numbers are part of the name: for these the category is
# set back to the unmodified description.
keepVerbatim = [
    "22 kV Feeder CB",
    "22 kV Loop CB",
    "22 kV Rectifier CB",
    "DC 1500 V Backup HSCB",
    "DC 1500 V Bus Section",
    "DC 1500 V Feeder CB",
    "DC 1500 V Rectifier CB",
    "DC 1500 V Inverter CB",
]
isVerbatim = df['ASSET_DESCRIPTION'].isin(keepVerbatim)
df.loc[isVerbatim, "ASSET_DESC_CAT"] = df.loc[isVerbatim, 'ASSET_DESCRIPTION']

#### Get Asset Description Category (Remove Redundant White Spaces)

In [51]:
# Trim the ends, then squeeze every run of whitespace down to a single space
trimmedAssetCat = df["ASSET_DESC_CAT"].str.strip()
df["ASSET_DESC_CAT"] = trimmedAssetCat.str.replace(r'\s+', ' ', regex = True)

#### Get Asset Description Category (Account for Misc Exceptions)

In [52]:
# Account for exceptions
# Collapse any run of space-separated "SUBLOCATION" tokens into a single one.
# Matching the whole run (rather than just a pair) also handles three or more
# consecutive tokens, which a pair-wise replace would leave as a duplicate.
df['ASSET_DESC_CAT'] = df['ASSET_DESC_CAT'].str.replace(r'SUBLOCATION(?: SUBLOCATION)+', 'SUBLOCATION', regex = True)
# Remove the space directly after an opening bracket (literal match)
df['ASSET_DESC_CAT'] = df['ASSET_DESC_CAT'].str.replace('( ', '(', regex = False)

#### Get Event Description Category (Stopwords)

In [53]:
# (text to find, replacement) pairs; every code is paired with an empty string
locNames2 = tuple(
    (code, '')
    for code in (
        'NED', 'FRP', 'SKG', 'HGN', 'KVN', 'SER', 'HBF', 'DBG', 'OTP',
        'CNT', 'LTI', 'CQY', 'BGK', 'OCC', 'WLH', 'PTP', 'BNK', 'PGL',
    )
)

In [54]:
# Build EVENT_DESC_CAT from a copy of EVENT_DESCRIPTION, applying each
# (find, replacement) pair from locNames2 in sequence.
eventCat = df["EVENT_DESCRIPTION"].copy()
for pattern, replacement in locNames2:
    eventCat = eventCat.str.replace(pattern, replacement, regex = True)
df["EVENT_DESC_CAT"] = eventCat

#### Get Event Description Category (Remove Numbers)

In [55]:
# Strip every run of digits from the event category text
eventCatNoDigits = df['EVENT_DESC_CAT'].str.replace(r'\d+', '', regex = True)
df['EVENT_DESC_CAT'] = eventCatNoDigits

#### Get Event Description Category (Remove Redundant White Spaces)

In [56]:
# Trim the ends, then squeeze every run of whitespace down to a single space
trimmedEventCat = df["EVENT_DESC_CAT"].str.strip()
df["EVENT_DESC_CAT"] = trimmedEventCat.str.replace(r'\s+', ' ', regex = True)

#### Get Event Description Category (Account for Misc Exceptions)

In [57]:
# Account for Exceptions
# Events mentioning "logged", "Operator" and "NelVisu" together are folded into
# one category. na=False keeps the mask boolean when a value is missing, so the
# assignment needs no catch-all try/except that would hide real errors.
isNelVisuLogin = (
    df['EVENT_DESC_CAT'].str.contains("logged", regex = False, na = False)
    & df['EVENT_DESC_CAT'].str.contains("Operator", regex = False, na = False)
    & df['EVENT_DESC_CAT'].str.contains("NelVisu", regex = False, na = False)
)
df.loc[isNelVisuLogin, "EVENT_DESC_CAT"] = "Operator Logged In/Out of NelVisu"

# Remove the literal fragments ' /, /...' and then ' /' (regex = False, so the
# dots are plain characters, not wildcards).
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r' /, /...', '', regex = False)
df['EVENT_DESC_CAT'] = df['EVENT_DESC_CAT'].str.replace(r' /', '', regex = False)

#### Extract Train Information

In [58]:
# Pull identifiers out of the raw event text, one new column per pattern
extractPatterns = (
    ("TrainID", r"TR___(\d+)"),         # digits following the "TR___" prefix
    ("CarID", r"cars (\d+)/"),          # digits between "cars " and "/"
    ("ServiceID", r"cars \d+/(\d+)"),   # digits after the "/" in "cars <n>/<m>"
)
for columnName, pattern in extractPatterns:
    df[columnName] = df["EVENT_DESCRIPTION"].str.extract(pattern)

#### Extract Asset Information

In [59]:
# Derive a slash-delimited asset class string from the raw asset ID.
# Work happens on a local Series and is written to df once at the end.

# Remove Location Names
assetClass = df["ASSET_ID_RAW"].copy()
for pattern, replacement in locNames2:
    assetClass = assetClass.str.replace(pattern, replacement, regex = True)

# Remove Numbers
assetClass = assetClass.str.replace(r'\d+', '', regex = True)

# Remove Exceptions
# na = False keeps each mask boolean when an ID is missing, so the assignments
# need no catch-all try/except that would silently skip them on error.
isTraction = assetClass.str.contains("TRACTION", regex = False, na = False)
assetClass.loc[isTraction] = "TRACTION/TRACTION"

# Evaluated after the traction override, matching the original statement order
isTunnelLight = (
    assetClass.str.contains("TUNNEL", regex = False, na = False)
    & assetClass.str.contains("LIGHT", regex = False, na = False)
)
assetClass.loc[isTunnelLight] = "TUNNEL/LIGHT"

# Clean up string prior to delimiting:
# drop one leading and one trailing underscore, then turn the rest into "/"
assetClass = assetClass.str.replace(r'\A(_)', '', regex = True)
assetClass = assetClass.str.replace(r'(_)\Z', '', regex = True)
df['AssetClass'] = assetClass.str.replace('_', '/', regex = False)

In [60]:
# Split the slash-delimited class string once and reuse the pieces
classParts = df['AssetClass'].str.split("/")

# Last segment becomes the sub-class
df['AssetSubClass'] = classParts.str[-1]

# First segment replaces the combined string as the class
df['AssetClass'] = classParts.str[0]

## Export File

In [61]:
# Write the cleaned frame to '../testLog/cleaningTest.csv' (relative to the
# current working directory) without the index column.
# NOTE(review): the earlier file listing shows cleaningTest.csv sitting in the
# same folder the inputs are read from, while the 'dataCleaned' folder created
# during set-up is not used here — confirm the intended destination.
df.to_csv('../testLog/' + 'cleaningTest' +'.csv', index=False)

In [62]:
# View data
df.head()

Unnamed: 0,ASSET_ID_RAW,ASSET_DESCRIPTION,EVENT_DESCRIPTION,EVENT_STATUS,OPERATOR_INITIALS,ASSET_DESC_CAT,EVENT_DESC_CAT,TrainID,CarID,ServiceID,AssetClass,AssetSubClass
0,OCC_LENV_CMS_,CMS SCS Server - Environment OCCCMS,Environment 1 Status,HOT,,CMS SCS Server - Environment CMS,Environment Status,,,,LENV,CMS
1,COM/SER/B1/CAM20,SER:208 LWY1,Status,FAILURE,,LWY,Status,,,,COM,CAM
2,COM/BNK/B1/PABX01,PABX,Fan 1 Status,FAILURE,,PABX,Fan Status,,,,COM,PABX
3,COM/DBG/B2/CAM57,DBG:424 AFG 3,Status,FAILURE,,AFG,Status,,,,COM,CAM
4,COM/SER/B1/CAM20,SER:208 LWY1,Status,NORMAL,,LWY,Status,,,,COM,CAM


In [63]:
# View data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3216 entries, 0 to 3215
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ASSET_ID_RAW       3216 non-null   object
 1   ASSET_DESCRIPTION  3207 non-null   object
 2   EVENT_DESCRIPTION  3216 non-null   object
 3   EVENT_STATUS       3215 non-null   object
 4   OPERATOR_INITIALS  845 non-null    object
 5   ASSET_DESC_CAT     3207 non-null   object
 6   EVENT_DESC_CAT     3216 non-null   object
 7   TrainID            49 non-null     object
 8   CarID              369 non-null    object
 9   ServiceID          369 non-null    object
 10  AssetClass         3216 non-null   object
 11  AssetSubClass      3216 non-null   object
dtypes: object(12)
memory usage: 301.6+ KB
