In [41]:
!pip3 install pandas
import pandas as pd
import os

raw_data_dir = os.path.join('..', 'data', 'raw')
inspection = os.path.join(raw_data_dir, 'inspection.csv')
inspection_order = os.path.join(raw_data_dir, 'order.csv')

inspection_df = pd.read_csv(inspection)
inspection_df = inspection_df.rename(columns={
    'originatingservicerequestnumber': 'Originating Service Request Number',
    'InspectionCustomer': 'Inspection Customer',
    'ElevatingDevicesNumber': 'Elevating Devices Number',
    'InspectionNumber': 'Inspection Number',
    'InspectionLocation': 'Inspection Location',
    'InspectionType': 'Inspection Type',
    'Earliest_INSPECTION_Date': 'Earliest Inspection Date',
    'Latest_INSPECTION_Date': 'Latest Inspection Date',
    'InspectionOutcome': 'Inspection Outcome'
})

inspection_order_df = pd.read_csv(inspection_order)
inspection_order_df = inspection_order_df.rename(columns={
    'ElevatingDevicesNumber': 'Elevating Devices Number',
    'TSSAStandardOrderNumber': 'TSSA Standard Order Number',
    'RegulationReference': 'Regulation Reference',
    'ClauseNumber': 'Clause Number',
    'ClauseText': 'Clause Text',
    'DIRECTIVE': 'Directive',
    'Inspectionsadditionalinformation': 'Inspections Additional Information',
    'RISKSCORE': 'Risk Score',
    'Inspection_type': 'Inspection Type',
    'DateofIssue': 'Date of Issue',
    'StatusofInspectionOrder': 'Status of Inspection Order',
    'inspectionnumber': 'Inspection Number',
    'DaystoComply': 'Days to Comply',
    'ComplianceDate': 'Compliance Date',
    'customerorderedtocomply': 'Customer Ordered to Comply'
})

print(inspection_order_df.columns)


Index(['Elevating Devices Number', 'TSSA Standard Order Number',
       'Regulation Reference', 'Clause Number', 'Clause Text', 'Directive',
       'Inspections Additional Information', 'Risk Score', 'Inspection Type',
       'Date of Issue', 'Status of Inspection Order', 'Inspection Number',
       'Days to Comply', 'Compliance Date', 'Customer Ordered to Comply'],
      dtype='object')


### a) Inspection Dataset and Inspection Order Dataset Matching: 

In [42]:
missing_inspections = inspection_df[~inspection_df['Inspection Number'].isin(inspection_order_df['Inspection Number'])]

print(f"Total missing inspections: {len(missing_inspections)}")

cleaned_inspections = inspection_df[inspection_df['Inspection Number'].isin(inspection_order_df['Inspection Number'])]
cleaned_inspections = cleaned_inspections.reset_index(drop=True)

cleaned_orders = inspection_order_df[inspection_order_df['Inspection Number'].isin(cleaned_inspections['Inspection Number'])]
cleaned_orders = cleaned_orders.reset_index(drop=True)

Total missing inspections: 95570


### b) Sort Datasets by Time

In [43]:
sorted_inspections = cleaned_inspections.sort_values(by=['Earliest Inspection Date'])
sorted_inspections = cleaned_inspections.reset_index(drop=True)

sorted_inspections_order = cleaned_orders.sort_values(by=['Date of Issue'])
sorted_inspections_order = cleaned_orders.reset_index(drop=True)

# print(sorted_inspections.tail(20))

### c) Inspection Orders Dataset:

In [49]:
sorted_inspections_order['Directive'] = sorted_inspections_order['Directive'].astype(str).where(sorted_inspections_order['Directive'].notnull(), 'No Directive.')
sorted_inspections_order['Inspections Additional Information'] = sorted_inspections_order['Inspections Additional Information'].astype(str).where(sorted_inspections_order['Inspections Additional Information'].notnull(), 'No Additional Information.')

sorted_inspections_order['Combined Inspection Info'] = (sorted_inspections_order['Directive'] + ' ' + sorted_inspections_order['Inspections Additional Information'])
print(sorted_inspections_order['Combined Inspection Info'].head(20))

0     No Directive Top of Car and Hoistway -other**m...
1     No Directive Machine room other:**the 5 year t...
2     No Directive Top of Car and Hoistway -other**m...
3     No Directive Top of Car and Hoistway -other**i...
4     No Directive Machine room other:**post the cur...
5     No Directive General- other**the alteration co...
6     No Directive Top of Car and Hoistway -other**s...
7     No Directive Machine room other:**install a co...
8     No Directive Pit other:**install a maximum run...
9     No Directive Top of Car and Hoistway -other**i...
10    The automatic emergency recall operation shall...
11    Emergency power operation shall be tested by s...
12    The two way communication means in the car sha...
13    Provide means of two-way voice communication i...
14    Provide a means accessible to authorized perso...
15    The two way communication inside the car shall...
16    Inspector's order: Provide a firmly secured pl...
17    The elevating device shall conform with th

### d) Handling Missing "RISK SCORE" Values: 

The values are huge floats, so values have to be normalized to get a better idea and improve the model performance when used. Normalization will help to do meaningful conparisons and avoid bias. In this case the column is using very different scales and also NaN values. They will be converted to 

In [59]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np


scaler = MinMaxScaler()

sorted_inspections_order['Risk Score'] = pd.to_numeric(sorted_inspections_order['Risk Score'], errors='coerce')
sorted_inspections_order['Risk Score'] = sorted_inspections_order['Risk Score'].where(sorted_inspections_order['Risk Score'].notnull(), 0)

sorted_inspections_order['Risk Score'] = scaler.fit_transform(sorted_inspections_order[['Risk Score']])

test = sorted_inspections_order['Risk Score'].unique()
print(test)

[1.100e-03 7.000e-04 0.000e+00 1.000e-04 8.000e-04 1.700e-03 1.000e-03
 2.000e-04 3.930e-02 1.650e-02 1.400e-03 1.200e-03 9.000e-04 1.393e-01
 1.500e-03 1.300e-03 1.800e-03 6.000e-04 1.600e-03 3.000e-04 1.000e+00
 9.900e-02 3.900e-03 1.020e-02 7.800e-03 5.000e-04 8.115e-01 4.000e-04
 3.000e-03 2.400e-03 1.150e-02 4.100e-03 2.328e-01 6.700e-03 5.100e-03
 1.343e-01 2.352e-01 1.700e-02 4.900e-03 5.900e-03 1.640e-02 2.450e-02
 3.690e-02 2.300e-03]
