## 1) Preliminary analysis/preparation of aggregation

In [3]:
!pip3 install pandas
import pandas as pd
import os

raw_data_dir = os.path.join('..', 'data', 'raw')
inspection = os.path.join(raw_data_dir, 'inspection.csv')
inspection_order = os.path.join(raw_data_dir, 'order.csv')

inspection_df = pd.read_csv(inspection)
inspection_df = inspection_df.rename(columns={
    'originatingservicerequestnumber': 'Originating_Service_Request_Number',
    'InspectionCustomer': 'Inspection_Customer',
    'ElevatingDevicesNumber': 'Elevating_Devices_Number',
    'InspectionNumber': 'Inspection_Number',
    'InspectionLocation': 'Inspection_Location',
    'InspectionType': 'Inspection_Type',
    'Earliest_INSPECTION_Date': 'Earliest_Inspection_Date',
    'Latest_INSPECTION_Date': 'Latest_Inspection_Date',
    'InspectionOutcome': 'Inspection_Outcome'
})

inspection_order_df = pd.read_csv(inspection_order)
inspection_order_df = inspection_order_df.rename(columns={
    'ElevatingDevicesNumber': 'Elevating_Devices_Number',
    'TSSAStandardOrderNumber': 'TSSA_Standard_Order_Number',
    'RegulationReference': 'Regulation_Reference',
    'ClauseNumber': 'Clause_Number',
    'ClauseText': 'Clause_Text',
    'DIRECTIVE': 'Directive',
    'Inspectionsadditionalinformation': 'Inspections_Additional_Information',
    'RISKSCORE': 'Risk_Score',
    'Inspection_type': 'Inspection_Type',
    'DateofIssue': 'Date_of_Issue',
    'StatusofInspectionOrder': 'Status_of_Inspection_Order',
    'inspectionnumber': 'Inspection_Number',
    'DaystoComply': 'Days_to_Comply',
    'ComplianceDate': 'Compliance_Date',
    'customerorderedtocomply': 'Customer_Ordered_to_Comply'
})

print(inspection_order_df.columns)


Index(['Elevating_Devices_Number', 'TSSA_Standard_Order_Number',
       'Regulation_Reference', 'Clause_Number', 'Clause_Text', 'Directive',
       'Inspections_Additional_Information', 'Risk_Score', 'Inspection_Type',
       'Date_of_Issue', 'Status_of_Inspection_Order', 'Inspection_Number',
       'Days_to_Comply', 'Compliance_Date', 'Customer_Ordered_to_Comply'],
      dtype='object')


### a) Inspection Dataset and Inspection Order Dataset Matching: 

In [4]:
# Check for missing inspections
missing_inspections = inspection_df[~inspection_df['Inspection_Number'].isin(inspection_order_df['Inspection_Number'])]
print(f"Total missing inspections: {len(missing_inspections)}")

# Verify inspections with associated orders
has_associated_orders = inspection_df['Inspection_Number'].isin(inspection_order_df['Inspection_Number'])
inspection_df['Has Associated Orders'] = has_associated_orders

# Verify orders with associated inspections
has_associated_inspections = inspection_order_df['Inspection_Number'].isin(inspection_df['Inspection_Number'])
inspection_order_df['Has_Associated_Inspections'] = has_associated_inspections

# Print summary
print(f"Inspections with associated orders: {has_associated_orders.sum()}")
print(f"Orders with associated inspections: {has_associated_inspections.sum()}")

Total missing inspections: 95570
Inspections with associated orders: 47611
Orders with associated inspections: 162172


### b) Sort Datasets by Time

In [5]:
sorted_inspections = inspection_df.sort_values(by=['Earliest_Inspection_Date'])

sorted_inspections_order = inspection_order_df.sort_values(by=['Date_of_Issue'])

print(sorted_inspections_order.head(20))
print(sorted_inspections.tail(20))

        Elevating_Devices_Number TSSA_Standard_Order_Number  \
35384                       2110                        NaN   
89927                      37591                        NaN   
43513                      22202                        NaN   
89928                      37591                        NaN   
43517                      22202                        NaN   
43514                      22202                        NaN   
89313                      37592                        NaN   
43439                      22203                        NaN   
125368                     68476                        NaN   
126644                     68248                        NaN   
126645                     68248                        NaN   
126643                     68248                        NaN   
125546                     68477                        NaN   
125545                     68477                        NaN   
80142                      35068                       

### c) Inspection Orders Dataset:

In [6]:
sorted_inspections_order['Directive'] = sorted_inspections_order['Directive'].astype(str).where(sorted_inspections_order['Directive'].notnull(), 'No Directive.')
sorted_inspections_order['Inspections_Additional_Information'] = sorted_inspections_order['Inspections_Additional_Information'].astype(str).where(sorted_inspections_order['Inspections_Additional_Information'].notnull(), 'No Additional Information.')

sorted_inspections_order['Combined_Inspection_Info'] = (
    sorted_inspections_order['Directive'].str.replace(r'[\*\_\-]', ' ', regex=True).str.strip() + ' ' +
    sorted_inspections_order['Inspections_Additional_Information'].str.replace(r'[\*\_\-]', ' ', regex=True).str.strip()
)
print(sorted_inspections_order['Combined_Inspection_Info'].head(20))

35384     No Directive. Monthly Maintenance other:  All ...
89927     No Directive. submitt for alteration of device...
43513     No Directive. Monthly Maintenance other:  MAKE...
89928     No Directive. the fire recall keys shall be ke...
43517     No Directive. Monthly Maintenance other:  CLEA...
43514     No Directive. The annual test of the safeties ...
89313     No Directive. submitt for alteration of device...
43439     No Directive. THE OWNER SHALL REPORT TO TSSA W...
125368    No Directive. Make both in car emergency light...
126644    No Directive. oil log book other:  CLEAN THE PIT.
126645    No Directive. Car enclosure other:  RETURN THE...
126643    No Directive. Car enclosure other:  RE FASTEN/...
125546    No Directive. Car enclosure other:  REPLACE TH...
125545    No Directive. The car false ceiling shall be r...
80142     No Directive. Pit other:  remove non elevator ...
80141     No Directive. General  other  the alteration c...
80140     No Directive. General  other  

### d) Handling Missing "RISK SCORE" Values: 

The values are huge floats, so values have to be normalized to get a better idea and improve the model performance when used. Normalization will help to do meaningful conparisons and avoid bias. In this case the column is using very different scales and also NaN values. They will be converted to 

In [7]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()

# Convert 'Risk_Score' to numeric and handle errors
sorted_inspections_order['Risk_Score'] = pd.to_numeric(sorted_inspections_order['Risk_Score'], errors='coerce')

# Replace null, empty, or undefined values with the median
median_risk_score = sorted_inspections_order['Risk_Score'].median()
sorted_inspections_order['Risk_Score'] = sorted_inspections_order['Risk_Score'].fillna(median_risk_score)

# Apply MinMaxScaler
sorted_inspections_order['Risk_Score'] = scaler.fit_transform(sorted_inspections_order[['Risk_Score']])

# Check unique values
test = sorted_inspections_order['Risk_Score'].unique()
print(test)

[7.38313896e-04 1.08286038e-03 1.13208131e-03 4.92209264e-05
 1.77195335e-03 8.36755748e-04 1.47662779e-04 7.87534822e-04
 1.03363945e-03 1.37818594e-03 8.85976675e-04 1.47662779e-03
 1.42740686e-03 1.67351150e-03 1.18130223e-03 0.00000000e+00
 4.48191227e-05 8.80779011e-04 2.24095613e-05 3.78143646e-06
 8.10066106e-05 2.53100437e-06 3.63804197e-06 1.62422412e-07
 5.30290510e-08 9.58488944e-09 1.81625218e-09 1.00902899e-10
 1.27482199e-09 3.13733859e-05 3.22782320e-07 5.23887566e-05
 1.49397076e-05 1.44217314e-10 1.25733016e-06 1.05541945e-04
 9.35807941e-09 1.79618594e-06 7.95435765e-08 6.22824828e-06
 3.25842533e-09 3.11338896e-04 1.30435455e-09 2.09743676e-04
 3.52518306e-07 6.10402834e-07 4.79006315e-04 7.88391266e-07
 6.81362281e-07 1.52421301e-06 5.56595325e-06 5.89621710e-07
 3.52934070e-05 1.39313792e-01 1.15136252e-02 1.86055102e-09
 2.66466325e-06 1.16245652e-07 8.20626267e-06 1.63820205e-05
 2.81830283e-06 4.30108018e-06 4.06352055e-06 7.62849149e-06
 5.20766911e-06 4.144402

### e) Inspection Outcome Variable (inspection dataset):

In [8]:
counts = sorted_inspections['Inspection_Outcome'].value_counts()

to_replace = counts[counts < 500].index

sorted_inspections['Inspection_Outcome'] = sorted_inspections['Inspection_Outcome'].replace(to_replace, 'Other')

grouped_counts = sorted_inspections['Inspection_Outcome'].value_counts()

print(grouped_counts)


Inspection_Outcome
Follow up              54605
Passed                 26064
DC Follow up           22302
All Orders Resolved    19555
Complete                7506
Shutdown                6110
Other                   2201
Follow up Major         1117
Follow up Sub Major     1002
Follow Up Initial        877
Unable to Inspect        689
Fail Initial             602
Passed Major             551
Name: count, dtype: int64


### f) Inspection Type Variable (inspection dataset): 

In [9]:
unique_values = sorted_inspections['Inspection_Type'].unique()
print(unique_values)
print('-------------')

corrections = {
    'ED-Sub  Inspection': 'Elevating Device Sub Inspection',
    'ED-FU Enforcement Action Insp': 'Elevating Device Followup Enforcement Action Inspection',
    'ED-Followup Lic Insp': 'Elevating Device -Followup License Inspection',
    'ED-Followup No-Lic Insp': 'Elevating Device Followup No-License Inspection',
    'ED-PWGSC Insp': 'Elevating Device PWGSC Inspection',
    'ED-PWGSC Foll-Up': 'Elevating Device PWGSC Follow-Up',
    'ED-Followup Ownership Change': 'Elevating Device Followup Ownership Change Inspection',
    'ED-Followup Minor Alt': 'ED-Followup Minor Alteration Inspection',
    'ED-MCP Follow up': 'Elevating Device MCP Follow-Up',
    'ED-MCP Enforcement Insp': 'Elevating Device MCP Enforcement Inspection',
    'ED-Perform L1 Incident Insp': 'Elevating Device Perform Level 1 Incident Inspection',
    'ED-Perform L1 Near Miss Insp': 'Elevating Device Perform Level 1 Near Miss Inspection',
    'ED-Inspection Temp Lic': 'Elevating Device Inspection Temporary License',
    'ED-Reg Non-Compliance': 'Elevating Device Regulatory Non-Compliance',
    'ED-Followup Reg Non-Compliance': 'Elevating Device Followup Regulatory Non-Compliance',
    'ED-Non-Mandated Insp ON': 'Elevating Device Non-Mandated Inspection Ontario',
    'ED-Non-Mandated Followup ON': 'Elevating Device Non-Mandated Follow-Up Ontario',
}

sorted_inspections['Inspection_Type'] = sorted_inspections['Inspection_Type'].replace(corrections)

print(sorted_inspections['Inspection_Type'].unique())

['ED-Followup Inspection' 'ED-Periodic Inspection' 'ED-Minor B Inspection'
 'ED-Sub Inspection' 'ED-Initial Inspection' 'ED-Enforcement Action'
 'ED-Minor A Inspection' 'ED-Sub  Inspection' 'ED-Unscheduled Inspection'
 'ED-Sub Inspection Major' 'ED-Sub Failed Initial'
 'ED-Major Alteration Inspection' 'ED-Followup Minor Alt'
 'ED-FU Enforcement Action Insp' 'ED-Followup Ownership Change'
 'ED-Followup Lic Insp' 'ED-MCP Follow up' 'ED-Re-Activate Inspection'
 'ED-Non-Mandated Insp ON' 'ED-MCP Enforcement Insp'
 'ED-Followup No-Lic Insp' 'ED-PWGSC Insp'
 'ED-Followup Reg Non-Compliance' 'ED-Inspection Temp Lic'
 'ED-Reg Non-Compliance' 'ED-PWGSC Foll-Up' 'ED-Non-Mandated Followup ON'
 'ED-Perform L1 Incident Insp' 'ED-Perform L1 Near Miss Insp']
-------------
['ED-Followup Inspection' 'ED-Periodic Inspection' 'ED-Minor B Inspection'
 'ED-Sub Inspection' 'ED-Initial Inspection' 'ED-Enforcement Action'
 'ED-Minor A Inspection' 'Elevating Device Sub Inspection'
 'ED-Unscheduled Inspection' 

### g) Create Dummy Variables (inspection dataset): 

In [10]:
# Create dummy variables for 'Inspection Outcome' and 'Inspection Type'
dummy_cols = ['Inspection_Outcome', 'Inspection_Type']
inspection_with_dummies = pd.get_dummies(sorted_inspections, columns=dummy_cols, prefix_sep='_', drop_first=False)

# Add the dummy columns back to sorted_inspections
sorted_inspections = pd.concat([sorted_inspections, inspection_with_dummies.filter(like='Inspection_Outcome_')], axis=1)
sorted_inspections = pd.concat([sorted_inspections, inspection_with_dummies.filter(like='Inspection_Type_')], axis=1)

# Check the updated DataFrame
print(sorted_inspections.head())

    Originating_Service_Request_Number  \
0                                55620   
123                             483984   
124                             114320   
125                             114320   
126                             509560   

                                   Inspection_Customer  \
0                       LANDSCAPE COURT APARTMENTS LTD   
123                           HOME HARDWARE STORES LTD   
124                              A.M.S. PROPERTIES INC   
125                              A.M.S. PROPERTIES INC   
126  CORPORATION OF THE CITY OF WINDSOR C/O TRAFFIC...   

     Elevating_Devices_Number  Inspection_Number  \
0                        9948            3157237   
123                     20330            3133462   
124                     39953            3193902   
125                     39953            2877417   
126                     32802            3187485   

                         Inspection_Location         Inspection_Type  \
0     10 WYCO

## 2- Aggregation of the Inspection Dataset

### a & b) Group Data by Key Identifiers & Group Data by Key Identifiers:

In [11]:
sorted_inspections['Earliest_Inspection_Date'] = pd.to_datetime(sorted_inspections['Earliest_Inspection_Date'], errors='coerce')
sorted_inspections['Latest_Inspection_Date'] = pd.to_datetime(sorted_inspections['Latest_Inspection_Date'], errors='coerce')

grouped_inspections = sorted_inspections.groupby('Elevating_Devices_Number').agg({
    'Earliest_Inspection_Date': 'min',
    'Latest_Inspection_Date': 'max',
    'Inspection_Number': lambda x: list(x),
    'Inspection_Customer': 'first'
}).reset_index()

print(grouped_inspections.head(5))

   Elevating_Devices_Number Earliest_Inspection_Date Latest_Inspection_Date  \
0                         8               2012-03-05             2015-03-27   
1                         9               2012-03-05             2015-03-27   
2                        10               2012-03-05             2015-03-27   
3                        11               2012-03-05             2015-03-27   
4                        13               2012-03-06             2015-04-30   

                                   Inspection_Number  \
0  [3930603, 4076073, 3747295, 5248300, 3132984, ...   
1  [3930608, 4076078, 3747272, 5248304, 2708468, ...   
2  [4184747, 5248292, 5312245, 3930613, 4076085, ...   
3  [4184734, 3930616, 4076090, 3747339, 5248299, ...   
4  [3930620, 4076117, 3749130, 3763512, 5248307, ...   

                                 Inspection_Customer  
0  LEGISLATIVE ASSEMBLY OF ONTARIO ATTN:  JOHN ED...  
1  LEGISLATIVE ASSEMBLY OF ONTARIO ATTN:  JOHN ED...  
2  LEGISLATIVE ASSEMBLY

### c) Format Data for Current Inspection Outcome: Separate the current inspection

In [12]:
grouped_outcomes = sorted_inspections.groupby(['Elevating_Devices_Number', 'Inspection_Outcome']) \
    .size().reset_index(name='Count')

pivoted_outcomes = grouped_outcomes.pivot(
    index='Elevating_Devices_Number',
    columns='Inspection_Outcome',
    values='Count'
).fillna(0).astype(int).reset_index()

final_df = pd.merge(grouped_inspections, pivoted_outcomes, on='Elevating_Devices_Number', how='left')
final_df.columns = [col.replace(" ", "_") for col in final_df.columns]

print(final_df.head(1))

   Elevating_Devices_Number Earliest_Inspection_Date Latest_Inspection_Date  \
0                         8               2012-03-05             2015-03-27   

                                   Inspection_Number  \
0  [3930603, 4076073, 3747295, 5248300, 3132984, ...   

                                 Inspection_Customer  All_Orders_Resolved  \
0  LEGISLATIVE ASSEMBLY OF ONTARIO ATTN:  JOHN ED...                    0   

   Complete  DC_Follow_up  Fail_Initial  Follow_Up_Initial  Follow_up  \
0         0             0             0                  0          5   

   Follow_up_Major  Follow_up_Sub_Major  Other  Passed  Passed_Major  \
0                0                    0      0       1             0   

   Shutdown  Unable_to_Inspect  
0         0                  0  


#### 4. Add time dependent new features

In [15]:
print(inspections_df.columns)

# Ensure 'Latest_Inspection_Date' is in datetime format
sorted_inspections['Latest_Inspection_Date'] = pd.to_datetime(
    sorted_inspections_order['Latest_Inspection_Date'], errors='coerce'
)
# Drop rows with NaT in 'Latest_Inspection_Date'
sorted_inspections_order = sorted_inspections_order.dropna(subset=['Latest_Inspection_Date'])

# Sort by 'Elevating_Devices_Number' and 'Latest_Inspection_Date'
sorted_inspections_order = sorted_inspections_order.sort_values(
    by=['Elevating_Devices_Number', 'Latest_Inspection_Date']
)

# Rolling average grouped by 'Elevating_Devices_Number'
rolled = (
    sorted_inspections_order
    .set_index('Latest_Inspection_Date')
    .groupby('Elevating_Devices_Number')['Risk_Score']
    .rolling('180D', min_periods=1, closed='right')
    .mean()
    .reset_index()
    .rename(columns={'Risk_Score': 'Avg_last_6_months'})
)

# Merge back into final_df
final_df = pd.merge(
    sorted_inspections_order.reset_index(drop=True),
    rolled,
    how='left',
    on=['Elevating_Devices_Number', 'Latest_Inspection_Date']
)

# Fill missing values
final_df['Avg_last_6_months'] = final_df['Avg_last_6_months'].fillna(0)

# Scale the rolling average
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
final_df['Avg_last_6_months_scaled'] = scaler.fit_transform(
    final_df[['Avg_last_6_months']]
)

# Preview the result
print(final_df[['Elevating_Devices_Number', 'Latest_Inspection_Date', 'Avg_last_6_months']].head())

Index(['Originating_Service_Request_Number', 'Inspection_Customer',
       'Elevating_Devices_Number', 'Inspection_Number', 'Inspection_Location',
       'Inspection_Type', 'Earliest_Inspection_Date', 'Latest_Inspection_Date',
       'Inspection_Outcome', 'Has Associated Orders',
       'Inspection_Outcome_All Orders Resolved', 'Inspection_Outcome_Complete',
       'Inspection_Outcome_DC Follow up', 'Inspection_Outcome_Fail Initial',
       'Inspection_Outcome_Follow Up Initial', 'Inspection_Outcome_Follow up',
       'Inspection_Outcome_Follow up Major',
       'Inspection_Outcome_Follow up Sub Major', 'Inspection_Outcome_Other',
       'Inspection_Outcome_Passed', 'Inspection_Outcome_Passed Major',
       'Inspection_Outcome_Shutdown', 'Inspection_Outcome_Unable to Inspect',
       'Inspection_Type_ED-Enforcement Action',
       'Inspection_Type_ED-Followup Inspection',
       'Inspection_Type_ED-Followup Minor Alteration Inspection',
       'Inspection_Type_ED-Initial Inspection',
 

KeyError: 'Latest_Inspection_Date'

### d) Filter Data Based on Inspection Order Dataset: A

In [105]:
inspection_numbers_set = set(sorted_inspections_order['Inspection_Number'])

filtered_df = final_df[final_df['Inspection_Number'].apply(lambda x: any(num in inspection_numbers_set for num in x))]

print(filtered_df.head(5))
print(f"Total rows after filter: {len(filtered_df)}")


   Elevating_Devices_Number Earliest_Inspection_Date Latest_Inspection_Date  \
0                         8               2012-03-05             2015-03-27   
2                        10               2012-03-05             2015-03-27   
3                        11               2012-03-05             2015-03-27   
4                        13               2012-03-06             2015-04-30   
5                        14               2012-03-06             2015-04-30   

                                   Inspection_Number  \
0  [3930603, 4076073, 3747295, 5248300, 3132984, ...   
2  [4184747, 5248292, 5312245, 3930613, 4076085, ...   
3  [4184734, 3930616, 4076090, 3747339, 5248299, ...   
4  [3930620, 4076117, 3749130, 3763512, 5248307, ...   
5  [3930626, 4076138, 3749146, 3763473, 5248309, ...   

                                 Inspection_Customer  All_Orders_Resolved  \
0  LEGISLATIVE ASSEMBLY OF ONTARIO ATTN:  JOHN ED...                    0   
2  LEGISLATIVE ASSEMBLY OF ONTARIO

# MACHINE LEARNING MODELING

## 3 Modeling Part 1

### a) Determine the baseline score

In [106]:
outcome_columns = [
    'Complete', 'DC_Follow_up', 'Fail_Initial', 'Follow_Up_Initial',
    'Follow_up', 'Follow_up_Major', 'Follow_up_Sub_Major',
    'Other', 'Passed', 'Passed_Major', 'Shutdown', 'Unable_to_Inspect'
]

final_df['Total_Inspections'] = final_df[outcome_columns].sum(axis=1)

final_df['Pass_Rate'] = (final_df['Passed'] + final_df['Passed_Major'] )/ final_df['Total_Inspections']

total_passed = final_df['Passed'].sum() + final_df['Passed_Major'].sum()
total_inspections = final_df['Total_Inspections'].sum()
global_pass_rate = total_passed / total_inspections

print(f"Global baseline pass rate: {global_pass_rate:.2%}")

Global baseline pass rate: 21.53%


### b) Setup a pipeline

In [107]:
X = final_df[[
    'Elevating_Devices_Number',
    'All_Orders_Resolved',
    'Complete',
    'DC_Follow_up',
    'Earliest_Inspection_Date',
    'Total_Inspections', 
    
]]

X.loc[:, 'Earliest_Inspection_Year'] = pd.to_datetime(X['Earliest_Inspection_Date']).dt.year
X = X.drop(columns=['Earliest_Inspection_Date'])

dummy_cols = sorted_inspections.filter(like='Inspection_Outcome_').join(
    sorted_inspections.filter(like='Inspection_Type_')
)

dummy_cols = dummy_cols.loc[final_df.index]

X = pd.concat([X, dummy_cols], axis=1)

y = final_df['Pass_Rate'] > global_pass_rate  # Binary target based on global pass rate

assert X.shape[0] == y.shape[0], "X and y have inconsistent samples!"


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('logistic', LogisticRegression())
]

pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

score = pipeline.score(X_test, y_test)
print(f"Pipeline score: {score:.2f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'Earliest_Inspection_Year'] = pd.to_datetime(X['Earliest_Inspection_Date']).dt.year


Pipeline score: 0.56


In [108]:
# Step 1: Ensure alignment of X and y
# Start with the same base DataFrame
X = final_df[[
    'Elevating_Devices_Number',
    'All_Orders_Resolved',
    'Complete',
    'DC_Follow_up',
    'Earliest_Inspection_Date',
    'Latest_Inspection_Date',
    'Total_Inspections'
    'Avg_last_6_months_scaled',
]]

# Step 2: Process 'Earliest_Inspection_Date'
X['Earliest_Inspection_Year'] = pd.to_datetime(X['Earliest_Inspection_Date']).dt.year
X = X.drop(columns=['Earliest_Inspection_Date'])

# Step 3: Add dummy variables for 'Inspection Outcome' and 'Inspection Type'
dummy_cols = sorted_inspections.filter(like='Inspection_Outcome_').join(
    sorted_inspections.filter(like='Inspection_Type_')
)

# Ensure dummy variables align with the index of final_df
dummy_cols = dummy_cols.loc[final_df.index]

# Combine with the main feature set
X = pd.concat([X, dummy_cols], axis=1)

# Step 4: Define the target variable
y = final_df['Pass_Rate'] > global_pass_rate  # Binary target based on global pass rate

# Step 5: Verify alignment
assert X.shape[0] == y.shape[0], "X and y have inconsistent samples!"

# Step 6: Train/test split and pipeline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('logistic', LogisticRegression())
]

pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

# Check the score
score = pipeline.score(X_test, y_test)
print(f"Pipeline score: {score:.2f}")

Pipeline score: 0.56


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Earliest_Inspection_Year'] = pd.to_datetime(X['Earliest_Inspection_Date']).dt.year


### c) Include a feature Selection Step (Model optimization)

In [109]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

models = {
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'SVC': SVC(),
    'KNeighbors': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression()
}

for name, model in models.items(): 
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=2)), 
        ('classifier', model)
    ])

    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    acc = accuracy_score(y_test, predictions)


#### Cross Validation 

In [110]:
from sklearn.model_selection import cross_val_score

for name, model in models.items(): 
    pipeline = Pipeline([
        ('scaler', StandardScaler()), 
        ('pca', PCA(n_components=2)),
        ('classifier', model)
    ])
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    print(f"{name} CV Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")

RandomForest CV Accuracy: 0.64 ± 0.03
GradientBoosting CV Accuracy: 0.61 ± 0.03
SVC CV Accuracy: 0.56 ± 0.01
KNeighbors CV Accuracy: 0.65 ± 0.03
LogisticRegression CV Accuracy: 0.56 ± 0.00


#### Hyper Parameter Tuning

In [111]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

param_grid = {
    'classifier__n_neighbors': randint(1, 30),
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2] 
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', KNeighborsClassifier()) 
])

search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1 
)

search.fit(X_train, y_train)

print(f"Best parameters: {search.best_params_}")
print(f"Best cross-validation score: {search.best_score_:.2f}")

Best parameters: {'classifier__n_neighbors': 15, 'classifier__p': 1, 'classifier__weights': 'distance'}
Best cross-validation score: 0.70


In [112]:
# Final model evaluation
best_model = search.best_estimator_
final_preds = best_model.predict(X_test)
final_score = accuracy_score(y_test, final_preds)
print("Final test accuracy:", final_score)

# Save the final model
import joblib
joblib.dump(best_model, 'best_model.pkl')



Final test accuracy: 0.7098034428030765


['best_model.pkl']

## 4- Feature Engineering

In [123]:
# # Ensure the DataFrame is sorted by 'Latest_Inspection_Date'
# sorted_inspections = sorted_inspections.sort_values(by='Latest_Inspection_Date')

# # Calculate the rolling average for the last 6 months
# # Ensure the index is unique by resetting it
# sorted_inspections = sorted_inspections.reset_index(drop=True)

# final_df['Avg_last_6_months'] = (
#     sorted_inspections
#     .set_index('Latest_Inspection_Date')  # Corrected column name
#     .groupby('Elevating_Devices_Number')['Risk_Score']
#     .rolling('180D', min_periods=1, closed='left')  # Use '180D' for 6 months
#     .mean()
#     .reset_index(level=0, drop=True)
# )

# print(final_df['Avg_last_6_months'].head(20))




ValueError: cannot reindex on an axis with duplicate labels