In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [67]:
# Load the raw fire incident dispatch CSV export into memory.
raw_csv_path = r"C:\Users\Jason\Downloads\Fire_Incident_Dispatch_Data_20250408.csv"
df = pd.read_csv(raw_csv_path)
# Keep the raw dimensions: later cells report how much data survives
# cleaning relative to these two numbers.
total_rows, total_columns = df.shape
print(df.shape)

  df = pd.read_csv(r"C:\Users\Jason\Downloads\Fire_Incident_Dispatch_Data_20250408.csv")


(10275092, 29)


In [68]:
# Columns judged unnecessary because they contained no useful information
uninformative_columns = [
    'STARFIRE_INCIDENT_ID',
    'ZIPCODE',
    'POLICEPRECINCT',
    'CITYCOUNCILDISTRICT',
    'COMMUNITYDISTRICT',
    'COMMUNITYSCHOOLDISTRICT',
    'CONGRESSIONALDISTRICT',
    'ALARM_BOX_NUMBER',
    'ALARM_BOX_LOCATION',
    'INCIDENT_CLASSIFICATION',
    'ALARM_LEVEL_INDEX_DESCRIPTION',
    'HIGHEST_ALARM_LEVEL',
    'VALID_DISPATCH_RSPNS_TIME_INDC',
]
# Every column holding a DateTime value is removed as well
datetime_columns = [
    'INCIDENT_DATETIME',
    'FIRST_ASSIGNMENT_DATETIME',
    'FIRST_ACTIVATION_DATETIME',
    'FIRST_ON_SCENE_DATETIME',
    'INCIDENT_CLOSE_DATETIME',
]
df = df.drop(columns=uninformative_columns + datetime_columns)

# Report the number of missing values, but only for columns that have any
null_totals = df.isnull().sum()
missing_counts = null_totals[null_totals > 0]
print(missing_counts)

ALARM_BOX_BOROUGH                     2
DISPATCH_RESPONSE_SECONDS_QY        936
INCIDENT_RESPONSE_SECONDS_QY     408002
INCIDENT_TRAVEL_TM_SECONDS_QY    408034
ENGINES_ASSIGNED_QUANTITY          1584
LADDERS_ASSIGNED_QUANTITY          1584
OTHER_UNITS_ASSIGNED_QUANTITY      1584
dtype: int64


In [69]:
# Rows with at least one missing value make up only around 4% of the raw
# data, so discarding them loses little information.
df = df.dropna()
print(df.shape)
# Recode the Y/N flag in VALID_INCIDENT_RSPNS_TIME_INDC as 1/0.
# Values other than 'Y' or 'N' would become NaN under Series.map.
flag_to_int = {'Y': 1, 'N': 0}
df['VALID_INCIDENT_RSPNS_TIME_INDC'] = df['VALID_INCIDENT_RSPNS_TIME_INDC'].map(flag_to_int)

(9866972, 11)


In [70]:
# Compare ALARM_BOX_BOROUGH with INCIDENT_BOROUGH row by row. The recorded
# output shows they are completely equal, so only INCIDENT_BOROUGH is kept.
boroughs_agree = df['ALARM_BOX_BOROUGH'].eq(df['INCIDENT_BOROUGH'])
print(boroughs_agree.value_counts())
df = df.drop(columns='ALARM_BOX_BOROUGH')

True    9866972
Name: count, dtype: int64


In [71]:
# Non-null value counts of every remaining column, broken down by borough
rows_by_borough = df.groupby('INCIDENT_BOROUGH')
rows_by_borough.count()

Unnamed: 0_level_0,ALARM_SOURCE_DESCRIPTION_TX,INCIDENT_CLASSIFICATION_GROUP,DISPATCH_RESPONSE_SECONDS_QY,VALID_INCIDENT_RSPNS_TIME_INDC,INCIDENT_RESPONSE_SECONDS_QY,INCIDENT_TRAVEL_TM_SECONDS_QY,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY
INCIDENT_BOROUGH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BRONX,1996904,1996904,1996904,1996904,1996904,1996904,1996904,1996904,1996904
BROOKLYN,2808691,2808691,2808691,2808691,2808691,2808691,2808691,2808691,2808691
MANHATTAN,2591202,2591202,2591202,2591202,2591202,2591202,2591202,2591202,2591202
QUEENS,1966132,1966132,1966132,1966132,1966132,1966132,1966132,1966132,1966132
RICHMOND / STATEN ISLAND,504043,504043,504043,504043,504043,504043,504043,504043,504043


In [72]:
# Alarm sources that occur fewer than this many times are removed.
MIN_ALARM_SOURCE_ROWS = 5000

# Frequency of each alarm source before filtering
value_counts = df['ALARM_SOURCE_DESCRIPTION_TX'].value_counts()
kept_source_counts = value_counts[value_counts >= MIN_ALARM_SOURCE_ROWS]
valid_categories = kept_source_counts.index
df = df[df['ALARM_SOURCE_DESCRIPTION_TX'].isin(valid_categories)]

# Show the surviving alarm sources and their row counts. A bare expression in
# the middle of a cell is never displayed, so the counts are printed
# explicitly; they are reused from `value_counts` instead of re-aggregating
# the whole frame.
print(kept_source_counts)
print(f"Percentage of remaining rows: {df.shape[0] / total_rows * 100:.2f}%")

Percentage of remaining rows: 95.97%


In [73]:
# Non-null value counts of every remaining column, per incident classification group
rows_by_classification = df.groupby('INCIDENT_CLASSIFICATION_GROUP')
rows_by_classification.count()

Unnamed: 0_level_0,INCIDENT_BOROUGH,ALARM_SOURCE_DESCRIPTION_TX,DISPATCH_RESPONSE_SECONDS_QY,VALID_INCIDENT_RSPNS_TIME_INDC,INCIDENT_RESPONSE_SECONDS_QY,INCIDENT_TRAVEL_TM_SECONDS_QY,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY
INCIDENT_CLASSIFICATION_GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Medical Emergencies,4391908,4391908,4391908,4391908,4391908,4391908,4391908,4391908,4391908
Medical MFAs,67780,67780,67780,67780,67780,67780,67780,67780,67780
NonMedical Emergencies,4149412,4149412,4149412,4149412,4149412,4149412,4149412,4149412,4149412
NonMedical MFAs,427417,427417,427417,427417,427417,427417,427417,427417,427417
NonStructural Fires,305901,305901,305901,305901,305901,305901,305901,305901,305901
Structural Fires,518838,518838,518838,518838,518838,518838,518838,518838,518838


In [74]:
# Target Encoding for Categorical Variables using Stratified K-Folds

def target_encode(df, col, target='VALID_INCIDENT_RSPNS_TIME_INDC', n_splits=10, alpha=10, random_state=42):
    """Return an out-of-fold, smoothed target encoding of a categorical column.

    The rows are split into ``n_splits`` shuffled folds, stratified on
    ``target``. Each row is encoded using statistics computed from the other
    folds only, so a row's own target value never contributes to its encoding.
    Within a training fold, a category is encoded as

        (count * category_mean + alpha * prior) / (count + alpha)

    where ``count`` and ``category_mean`` describe the category inside the
    training fold and ``prior`` is the target mean of that same training fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``col`` and ``target``. It is not modified.
    col : str
        Name of the categorical column to encode.
    target : str, default 'VALID_INCIDENT_RSPNS_TIME_INDC'
        Name of the target column. It is used both as the stratification
        label and as the numeric value that is averaged.
    n_splits : int, default 10
        Number of folds passed to ``StratifiedKFold``.
    alpha : float, default 10
        Smoothing weight of the prior; larger values pull small categories
        more strongly towards the prior.
    random_state : int, default 42
        Seed for the fold shuffling, so repeated calls give identical folds.

    Returns
    -------
    pandas.Series
        Float series with the same index as ``df`` holding the encoded value
        of ``col`` for every row. Categories that do not occur in a row's
        training folds are encoded with that fold's prior.

    Raises
    ------
    KeyError
        If ``col`` or ``target`` is not a column of ``df``.
    """
    # Only these two columns are ever read, so work on a narrow frame instead
    # of copying (and re-slicing per fold) every column of a large table.
    # dict.fromkeys de-duplicates the labels in case col == target.
    pair = df[list(dict.fromkeys([col, target]))]
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Plain NumPy buffer: fold results are written strictly by position, so
    # the index labels of `df` (gaps, duplicates) cannot affect the assignment.
    encoded = np.full(len(pair), np.nan, dtype=float)

    for train_idx, val_idx in skf.split(pair, pair[target]):
        train, val = pair.iloc[train_idx], pair.iloc[val_idx]

        # The prior comes from the training fold only; using the mean over
        # all rows would leak the held-out fold's targets into its encoding.
        prior = train[target].mean()

        # Smoothed per-category mean of the target within the training fold
        category_stats = train.groupby(col)[target].agg(['mean', 'count'])
        counts = category_stats['count']
        smooth = (counts * category_stats['mean'] + alpha * prior) / (counts + alpha)

        # Encode the held-out rows; unseen categories fall back to the prior
        encoded[val_idx] = val[col].map(smooth).fillna(prior).to_numpy(dtype=float)

    return pd.Series(encoded, index=df.index)

In [75]:
# Dimensions of the frame at this point, followed by a preview of its first five rows
current_shape = df.shape
print(current_shape)
df.head(n=5)

(9861256, 10)


Unnamed: 0,INCIDENT_BOROUGH,ALARM_SOURCE_DESCRIPTION_TX,INCIDENT_CLASSIFICATION_GROUP,DISPATCH_RESPONSE_SECONDS_QY,VALID_INCIDENT_RSPNS_TIME_INDC,INCIDENT_RESPONSE_SECONDS_QY,INCIDENT_TRAVEL_TM_SECONDS_QY,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY
0,RICHMOND / STATEN ISLAND,Phone,NonMedical Emergencies,40,1,397,357,1.0,1.0,0.0
1,BRONX,PD Link/Medical,Medical MFAs,10,1,260,250,1.0,0.0,0.0
2,BROOKLYN,PD Link/Medical,Medical Emergencies,7,1,146,139,1.0,0.0,0.0
3,QUEENS,Phone,NonMedical MFAs,52,1,327,275,3.0,2.0,1.0
4,QUEENS,Private Fire Alarm,NonMedical Emergencies,112,1,391,279,1.0,1.0,1.0


In [76]:
# Target-encode each categorical column into a numeric companion column
encoded_column_for = {
    'INCIDENT_BOROUGH': 'Borough num',
    'ALARM_SOURCE_DESCRIPTION_TX': 'Alarm Source num',
    'INCIDENT_CLASSIFICATION_GROUP': 'Incident Classification num',
}
for source_column, encoded_column in encoded_column_for.items():
    df[encoded_column] = target_encode(df, source_column)

# Cleaning metric: cells (rows x columns) in the current frame as a percentage
# of the cells in the raw file. The current frame already includes the three
# encoded columns added above.
cleaned_rows, cleaned_columns = df.shape
print((cleaned_rows * cleaned_columns) / (total_rows * total_columns) * 100)


43.02212607422648


In [77]:
# Human-readable labels for the remaining raw column names
readable_names = {
    'INCIDENT_BOROUGH': 'Borough',
    'ALARM_SOURCE_DESCRIPTION_TX': 'Alarm Source',
    'INCIDENT_CLASSIFICATION_GROUP': 'Incident Classification',
    'DISPATCH_RESPONSE_SECONDS_QY': 'Dispatch Response Time',
    'VALID_INCIDENT_RSPNS_TIME_INDC': 'Valid Response Time',
    'INCIDENT_RESPONSE_SECONDS_QY': 'Incident Response Time',
    'INCIDENT_TRAVEL_TM_SECONDS_QY': 'Incident Travel Time',
    'ENGINES_ASSIGNED_QUANTITY': 'Engines Assigned',
    'LADDERS_ASSIGNED_QUANTITY': 'Ladders Assigned',
    'OTHER_UNITS_ASSIGNED_QUANTITY': 'Other Units Assigned',
}
# Final column layout: each categorical column sits next to its encoded
# counterpart, and 'Valid Response Time' comes last.
order = [
    'Borough', 'Borough num',
    'Alarm Source', 'Alarm Source num',
    'Incident Classification', 'Incident Classification num',
    'Dispatch Response Time', 'Incident Response Time', 'Incident Travel Time',
    'Engines Assigned', 'Ladders Assigned', 'Other Units Assigned',
    'Valid Response Time',
]
df = df.rename(columns=readable_names)[order]
df.head()

Unnamed: 0,Borough,Borough num,Alarm Source,Alarm Source num,Incident Classification,Incident Classification num,Dispatch Response Time,Incident Response Time,Incident Travel Time,Engines Assigned,Ladders Assigned,Other Units Assigned,Valid Response Time
0,RICHMOND / STATEN ISLAND,0.86465,Phone,0.957787,NonMedical Emergencies,0.951872,40,397,357,1.0,1.0,0.0,1
1,BRONX,0.829616,PD Link/Medical,0.68704,Medical MFAs,0.879852,10,260,250,1.0,0.0,0.0,1
2,BROOKLYN,0.860628,PD Link/Medical,0.68704,Medical Emergencies,0.69566,7,146,139,1.0,0.0,0.0,1
3,QUEENS,0.850763,Phone,0.957787,NonMedical MFAs,0.955968,52,327,275,3.0,2.0,1.0,1
4,QUEENS,0.850645,Private Fire Alarm,0.985807,NonMedical Emergencies,0.951918,112,391,279,1.0,1.0,1.0,1


In [78]:
# Write the cleaned frame to disk. With to_csv's defaults the DataFrame index
# is written too, as the first (unnamed) column of the file.
cleaned_csv_path = r"C:\Users\Jason\Downloads\CleanedFireIncidentData.csv"
df.to_csv(cleaned_csv_path)