In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [3]:
df = pd.read_csv(r"C:\Users\Jason\Downloads\Fire_Incident_Dispatch_Data_20250408.csv")
total_rows = df.shape[0]
total_columns = df.shape[1]

  df = pd.read_csv(r"C:\Users\Jason\Downloads\Fire_Incident_Dispatch_Data_20250408.csv")


In [4]:
# Dropping unnecessary columns that contained no useful information
df.drop(columns=['STARFIRE_INCIDENT_ID', 'ZIPCODE', 'POLICEPRECINCT', 'CITYCOUNCILDISTRICT', 'COMMUNITYDISTRICT', 'COMMUNITYSCHOOLDISTRICT', 'CONGRESSIONALDISTRICT', 'ALARM_BOX_NUMBER', 'ALARM_BOX_LOCATION', 'INCIDENT_CLASSIFICATION', 'ALARM_LEVEL_INDEX_DESCRIPTION', 'HIGHEST_ALARM_LEVEL', 'VALID_DISPATCH_RSPNS_TIME_INDC'], inplace=True)
#Dropping most columns with DateTime data types, the other one will be converted to a categorical variable later
df.drop(columns=['FIRST_ASSIGNMENT_DATETIME', 'FIRST_ACTIVATION_DATETIME', 'FIRST_ON_SCENE_DATETIME', 'INCIDENT_CLOSE_DATETIME'], inplace=True)
# Dropping the column for INCIDENT_TRAVEL_TIME adnd INCIDENT_RESPONSE_SECONDS_QY, as they are mostly out of our control and will have too great of an impact on the model
df.drop(columns=['INCIDENT_TRAVEL_TM_SECONDS_QY', 'INCIDENT_RESPONSE_SECONDS_QY', 'DISPATCH_RESPONSE_SECONDS_QY'], inplace=True)

In [5]:
# Dropping rows with missing values
# The number of rows dropped is insignificant compared to the total number of rows
df.dropna(inplace=True)
missing_counts = df.isnull().sum()
missing_counts = missing_counts[missing_counts > 0]
print(missing_counts)
print(f"Percentage of remaining rows: {df.shape[0] / total_rows * 100:.2f}%")
# Remapping the values in VALID_INCIDENT_RSPNS_TIME_INDC
df['VALID_INCIDENT_RSPNS_TIME_INDC'] = df['VALID_INCIDENT_RSPNS_TIME_INDC'].map({'Y': 1, 'N': 0})

Series([], dtype: int64)
Percentage of remaining rows: 99.98%


In [6]:
# Checking to see if the ALARM_BOX_BOROUGH and INCIDENT_BOROUGH columns are equal enough to be used as a single column, they are comepletley equal, so they will be combined
print((df['ALARM_BOX_BOROUGH'] == df['INCIDENT_BOROUGH']).value_counts())
df.drop(columns=['ALARM_BOX_BOROUGH'], inplace=True)

True    10273506
Name: count, dtype: int64


In [7]:
# Convert string to datetime
df['INCIDENT_DATETIME'] = pd.to_datetime(df['INCIDENT_DATETIME'], format="%m/%d/%Y %I:%M:%S %p")

# Extract hour in 24-hour format
df['INCIDENT_HOUR'] = df['INCIDENT_DATETIME'].dt.hour
df.drop(columns=['INCIDENT_DATETIME'], inplace=True)
df.head()

Unnamed: 0,INCIDENT_BOROUGH,ALARM_SOURCE_DESCRIPTION_TX,INCIDENT_CLASSIFICATION_GROUP,VALID_INCIDENT_RSPNS_TIME_INDC,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY,INCIDENT_HOUR
0,RICHMOND / STATEN ISLAND,Phone,NonMedical Emergencies,1,1.0,1.0,0.0,0
1,BRONX,PD Link/Medical,Medical MFAs,1,1.0,0.0,0.0,0
2,BROOKLYN,PD Link/Medical,Medical Emergencies,1,1.0,0.0,0.0,0
3,QUEENS,Phone,NonMedical MFAs,1,3.0,2.0,1.0,0
4,QUEENS,Private Fire Alarm,NonMedical Emergencies,1,1.0,1.0,1.0,0


In [8]:
df.groupby('INCIDENT_HOUR').count()

Unnamed: 0_level_0,INCIDENT_BOROUGH,ALARM_SOURCE_DESCRIPTION_TX,INCIDENT_CLASSIFICATION_GROUP,VALID_INCIDENT_RSPNS_TIME_INDC,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY
INCIDENT_HOUR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,343437,343437,343437,343437,343437,343437,343437
1,285449,285449,285449,285449,285449,285449,285449
2,240137,240137,240137,240137,240137,240137,240137
3,212611,212611,212611,212611,212611,212611,212611
4,202483,202483,202483,202483,202483,202483,202483
5,203283,203283,203283,203283,203283,203283,203283
6,245705,245705,245705,245705,245705,245705,245705
7,324368,324368,324368,324368,324368,324368,324368
8,399400,399400,399400,399400,399400,399400,399400
9,449231,449231,449231,449231,449231,449231,449231


In [9]:
# Function to convert 24-hour format to AM/PM format for labels
def hour_to_ampm(h):
    suffix = "AM" if h < 12 or h == 24 else "PM"
    hour12 = h % 12
    if hour12 == 0:
        hour12 = 12
    return f"{hour12} {suffix}"


def assign_optimal_hour_groups(df, hour_col='INCIDENT_HOUR', group_col='HOUR GROUP'):

    # Function to create the hour groupings that will be checked
    def get_hour_groups(start):
        hours = [(start + i) % 24 for i in range(24)]
        return [hours[i:i+4] for i in range(0, 24, 4)]

    # Precompute value counts of each hour
    hour_counts = df[hour_col].value_counts().reindex(range(24), fill_value=0)

    best_std = np.inf
    best_groups = None

    # Checking all possible hour groupings to find the one with the lowest standard deviation
    for start in range(24):
        groups = get_hour_groups(start)
        group_totals = [hour_counts[group].sum() for group in groups]
        std = np.std(group_totals, ddof=0)
        if std < best_std:
            best_std = std
            best_groups = groups

    # Create mapping from hour to group index + label
    hour_to_group = {}
    hour_to_label = {}

    # Creating and assigning the labels for each group
    for i, group in enumerate(best_groups):
        for hour in group:
            hour_to_group[hour] = i
            start_hour = group[0]
            end_hour = group[-1]
            label = f"{hour_to_ampm(start_hour)} - {hour_to_ampm((end_hour + 1) % 24)}"
            hour_to_label[hour] = label

    df[group_col] = df[hour_col].map(hour_to_group)
    df[f"{group_col} LABEL"] = df[hour_col].map(hour_to_label)

    print(f"Best grouping: {best_groups} with Std Dev: {best_std:.2f}")

    return df, best_groups

In [10]:
# Assign optimal hour groups and labels
df, best_groups = assign_optimal_hour_groups(df)
df.head()

Best grouping: [[2, 3, 4, 5], [6, 7, 8, 9], [10, 11, 12, 13], [14, 15, 16, 17], [18, 19, 20, 21], [22, 23, 0, 1]] with Std Dev: 503556.97


Unnamed: 0,INCIDENT_BOROUGH,ALARM_SOURCE_DESCRIPTION_TX,INCIDENT_CLASSIFICATION_GROUP,VALID_INCIDENT_RSPNS_TIME_INDC,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY,INCIDENT_HOUR,HOUR GROUP,HOUR GROUP LABEL
0,RICHMOND / STATEN ISLAND,Phone,NonMedical Emergencies,1,1.0,1.0,0.0,0,5,10 PM - 2 AM
1,BRONX,PD Link/Medical,Medical MFAs,1,1.0,0.0,0.0,0,5,10 PM - 2 AM
2,BROOKLYN,PD Link/Medical,Medical Emergencies,1,1.0,0.0,0.0,0,5,10 PM - 2 AM
3,QUEENS,Phone,NonMedical MFAs,1,3.0,2.0,1.0,0,5,10 PM - 2 AM
4,QUEENS,Private Fire Alarm,NonMedical Emergencies,1,1.0,1.0,1.0,0,5,10 PM - 2 AM


In [11]:
df.groupby('HOUR GROUP LABEL').count()

Unnamed: 0_level_0,INCIDENT_BOROUGH,ALARM_SOURCE_DESCRIPTION_TX,INCIDENT_CLASSIFICATION_GROUP,VALID_INCIDENT_RSPNS_TIME_INDC,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY,INCIDENT_HOUR,HOUR GROUP
HOUR GROUP LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10 AM - 2 PM,2051773,2051773,2051773,2051773,2051773,2051773,2051773,2051773,2051773
10 PM - 2 AM,1485664,1485664,1485664,1485664,1485664,1485664,1485664,1485664,1485664
2 AM - 6 AM,858514,858514,858514,858514,858514,858514,858514,858514,858514
2 PM - 6 PM,2278544,2278544,2278544,2278544,2278544,2278544,2278544,2278544,2278544
6 AM - 10 AM,1418704,1418704,1418704,1418704,1418704,1418704,1418704,1418704,1418704
6 PM - 10 PM,2180307,2180307,2180307,2180307,2180307,2180307,2180307,2180307,2180307


In [12]:
df.groupby('INCIDENT_BOROUGH').count()

Unnamed: 0_level_0,ALARM_SOURCE_DESCRIPTION_TX,INCIDENT_CLASSIFICATION_GROUP,VALID_INCIDENT_RSPNS_TIME_INDC,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY,INCIDENT_HOUR,HOUR GROUP,HOUR GROUP LABEL
INCIDENT_BOROUGH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BRONX,2092133,2092133,2092133,2092133,2092133,2092133,2092133,2092133,2092133
BROOKLYN,2906904,2906904,2906904,2906904,2906904,2906904,2906904,2906904,2906904
MANHATTAN,2715137,2715137,2715137,2715137,2715137,2715137,2715137,2715137,2715137
QUEENS,2041065,2041065,2041065,2041065,2041065,2041065,2041065,2041065,2041065
RICHMOND / STATEN ISLAND,518267,518267,518267,518267,518267,518267,518267,518267,518267


In [13]:
value_counts = df['ALARM_SOURCE_DESCRIPTION_TX'].value_counts()
valid_categories = value_counts[value_counts >= 5000].index
df = df[df['ALARM_SOURCE_DESCRIPTION_TX'].isin(valid_categories)]
df.groupby('ALARM_SOURCE_DESCRIPTION_TX').count()
print(f"Percentage of remaining rows: {df.shape[0] / total_rows * 100:.2f}%")

Percentage of remaining rows: 99.93%


In [14]:
df.groupby('INCIDENT_CLASSIFICATION_GROUP').count()

Unnamed: 0_level_0,INCIDENT_BOROUGH,ALARM_SOURCE_DESCRIPTION_TX,VALID_INCIDENT_RSPNS_TIME_INDC,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY,INCIDENT_HOUR,HOUR GROUP,HOUR GROUP LABEL
INCIDENT_CLASSIFICATION_GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Medical Emergencies,4770122,4770122,4770122,4770122,4770122,4770122,4770122,4770122,4770122
Medical MFAs,68406,68406,68406,68406,68406,68406,68406,68406,68406
NonMedical Emergencies,4172687,4172687,4172687,4172687,4172687,4172687,4172687,4172687,4172687
NonMedical MFAs,430719,430719,430719,430719,430719,430719,430719,430719,430719
NonStructural Fires,306521,306521,306521,306521,306521,306521,306521,306521,306521
Structural Fires,519209,519209,519209,519209,519209,519209,519209,519209,519209


In [15]:
# Target Encoding for Categorical Variables using Stratified K-Folds

def target_encode(df, col, target='VALID_INCIDENT_RSPNS_TIME_INDC', n_splits=10, alpha=10):

    df = df.copy()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    encoded = pd.Series(index=df.index, dtype=float)
    
    global_mean = df[target].mean()

    for train_idx, val_idx in skf.split(df, df[target]):
        train, val = df.iloc[train_idx], df.iloc[val_idx]

        # Compute smoothed means
        category_stats = train.groupby(col)[target].agg(['mean', 'count'])
        smooth = (category_stats['count'] * category_stats['mean'] + alpha * global_mean) / (category_stats['count'] + alpha)

        # Map to validation fold
        encoded.iloc[val_idx] = val[col].map(smooth).fillna(global_mean)

    return encoded

In [16]:
print(df.shape)
df.head()

(10267664, 10)


Unnamed: 0,INCIDENT_BOROUGH,ALARM_SOURCE_DESCRIPTION_TX,INCIDENT_CLASSIFICATION_GROUP,VALID_INCIDENT_RSPNS_TIME_INDC,ENGINES_ASSIGNED_QUANTITY,LADDERS_ASSIGNED_QUANTITY,OTHER_UNITS_ASSIGNED_QUANTITY,INCIDENT_HOUR,HOUR GROUP,HOUR GROUP LABEL
0,RICHMOND / STATEN ISLAND,Phone,NonMedical Emergencies,1,1.0,1.0,0.0,0,5,10 PM - 2 AM
1,BRONX,PD Link/Medical,Medical MFAs,1,1.0,0.0,0.0,0,5,10 PM - 2 AM
2,BROOKLYN,PD Link/Medical,Medical Emergencies,1,1.0,0.0,0.0,0,5,10 PM - 2 AM
3,QUEENS,Phone,NonMedical MFAs,1,3.0,2.0,1.0,0,5,10 PM - 2 AM
4,QUEENS,Private Fire Alarm,NonMedical Emergencies,1,1.0,1.0,1.0,0,5,10 PM - 2 AM


In [17]:
# Target Encoding for categorical variables
df['Borough num'] = target_encode(df, 'INCIDENT_BOROUGH')
df['Alarm Source num'] = target_encode(df, 'ALARM_SOURCE_DESCRIPTION_TX')
df['Incident Classification num'] = target_encode(df, 'INCIDENT_CLASSIFICATION_GROUP')
df['Time num'] = target_encode(df, 'HOUR GROUP LABEL')

# Computing percentage of dataset remaining after cleaning
cleaned_rows = df.shape[0]
cleaned_columns = df.shape[1]
print((cleaned_rows * cleaned_columns) / (total_rows * total_columns) * 100)


48.24096280933375


In [18]:
# Renaming columns to make them easier to read and work with
df.rename(columns={'INCIDENT_BOROUGH': 'Borough', 'ALARM_SOURCE_DESCRIPTION_TX': 'Alarm Source', 'INCIDENT_CLASSIFICATION_GROUP': 'Incident Classification', 'VALID_INCIDENT_RSPNS_TIME_INDC': 'Valid Response Time', 'ENGINES_ASSIGNED_QUANTITY': 'Engines Assigned', 'LADDERS_ASSIGNED_QUANTITY': 'Ladders Assigned', 'OTHER_UNITS_ASSIGNED_QUANTITY': 'Other Units Assigned', 'HOUR GROUP LABEL' : 'Time'}, inplace=True)
order = ['Borough', 'Borough num', 'Time', 'Time num',  'Alarm Source', 'Alarm Source num', 'Incident Classification', 'Incident Classification num', 'Engines Assigned', 'Ladders Assigned', 'Other Units Assigned', 'Valid Response Time']
df = df[order]
df.head()

Unnamed: 0,Borough,Borough num,Time,Time num,Alarm Source,Alarm Source num,Incident Classification,Incident Classification num,Engines Assigned,Ladders Assigned,Other Units Assigned,Valid Response Time
0,RICHMOND / STATEN ISLAND,0.840686,10 PM - 2 AM,0.80663,Phone,0.957765,NonMedical Emergencies,0.946594,1.0,1.0,0.0,1
1,BRONX,0.791903,10 PM - 2 AM,0.806821,PD Link/Medical,0.686953,Medical MFAs,0.871638,1.0,0.0,0.0,1
2,BROOKLYN,0.831623,10 PM - 2 AM,0.806821,PD Link/Medical,0.686953,Medical Emergencies,0.640499,1.0,0.0,0.0,1
3,QUEENS,0.819562,10 PM - 2 AM,0.80663,Phone,0.957765,NonMedical MFAs,0.94875,3.0,2.0,1.0,1
4,QUEENS,0.819512,10 PM - 2 AM,0.80681,Private Fire Alarm,0.985822,NonMedical Emergencies,0.946592,1.0,1.0,1.0,1


In [None]:
pd.DataFrame(df).to_csv(r"C:\Users\Jason\Downloads\CleanedFireIncidentData.csv")