In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# df = pd.read_csv('/kaggle/input/ny-ems-incident-dispatch-data/ems-incident-dispatch-data.csv', nrows=500000)
#df = pd.read_csv('/kaggle/input/ny-ems-incident-dispatch-data/ems-incident-dispatch-data.csv')

# Iterates through all columns and modifies the data type to reduce memory usage
def reduce_mem_usage(df):
    
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


# Creates a dataframe and optimizes its memory usage
def import_data(file):
    df = pd.read_csv(file, parse_dates=True)
    # df = reduce_mem_usage(df)
    return df



print('-' * 80)

train = import_data('EMS_Incident_Dispatch_Data_20241118.csv')




--------------------------------------------------------------------------------


In [3]:
df = train

In [4]:
df.shape

(279350, 31)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279350 entries, 0 to 279349
Data columns (total 31 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   CAD_INCIDENT_ID                 279350 non-null  int64  
 1   INCIDENT_DATETIME               279350 non-null  object 
 2   INITIAL_CALL_TYPE               279350 non-null  object 
 3   INITIAL_SEVERITY_LEVEL_CODE     279350 non-null  int64  
 4   FINAL_CALL_TYPE                 279350 non-null  object 
 5   FINAL_SEVERITY_LEVEL_CODE       279350 non-null  int64  
 6   FIRST_ASSIGNMENT_DATETIME       272560 non-null  object 
 7   VALID_DISPATCH_RSPNS_TIME_INDC  279350 non-null  object 
 8   DISPATCH_RESPONSE_SECONDS_QY    279350 non-null  int64  
 9   FIRST_ACTIVATION_DATETIME       271905 non-null  object 
 10  FIRST_ON_SCENE_DATETIME         261124 non-null  object 
 11  VALID_INCIDENT_RSPNS_TIME_INDC  279350 non-null  object 
 12  INCIDENT_RESPONS

In [6]:
df.isnull().sum()

CAD_INCIDENT_ID                        0
INCIDENT_DATETIME                      0
INITIAL_CALL_TYPE                      0
INITIAL_SEVERITY_LEVEL_CODE            0
FINAL_CALL_TYPE                        0
FINAL_SEVERITY_LEVEL_CODE              0
FIRST_ASSIGNMENT_DATETIME           6790
VALID_DISPATCH_RSPNS_TIME_INDC         0
DISPATCH_RESPONSE_SECONDS_QY           0
FIRST_ACTIVATION_DATETIME           7445
FIRST_ON_SCENE_DATETIME            18226
VALID_INCIDENT_RSPNS_TIME_INDC         0
INCIDENT_RESPONSE_SECONDS_QY       18295
INCIDENT_TRAVEL_TM_SECONDS_QY      18227
FIRST_TO_HOSP_DATETIME            110557
FIRST_HOSP_ARRIVAL_DATETIME       111219
INCIDENT_CLOSE_DATETIME               32
HELD_INDICATOR                         0
INCIDENT_DISPOSITION_CODE              0
BOROUGH                                0
INCIDENT_DISPATCH_AREA                 0
ZIPCODE                             2919
POLICEPRECINCT                      2907
CITYCOUNCILDISTRICT                 2907
COMMUNITYDISTRIC

In [7]:
df['INCIDENT_DATETIME'] = pd.to_datetime(df['INCIDENT_DATETIME'])

df['incident_year'] = df['INCIDENT_DATETIME'].dt.year
df['incident_month'] = df['INCIDENT_DATETIME'].dt.month
df['day_of_week'] = df['INCIDENT_DATETIME'].dt.dayofweek
df['incident_hour'] = df['INCIDENT_DATETIME'].dt.hour


df['INCIDENT_CLOSE_DATETIME'] = pd.to_datetime(df['INCIDENT_CLOSE_DATETIME'])
df['incident_duration'] = (df['INCIDENT_CLOSE_DATETIME'] - df['INCIDENT_DATETIME']).dt.total_seconds()
df['HELD_INDICATOR'] = df['HELD_INDICATOR'].apply(lambda x: 1 if x == 'Y' else 0)

  df['INCIDENT_DATETIME'] = pd.to_datetime(df['INCIDENT_DATETIME'])


In [8]:
print(df.loc[0])

CAD_INCIDENT_ID                                241220001
INCIDENT_DATETIME                    2024-05-01 00:00:02
INITIAL_CALL_TYPE                                   DRUG
INITIAL_SEVERITY_LEVEL_CODE                            4
FINAL_CALL_TYPE                                     DRUG
FINAL_SEVERITY_LEVEL_CODE                              4
FIRST_ASSIGNMENT_DATETIME         05/01/2024 12:00:46 AM
VALID_DISPATCH_RSPNS_TIME_INDC                         Y
DISPATCH_RESPONSE_SECONDS_QY                          44
FIRST_ACTIVATION_DATETIME         05/01/2024 12:01:06 AM
FIRST_ON_SCENE_DATETIME           05/01/2024 12:12:17 AM
VALID_INCIDENT_RSPNS_TIME_INDC                         Y
INCIDENT_RESPONSE_SECONDS_QY                       735.0
INCIDENT_TRAVEL_TM_SECONDS_QY                      691.0
FIRST_TO_HOSP_DATETIME                               NaN
FIRST_HOSP_ARRIVAL_DATETIME                          NaN
INCIDENT_CLOSE_DATETIME              2024-05-01 01:01:56
HELD_INDICATOR                 

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [22]:
target_col = 'FINAL_CALL_TYPE'
feature_cols = ['INITIAL_CALL_TYPE', 'INITIAL_SEVERITY_LEVEL_CODE', 'day_of_week', 'incident_hour', 'POLICEPRECINCT']

In [11]:
df.isnull().sum()

CAD_INCIDENT_ID                        0
INCIDENT_DATETIME                      0
INITIAL_CALL_TYPE                      0
INITIAL_SEVERITY_LEVEL_CODE            0
FINAL_CALL_TYPE                        0
FINAL_SEVERITY_LEVEL_CODE              0
FIRST_ASSIGNMENT_DATETIME           6790
VALID_DISPATCH_RSPNS_TIME_INDC         0
DISPATCH_RESPONSE_SECONDS_QY           0
FIRST_ACTIVATION_DATETIME           7445
FIRST_ON_SCENE_DATETIME            18226
VALID_INCIDENT_RSPNS_TIME_INDC         0
INCIDENT_RESPONSE_SECONDS_QY       18295
INCIDENT_TRAVEL_TM_SECONDS_QY      18227
FIRST_TO_HOSP_DATETIME            110557
FIRST_HOSP_ARRIVAL_DATETIME       111219
INCIDENT_CLOSE_DATETIME               32
HELD_INDICATOR                         0
INCIDENT_DISPOSITION_CODE              0
BOROUGH                                0
INCIDENT_DISPATCH_AREA                 0
ZIPCODE                             2919
POLICEPRECINCT                      2907
CITYCOUNCILDISTRICT                 2907
COMMUNITYDISTRIC

In [12]:
df.dropna(subset=feature_cols, inplace=True)

In [13]:
df.isnull().sum()

CAD_INCIDENT_ID                        0
INCIDENT_DATETIME                      0
INITIAL_CALL_TYPE                      0
INITIAL_SEVERITY_LEVEL_CODE            0
FINAL_CALL_TYPE                        0
FINAL_SEVERITY_LEVEL_CODE              0
FIRST_ASSIGNMENT_DATETIME           6684
VALID_DISPATCH_RSPNS_TIME_INDC         0
DISPATCH_RESPONSE_SECONDS_QY           0
FIRST_ACTIVATION_DATETIME           7328
FIRST_ON_SCENE_DATETIME            17922
VALID_INCIDENT_RSPNS_TIME_INDC         0
INCIDENT_RESPONSE_SECONDS_QY       17986
INCIDENT_TRAVEL_TM_SECONDS_QY      17923
FIRST_TO_HOSP_DATETIME            108763
FIRST_HOSP_ARRIVAL_DATETIME       109417
INCIDENT_CLOSE_DATETIME                0
HELD_INDICATOR                         0
INCIDENT_DISPOSITION_CODE              0
BOROUGH                                0
INCIDENT_DISPATCH_AREA                 0
ZIPCODE                               12
POLICEPRECINCT                         0
CITYCOUNCILDISTRICT                    0
COMMUNITYDISTRIC

## CatBoost Gradient Boosting

In [14]:
%pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [15]:
from catboost import CatBoostClassifier, Pool

In [23]:
# df['ZIPCODE'] = df['ZIPCODE'].astype('str')
df['POLICEPRECINCT'] = df['POLICEPRECINCT'].astype('str')

In [24]:
X = df[feature_cols]
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [27]:
categorical_features = ['INITIAL_CALL_TYPE', 'day_of_week', 'POLICEPRECINCT']

train_pool = Pool(X_train, y_train, feature_names=feature_cols, cat_features=categorical_features)
test_pool = Pool(X_test, y_test, feature_names=feature_cols, cat_features=categorical_features)

In [32]:
clf = CatBoostClassifier(
    loss_function='MultiClass',
    # eval_metric='Accuracy',
    iterations=50, depth=6, learning_rate=0.1,
	task_type='GPU',
)
clf.fit(train_pool, plot=True, verbose=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f3e7776bda0>

In [33]:
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

report = classification_report(y_test, y_pred)
print(f'Report: {report}')

Accuracy: 0.8300009647387969


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Report:               precision    recall  f1-score   support

       ABDPN       0.92      0.93      0.93      3395
      ACTIVE       0.00      0.00      0.00         5
      ALTMEN       0.88      0.76      0.81      1629
      AMPMIN       0.00      0.00      0.00        11
      ANAPFC       0.00      0.00      0.00         3
       ANAPH       0.88      0.92      0.90       568
      ARREST       0.80      0.70      0.75      1410
      ASTHMB       0.65      0.90      0.76       471
      BURNHM       0.00      0.00      0.00         3
      BURNHZ       0.00      0.00      0.00        13
      BURNMA       0.00      0.00      0.00        67
      BURNMI       0.00      0.00      0.00        76
        CARD       0.87      0.83      0.85      4501
      CARDBR       0.91      0.89      0.90      5477
      CARDFC       0.00      0.00      0.00         2
      CDBRFC       0.00      0.00      0.00         2
      CHILDA       0.00      0.00      0.00        14
       CHOKE       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
clf.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,INITIAL_SEVERITY_LEVEL_CODE,78.126015
1,INITIAL_CALL_TYPE,21.492085
2,POLICEPRECINCT,0.220506
3,incident_hour,0.161394
4,day_of_week,0.0


Model 1 : 83.0%
iter 50
depth 6
learn rate 0.1
features ['INITIAL_CALL_TYPE', 'INITIAL_SEVERITY_LEVEL_CODE', 'day_of_week', 'incident_hour', 'BOROUGH']


Model 2: 83.2% iter 100 depth 10 learn rate 0.1 features ['INITIAL_CALL_TYPE', 'INITIAL_SEVERITY_LEVEL_CODE', 'day_of_week', 'incident_hour', 'BOROUGH', 'incident_duration', 'ZIPCODE']