# Combining Datasets

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline

# Helper function to split our data
from sklearn.model_selection import train_test_split

# Helper fuctions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, f1_score

# Import our Decision Tree
from sklearn.tree import DecisionTreeClassifier 

# Import our Random Forest 
from sklearn.ensemble import RandomForestClassifier

# Import the trees from sklearn
from sklearn import tree

# Helper functions to visualize our trees
from sklearn.tree import plot_tree, export_text

In [2]:
df = pd.read_csv('EMS-May-June.csv', nrows = 80000)

In [3]:
df.head()

Unnamed: 0,CAD_INCIDENT_ID,INCIDENT_DATETIME,INITIAL_CALL_TYPE,INITIAL_SEVERITY_LEVEL_CODE,FINAL_CALL_TYPE,FINAL_SEVERITY_LEVEL_CODE,FIRST_ASSIGNMENT_DATETIME,VALID_DISPATCH_RSPNS_TIME_INDC,DISPATCH_RESPONSE_SECONDS_QY,FIRST_ACTIVATION_DATETIME,...,ZIPCODE,POLICEPRECINCT,CITYCOUNCILDISTRICT,COMMUNITYDISTRICT,COMMUNITYSCHOOLDISTRICT,CONGRESSIONALDISTRICT,REOPEN_INDICATOR,SPECIAL_EVENT_INDICATOR,STANDBY_INDICATOR,TRANSFER_INDICATOR
0,241220001,05/01/2024 12:00:02 AM,DRUG,4,DRUG,4,05/01/2024 12:00:46 AM,Y,44,05/01/2024 12:01:06 AM,...,11230.0,70.0,40.0,314.0,22.0,9.0,N,N,N,N
1,241220002,05/01/2024 12:00:40 AM,STNDBM,8,STNDBM,8,05/01/2024 04:54:29 AM,Y,17629,05/01/2024 04:55:09 AM,...,11212.0,73.0,41.0,316.0,23.0,9.0,Y,N,N,N
2,241220003,05/01/2024 12:00:44 AM,STNDBM,8,STNDBM,8,05/01/2024 04:46:03 AM,Y,17119,05/01/2024 04:46:38 AM,...,11237.0,83.0,37.0,304.0,32.0,7.0,Y,N,N,N
3,241220004,05/01/2024 12:00:48 AM,STNDBY,8,STNDBY,8,05/01/2024 06:24:45 AM,Y,23037,05/01/2024 06:24:45 AM,...,10035.0,25.0,8.0,111.0,4.0,12.0,Y,N,Y,N
4,241220005,05/01/2024 12:00:59 AM,UNKNOW,4,DRUG,4,05/01/2024 12:01:14 AM,Y,15,05/01/2024 12:01:33 AM,...,10463.0,50.0,14.0,208.0,10.0,13.0,N,N,N,N


In [4]:
df.duplicated().sum()

0

In [5]:
df = df[['CAD_INCIDENT_ID', 'INCIDENT_DATETIME', 'INITIAL_CALL_TYPE', 'INITIAL_SEVERITY_LEVEL_CODE', 'FINAL_CALL_TYPE', 'BOROUGH', 'POLICEPRECINCT']]

In [6]:
df = df.dropna(subset=['POLICEPRECINCT'])

In [7]:
df.isnull().sum()

CAD_INCIDENT_ID                0
INCIDENT_DATETIME              0
INITIAL_CALL_TYPE              0
INITIAL_SEVERITY_LEVEL_CODE    0
FINAL_CALL_TYPE                0
BOROUGH                        0
POLICEPRECINCT                 0
dtype: int64

In [8]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['INITIAL_CALL_TYPE'] = encoder.fit_transform(df['INITIAL_CALL_TYPE'])
df['FINAL_CALL_TYPE'] = encoder.fit_transform(df['FINAL_CALL_TYPE'])
df['BOROUGH'] = encoder.fit_transform(df['BOROUGH'])

In [9]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79212 entries, 0 to 79999
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CAD_INCIDENT_ID              79212 non-null  int64  
 1   INCIDENT_DATETIME            79212 non-null  object 
 2   INITIAL_CALL_TYPE            79212 non-null  int64  
 3   INITIAL_SEVERITY_LEVEL_CODE  79212 non-null  int64  
 4   FINAL_CALL_TYPE              79212 non-null  int64  
 5   BOROUGH                      79212 non-null  int64  
 6   POLICEPRECINCT               79212 non-null  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 4.8+ MB
None


In [10]:
#df['incident_year'] = pd.to_datetime(df['INCIDENT_DATETIME']).dt.year
df['incident_month'] = pd.to_datetime(df['INCIDENT_DATETIME']).dt.month
df['day_of_week'] = pd.to_datetime(df['INCIDENT_DATETIME']).dt.dayofweek
df['incident_hour'] = pd.to_datetime(df['INCIDENT_DATETIME']).dt.hour
df['INCIDENT_DATETIME'] = pd.to_datetime(df['INCIDENT_DATETIME'])

In [11]:
df['POLICEPRECINCT'] = df['POLICEPRECINCT'].astype(int)

In [12]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79212 entries, 0 to 79999
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   CAD_INCIDENT_ID              79212 non-null  int64         
 1   INCIDENT_DATETIME            79212 non-null  datetime64[ns]
 2   INITIAL_CALL_TYPE            79212 non-null  int64         
 3   INITIAL_SEVERITY_LEVEL_CODE  79212 non-null  int64         
 4   FINAL_CALL_TYPE              79212 non-null  int64         
 5   BOROUGH                      79212 non-null  int64         
 6   POLICEPRECINCT               79212 non-null  int64         
 7   incident_month               79212 non-null  int64         
 8   day_of_week                  79212 non-null  int64         
 9   incident_hour                79212 non-null  int64         
dtypes: datetime64[ns](1), int64(9)
memory usage: 6.6 MB
None


### Adding Weather Data

In [13]:
weather_df = pd.read_csv('Weather-May-June.csv', nrows = 1200)

In [14]:
weather_df.head()

Unnamed: 0,time,temperature_2m (F),relative_humidity_2m (%),precipitation (inch),snow_depth (ft),cloud_cover (%),wind_speed_10m (mp/h),wind_gusts_10m (mp/h)
0,2024-05-01T00:00,51.7,93,0.0,0,100,4.7,8.5
1,2024-05-01T01:00,51.4,94,0.012,0,100,4.3,9.4
2,2024-05-01T02:00,51.4,96,0.031,0,99,2.7,8.1
3,2024-05-01T03:00,51.6,94,0.008,0,100,2.2,5.6
4,2024-05-01T04:00,51.6,94,0.0,0,100,1.9,4.3


In [15]:
weather_df['time'] = pd.to_datetime(weather_df['time'])
weather_df['time'].head()

0   2024-05-01 00:00:00
1   2024-05-01 01:00:00
2   2024-05-01 02:00:00
3   2024-05-01 03:00:00
4   2024-05-01 04:00:00
Name: time, dtype: datetime64[ns]

In [16]:
df['INCIDENT_DATETIME'] = pd.to_datetime(df['INCIDENT_DATETIME'])
df['incident_hour_rounded'] = df['INCIDENT_DATETIME'].dt.floor('H')

In [17]:
merged_df = pd.merge(df, weather_df, left_on='incident_hour_rounded', right_on='time', how='left')

In [18]:
# Drop the redundant 'time' column from weather_df after the merge
merged_df = merged_df.drop(columns=['time'])

# Preview the merged data
merged_df.head()

Unnamed: 0,CAD_INCIDENT_ID,INCIDENT_DATETIME,INITIAL_CALL_TYPE,INITIAL_SEVERITY_LEVEL_CODE,FINAL_CALL_TYPE,BOROUGH,POLICEPRECINCT,incident_month,day_of_week,incident_hour,incident_hour_rounded,temperature_2m (F),relative_humidity_2m (%),precipitation (inch),snow_depth (ft),cloud_cover (%),wind_speed_10m (mp/h),wind_gusts_10m (mp/h)
0,241220001,2024-05-01 00:00:02,22,4,25,1,70,5,2,0,2024-05-01,51.7,93,0.0,0,100,4.7,8.5
1,241220002,2024-05-01 00:00:40,65,8,74,1,73,5,2,0,2024-05-01,51.7,93,0.0,0,100,4.7,8.5
2,241220003,2024-05-01 00:00:44,65,8,74,1,83,5,2,0,2024-05-01,51.7,93,0.0,0,100,4.7,8.5
3,241220004,2024-05-01 00:00:48,66,8,75,2,25,5,2,0,2024-05-01,51.7,93,0.0,0,100,4.7,8.5
4,241220005,2024-05-01 00:00:59,83,4,25,0,50,5,2,0,2024-05-01,51.7,93,0.0,0,100,4.7,8.5


In [19]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79212 entries, 0 to 79211
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   CAD_INCIDENT_ID              79212 non-null  int64         
 1   INCIDENT_DATETIME            79212 non-null  datetime64[ns]
 2   INITIAL_CALL_TYPE            79212 non-null  int64         
 3   INITIAL_SEVERITY_LEVEL_CODE  79212 non-null  int64         
 4   FINAL_CALL_TYPE              79212 non-null  int64         
 5   BOROUGH                      79212 non-null  int64         
 6   POLICEPRECINCT               79212 non-null  int64         
 7   incident_month               79212 non-null  int64         
 8   day_of_week                  79212 non-null  int64         
 9   incident_hour                79212 non-null  int64         
 10  incident_hour_rounded        79212 non-null  datetime64[ns]
 11  temperature_2m (F)           79212 non-nu

In [20]:
merged_df.shape

(79212, 18)

### Random Forest

In [21]:
independent_variables = ['INITIAL_CALL_TYPE', 'INITIAL_SEVERITY_LEVEL_CODE', 'incident_month', 'day_of_week', 'incident_hour', 'POLICEPRECINCT', 'temperature_2m (F)', 'relative_humidity_2m (%)', 'cloud_cover (%)', 'precipitation (inch)', 'wind_speed_10m (mp/h)', 'wind_gusts_10m (mp/h)']

dependent_variable = 'FINAL_CALL_TYPE'

In [22]:
# Check the distribution of the target variable to identify rare classes
class_counts = df[dependent_variable].value_counts()
#print(class_counts[class_counts < 6])  # Show classes with fewer than 2 instances

# Filter out rows with classes that have fewer than 2 instances
model_df = merged_df[merged_df[dependent_variable].isin(class_counts[class_counts > 5].index)]

In [23]:
from sklearn.metrics import classification_report

X = merged_df[independent_variables]
y = merged_df[dependent_variable]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [24]:
model = RandomForestClassifier(oob_score=True)

In [25]:
model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.88      0.89       660
           2       0.71      0.76      0.73       316
           4       0.00      0.00      0.00         3
           6       0.79      0.89      0.84       112
           7       0.60      0.60      0.60       255
           8       0.76      0.69      0.73        75
          10       0.00      0.00      0.00         3
          11       0.22      0.18      0.20        11
          12       0.57      0.29      0.38        28
          13       0.75      0.77      0.76       865
          14       0.85      0.84      0.85      1110
          15       0.00      0.00      0.00         1
          16       0.83      0.62      0.71         8
          17       0.88      0.65      0.75        23
          18       0.00      0.00      0.00         4
          19       0.56      0.45      0.50        31
          20       0.68      0.71      0.69       178
          21       0.75    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
'''

model = DecisionTreeClassifier(max_depth=2)

X = df[independent_variables]

y = df[dependent_variable]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=45)

print('Shape of our X_train data:', X_train.shape, '\nShape of our y_test data:', y_test.shape)


model.fit(X_train,y_train)

y_pred = model.predict(X_test)


## Eval Model 
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred, average='weighted')
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print('F1 Score: %f' % f1)


# Calculate predicted probabilities
y_pred_proba = model.predict_proba(X_test)

# # Keep only the proba for True
y_pred_proba = y_pred_proba[:,1]

print(y_pred_proba.shape)

# Compute AUC score for multiclass classification with 'ovr' method
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba, multi_class='ovr', average='weighted')
print('AUC Score: %f' % auc)


# Produce classification Report
print(classification_report(y_test, y_pred))

'''

'\n\nmodel = DecisionTreeClassifier(max_depth=2)\n\nX = df[independent_variables]\n\ny = df[dependent_variable]\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=45)\n\nprint(\'Shape of our X_train data:\', X_train.shape, \'\nShape of our y_test data:\', y_test.shape)\n\n\nmodel.fit(X_train,y_train)\n\ny_pred = model.predict(X_test)\n\n\n## Eval Model \naccuracy = accuracy_score(y_true=y_test, y_pred=y_pred)\nprint("Accuracy Score: %f" % accuracy)\n\nprecision = precision_score(y_true=y_test, y_pred=y_pred, average=\'weighted\')\nprint("Precision Score: %f" % precision)\n\nrecall = recall_score(y_true=y_test, y_pred=y_pred, average=\'weighted\')\nprint("Recall Score: %f" % recall)\n\nf1 = f1_score(y_true=y_test, y_pred=y_pred, average=\'weighted\')\nprint(\'F1 Score: %f\' % f1)\n\n\n# Calculate predicted probabilities\ny_pred_proba = model.predict_proba(X_test)\n\n# # Keep only the proba for True\ny_pred_proba = y_pred_proba[:,1]\n\npr