In [193]:
# seasonality
def seasonality(date):
    """
    Return the season given the date.

    date: Exception date 
    
    1 = winter
    2 = spring
    3 = summer 
    4 = Fall
    
    date thresholds are referenced from
    https://www.timeanddate.com/calendar/seasons.html
    """
    date = str(date)
    if date < "2020-03-19":
        return "winter"
    
    elif date < "2020-06-20":
        return "spring"
    
    elif date < "2020-09-22":
        return "summer"
    
    elif date < "2020-12-21":
        return "fall"
    
    elif date < "2021-03-20":
        return "winter"
    
    elif date < "2021-06-20":
        return "spring"
    
    elif date < "2021-09-22":
        return "summer"
    
    elif date < "2021-12-21":
        return "fall"
    
    elif date < "2022-03-20":
        return "winter"
    
    elif date < "2022-06-21":
        return "spring"
    
    elif date < "2022-09-22":
        return "summer"
    
    elif date < "2022-12-21":
        return "fall"
    
    else:
        return "winter"

In [194]:
import pandas as pd
import numpy as np

import sklearn as sk
from sklearn import model_selection,linear_model, metrics
from sklearn.model_selection import train_test_split

import xgboost as xgb
import optuna as op

In [195]:
crime_df = pd.read_csv("data/clean_crime_df.csv")
staff_df = pd.read_csv("data/clean_staff_df.csv")

In [196]:
crime_df['call_min']=crime_df['call_time'].apply(lambda x: x[3:5])

In [197]:
crime_df.head(10)

Unnamed: 0,Call Date,call_time,District,Sector,Call Type,year,month,day,call_hour,month_day,priority,month_num,call_min
0,2020-01-01,00:05:46,2,21,UNKNOWN TROUBLE,2020,Jan,1,0,Jan-01,high,1,5
1,2020-01-01,02:06:01,4,44,ASSAULT IN PROGRESS,2020,Jan,1,2,Jan-01,high,1,6
2,2020-01-01,00:37:32,5,55,FIRE,2020,Jan,1,0,Jan-01,high,1,37
3,2020-01-01,01:41:30,5,55,CHECK WELFARE,2020,Jan,1,1,Jan-01,medium,1,41
4,2020-01-01,00:46:26,1,13,DOMESTIC VIOLENCE,2020,Jan,1,0,Jan-01,high,1,46
5,2020-01-01,00:47:19,2,22,MVC INJURY,2020,Jan,1,0,Jan-01,high,1,47
6,2020-01-01,01:29:17,2,23,SUSPICIOUS VEHICLE,2020,Jan,1,1,Jan-01,medium,1,29
7,2020-01-01,00:57:54,1,15,IMPAIRED,2020,Jan,1,0,Jan-01,high,1,57
8,2020-01-01,01:58:55,5,51,INJURED PERSON,2020,Jan,1,1,Jan-01,medium,1,58
9,2020-01-01,00:24:38,2,21,SOUND OF GUNSHOTS,2020,Jan,1,0,Jan-01,high,1,24


In [198]:
crime_df['season'] = crime_df['Call Date'].apply(seasonality)

crime_df.head()

Unnamed: 0,Call Date,call_time,District,Sector,Call Type,year,month,day,call_hour,month_day,priority,month_num,call_min,season
0,2020-01-01,00:05:46,2,21,UNKNOWN TROUBLE,2020,Jan,1,0,Jan-01,high,1,5,winter
1,2020-01-01,02:06:01,4,44,ASSAULT IN PROGRESS,2020,Jan,1,2,Jan-01,high,1,6,winter
2,2020-01-01,00:37:32,5,55,FIRE,2020,Jan,1,0,Jan-01,high,1,37,winter
3,2020-01-01,01:41:30,5,55,CHECK WELFARE,2020,Jan,1,1,Jan-01,medium,1,41,winter
4,2020-01-01,00:46:26,1,13,DOMESTIC VIOLENCE,2020,Jan,1,0,Jan-01,high,1,46,winter


In [199]:
crime_df.dtypes

Call Date    object
call_time    object
District      int64
Sector        int64
Call Type    object
year          int64
month        object
day           int64
call_hour     int64
month_day    object
priority     object
month_num     int64
call_min     object
season       object
dtype: object

In [200]:
crime_df['District']= crime_df['District'].apply(str)
crime_df['Sector']= crime_df['Sector'].apply(str)
crime_df['day']= crime_df['day'].apply(str)
crime_df['call_hour']= crime_df['call_hour'].apply(str)
crime_df['month_num']= crime_df['month_num'].apply(str)
crime_df.dtypes

Call Date    object
call_time    object
District     object
Sector       object
Call Type    object
year          int64
month        object
day          object
call_hour    object
month_day    object
priority     object
month_num    object
call_min     object
season       object
dtype: object

In [201]:
priority_pivot = crime_df.pivot_table(values='year', index = ['District', 'Sector', 'month_num', 'day', 'call_hour','season'], columns='priority', aggfunc='sum').reset_index()
priority_pivot

priority,District,Sector,month_num,day,call_hour,season,high,low,medium
0,1,11,1,1,0,winter,6062.0,2021.0,2020.0
1,1,11,1,1,1,winter,2020.0,,2020.0
2,1,11,1,1,10,winter,2022.0,,2021.0
3,1,11,1,1,11,winter,,4043.0,2020.0
4,1,11,1,1,12,winter,,4040.0,
...,...,...,...,...,...,...,...,...,...
148462,5,55,9,9,5,summer,,2022.0,
148463,5,55,9,9,6,summer,2022.0,2020.0,4044.0
148464,5,55,9,9,7,summer,,2022.0,
148465,5,55,9,9,8,summer,,,4044.0


In [202]:
priority_pivot.isna().sum()

priority
District          0
Sector            0
month_num         0
day               0
call_hour         0
season            0
high         100220
low           78607
medium        29313
dtype: int64

In [203]:
priority_pivot = priority_pivot.fillna(0)
priority_pivot

priority,District,Sector,month_num,day,call_hour,season,high,low,medium
0,1,11,1,1,0,winter,6062.0,2021.0,2020.0
1,1,11,1,1,1,winter,2020.0,0.0,2020.0
2,1,11,1,1,10,winter,2022.0,0.0,2021.0
3,1,11,1,1,11,winter,0.0,4043.0,2020.0
4,1,11,1,1,12,winter,0.0,4040.0,0.0
...,...,...,...,...,...,...,...,...,...
148462,5,55,9,9,5,summer,0.0,2022.0,0.0
148463,5,55,9,9,6,summer,2022.0,2020.0,4044.0
148464,5,55,9,9,7,summer,0.0,2022.0,0.0
148465,5,55,9,9,8,summer,0.0,0.0,4044.0


In [204]:
# for col in priority_pivot.columns:
#     priority_pivot[col] = priority_pivot[col].apply(int)
    
# priority_pivot.dtypes

In [205]:
priority_pivot.shape

(148467, 9)

In [206]:
# randomize data
priority_pivot = priority_pivot.sample(148467, random_state = 888)
priority_pivot

priority,District,Sector,month_num,day,call_hour,season,high,low,medium
53868,2,23,4,4,8,spring,0.0,0.0,4042.0
124843,5,52,4,14,19,spring,0.0,4043.0,0.0
46318,2,22,5,17,16,spring,6063.0,4041.0,2022.0
52820,2,23,3,13,7,winter,0.0,4043.0,2020.0
112722,4,46,5,14,6,spring,0.0,2020.0,2021.0
...,...,...,...,...,...,...,...,...,...
70806,3,32,12,25,16,winter,0.0,0.0,2021.0
58607,2,24,12,11,12,fall,0.0,0.0,4043.0
80365,4,41,7,28,2,summer,0.0,2021.0,0.0
100250,4,44,5,29,4,spring,0.0,2020.0,0.0


In [207]:
priority_pivot.dtypes

priority
District      object
Sector        object
month_num     object
day           object
call_hour     object
season        object
high         float64
low          float64
medium       float64
dtype: object

In [208]:
priority_pivot.to_csv("data/ml_data.csv", index=False)

In [187]:
targets = ['high','low','medium']
# targets='high'

predictors = ['District', 'Sector', 
              'month_num', 'day', 
              'call_hour', 'season']

In [188]:
from sklearn.preprocessing import OneHotEncoder

df = priority_pivot.copy()

enc = OneHotEncoder(handle_unknown='ignore')
new_df = enc.fit_transform(df[predictors])
new_df

<148467x99 sparse matrix of type '<class 'numpy.float64'>'
	with 890802 stored elements in Compressed Sparse Row format>

In [189]:
X.columns

Index(['District', 'Sector', 'month_num', 'day', 'call_hour', 'season'], dtype='object', name='priority')

In [190]:
Y = priority_pivot[targets]
X = priority_pivot[predictors]

print(Y.head())
print(X.head())

priority    high     low  medium
53868     -999.0  -999.0  4042.0
124843    -999.0  4043.0  -999.0
46318     6063.0  4041.0  2022.0
52820     -999.0  4043.0  2020.0
112722    -999.0  2020.0  2021.0
priority District Sector month_num day call_hour  season
53868           2     23         4   4         8  spring
124843          5     52         4  14        19  spring
46318           2     22         5  17        16  spring
52820           2     23         3  13         7  winter
112722          4     46         5  14         6  spring


In [191]:
x_train, x_test, y_train, y_test = train_test_split(new_df, Y, random_state=888, test_size=.20)

In [192]:
xgb_model = xgb.XGBRegressor(tree_method='hist', objective='reg:squarederror')
xgb_model.fit(x_train, y_train)

y_pred = xgb_model.predict(x_train)
pd.DataFrame(y_pred)

Unnamed: 0,0,1,2
0,-148.062378,-134.587921,1728.354736
1,209.792297,-11.304429,2776.468506
2,423.922363,752.877625,3424.017090
3,72.372650,184.093903,2147.017090
4,-274.920074,15.838157,2013.869995
...,...,...,...
118768,215.296036,1339.370972,4455.435547
118769,-237.658783,696.607117,2332.799316
118770,1134.629395,983.565796,5538.190430
118771,1200.505615,594.010803,3759.917236


In [184]:
y_train

priority,high,low,medium
114960,2021.0,2021.0,-999.0
120692,-999.0,2020.0,6064.0
123312,-999.0,-999.0,2021.0
61133,2022.0,2020.0,-999.0
129640,-999.0,-999.0,2022.0
...,...,...,...
36738,-999.0,-999.0,2022.0
102711,2021.0,2020.0,4042.0
20469,2020.0,4042.0,-999.0
67571,2021.0,-999.0,4043.0


In [181]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [178]:
mean_squared_error(y_train, y_pred, squared=False)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


2236.6846758780775