In [1]:


from datetime import date, timedelta
import pandas as pd
import holidays


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

from sklearn import metrics

import sys, json, joblib


In [4]:

req_data = {"city": "New_York", "model":"XGBoost"}
df_year=pd.read_csv(f'Data/{req_data["city"]}_2001_2023.csv')

# print(df_year)

df_year = df_year.set_index('date')

df_year.index =  pd.to_datetime(df_year.index,format = '%Y-%m-%d %H:%M:%S')
df_year = df_year[(df_year.index >='2001') & (df_year.index <='2024') ]


# Create a list of dates from 1 Jan 2023 to 1 Jan 2024
start_date = df_year.head(1).index.date[0]
end_date = df_year.tail(1).index.date[0]

delta = timedelta(days=1)
dates = []
while start_date <= end_date:
    dates.append(start_date)
    start_date += delta
dates.append(start_date)
# Create a DataFrame
df_h = pd.DataFrame(dates, columns=['Date'])

# Get US holidays
us_holidays = holidays.US(state='NY')

# Create a column for holidays
df_h['Holiday'] = df_h['Date'].apply(lambda x: us_holidays.get(x))

df_h['Date'] = pd.to_datetime(df_h['Date'], format = '%Y-%m-%d')

df_h.set_index('Date',inplace=True)

df_h[df_h['Holiday'].isnull()] = 0
df_h[df_h['Holiday'] != 0] = 1

df_h = df_h.resample('h').ffill()

df_h = df_h['Holiday'].astype('int32')
df_h = df_h.to_frame('Holiday')


def create_features(df_yr):

    df = df_yr.copy()

    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    # df['weekday'] = df['date'].dt.day_name()
    # df['weekday'] = df['weekday'].map(weekday_mapping)
    df['quarter'] = df.index.quarter
    df['month'] = df.index.month
    df['year'] = df.index.year
    df['dayofyear'] = df.index.dayofyear
    df['dayofmonth'] = df.index.day
    df['weekofyear'] = df.index.isocalendar().week
    df['date_offset'] = (df.index.month*100 + df.index.day - 320)%1300

    df['season'] = pd.cut(df['date_offset'], [0, 300, 602, 900, 1300],
                          labels=[0, 1, 2, 3] # 0 = 'Spring', 1 = 'Summer', 2 = 'Fall', 3 = 'Winter'
                   )
    df['holiday'] = df_h["Holiday"]
    df=df.dropna()

    return df


FEATURES = ['hour','dayofweek','quarter','month','year',
          'dayofyear','dayofmonth','weekofyear',
           'season', 'holiday']
TARGET = 'Crime Count'


df_n = create_features(df_year)

X = df_n[FEATURES]# Input variable
y = df_n[TARGET] # Output variable
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=None, shuffle = False)


In [5]:
X_train['season'] = X_train['season'].astype(np.int32)
X_train['weekofyear'] = X_train['weekofyear'].astype(np.int32)
X_test['season'] = X_test['season'].astype(np.int32)
X_test['weekofyear'] = X_test['weekofyear'].astype(np.int32)

In [7]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2621 entries, 2023-09-13 19:00:00 to 2023-12-31 23:00:00
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   hour        2621 non-null   int32
 1   dayofweek   2621 non-null   int32
 2   quarter     2621 non-null   int32
 3   month       2621 non-null   int32
 4   year        2621 non-null   int32
 5   dayofyear   2621 non-null   int32
 6   dayofmonth  2621 non-null   int32
 7   weekofyear  2621 non-null   int32
 8   season      2621 non-null   int32
 9   holiday     2621 non-null   int32
dtypes: int32(10)
memory usage: 122.9 KB


In [90]:
 X_train.values.astype(np.float32)

array([[ 0.,  6.,  1., ..., 52.,  3.,  1.],
       [ 1.,  6.,  1., ..., 52.,  3.,  1.],
       [ 2.,  6.,  1., ..., 52.,  3.,  1.],
       ...,
       [16.,  2.,  3., ..., 37.,  1.,  0.],
       [17.,  2.,  3., ..., 37.,  1.,  0.],
       [18.,  2.,  3., ..., 37.,  1.,  0.]], dtype=float32)

In [10]:
model = joblib.load(f"Models/{req_data['city']}_{req_data['model']}.joblib")

In [156]:
import skl2onnx
from skl2onnx.common.data_types import FloatTensorType
import onnx

from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn, to_onnx, update_registered_converter
from skl2onnx.common.shape_calculator import (
    calculate_linear_regressor_output_shapes
)
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxmltools.convert import convert_xgboost as convert_xgboost_booster



In [178]:
from xgboost import XGBRegressor

In [179]:
update_registered_converter(
    XGBRegressor,
    "XGBoostXGBRegressor",
    calculate_linear_regressor_output_shapes,
    convert_xgboost,
)

In [180]:
new_column_names = {col: f'f{i}' for i, col in enumerate(X_train.columns)}
new_column_names

{'hour': 'f0',
 'dayofweek': 'f1',
 'quarter': 'f2',
 'month': 'f3',
 'year': 'f4',
 'dayofyear': 'f5',
 'dayofmonth': 'f6',
 'weekofyear': 'f7',
 'season': 'f8',
 'holiday': 'f9'}

In [181]:
X_train = X_train.rename(columns = new_column_names)
X_test = X_test.rename(columns = new_column_names)

In [182]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 140734 entries, 2001-01-01 00:00:00 to 2017-02-05 21:00:00
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   f0      140734 non-null  int32
 1   f1      140734 non-null  int32
 2   f2      140734 non-null  int32
 3   f3      140734 non-null  int32
 4   f4      140734 non-null  int32
 5   f5      140734 non-null  int32
 6   f6      140734 non-null  int32
 7   f7      140734 non-null  int32
 8   f8      140734 non-null  int32
 9   f9      140734 non-null  int32
dtypes: int32(10)
memory usage: 6.4 MB


In [183]:
if req_data['city'] == 'New_York':
  model = XGBRegressor( base_score=0.5, booster= 'gbtree',objective='reg:squarederror',
                            n_estimators=346,
                        early_stopping_rounds=10,
                          max_leaves=30,
                          learning_rate = 0.01, enable_categorical=True)  
if req_data['city'] == 'Chicago':
  model =  XGBRegressor( base_score=0.5, booster= 'gbtree',objective='reg:squarederror',
                            n_estimators=346,
                        early_stopping_rounds=10,
                          max_depth=11,
                          learning_rate =0.0091, enable_categorical=True)
# pipe = Pipeline([("xgb", model)])
# pipe.fit(X_train, y_train, xgb__eval_set=[(X_train, y_train), (X_test, y_test)],
#             xgb__verbose=False)
# Train the model
model.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=True)

[0]	validation_0-rmse:48.77101	validation_1-rmse:30.11709
[1]	validation_0-rmse:48.34244	validation_1-rmse:29.83314
[2]	validation_0-rmse:47.91787	validation_1-rmse:29.55205
[3]	validation_0-rmse:47.49727	validation_1-rmse:29.27356
[4]	validation_0-rmse:47.08060	validation_1-rmse:28.99792
[5]	validation_0-rmse:46.66783	validation_1-rmse:28.72482
[6]	validation_0-rmse:46.25892	validation_1-rmse:28.45457
[7]	validation_0-rmse:45.85385	validation_1-rmse:28.18694
[8]	validation_0-rmse:45.45257	validation_1-rmse:27.92201
[9]	validation_0-rmse:45.05504	validation_1-rmse:27.66010
[10]	validation_0-rmse:44.66131	validation_1-rmse:27.40052
[11]	validation_0-rmse:44.27120	validation_1-rmse:27.14442
[12]	validation_0-rmse:43.88475	validation_1-rmse:26.89028
[13]	validation_0-rmse:43.50196	validation_1-rmse:26.63849
[14]	validation_0-rmse:43.12276	validation_1-rmse:26.39042
[15]	validation_0-rmse:42.74710	validation_1-rmse:26.14375
[16]	validation_0-rmse:42.37498	validation_1-rmse:25.89964
[17]	va

In [184]:
model.score(X_test, y_test)

0.6179944730647522

In [185]:
X_train[:5].values.astype(np.int32)

array([[   0,    0,    1,    1, 2001,    1,    1,    1,    3,    1],
       [   1,    0,    1,    1, 2001,    1,    1,    1,    3,    1],
       [   2,    0,    1,    1, 2001,    1,    1,    1,    3,    1],
       [   3,    0,    1,    1, 2001,    1,    1,    1,    3,    1],
       [   4,    0,    1,    1, 2001,    1,    1,    1,    3,    1]])

In [186]:
model.predict(X_test[:5].values.astype(np.float32))

array([27.430922, 21.489073, 25.461348, 12.248466, 10.486635],
      dtype=float32)

In [187]:
initial_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]

onx = to_onnx(model, initial_types=initial_type, verbose=True, target_opset={"": 12, "ai.onnx.ml": 2})

[to_onnx] initial_types=[('float_input', FloatTensorType(shape=[None, 10]))]
[convert_sklearn] parse_sklearn_model
[convert_sklearn] convert_topology
[convert_operators] begin
[convert_operators] iteration 1 - n_vars=0 n_ops=1
[call_converter] call converter for 'XGBoostXGBRegressor'.
[convert_operators] end iter: 1 - n_vars=2
[convert_operators] iteration 2 - n_vars=2 n_ops=1
[convert_operators] end iter: 2 - n_vars=2
[convert_operators] end.
[_update_domain_version] +opset 0: name='ai.onnx.ml', version=1
[convert_topology] +opset: name='', version=12
[convert_sklearn] end


In [188]:
feature_names = [input.name for input in onx.graph.input]
print("Feature names:", feature_names)

Feature names: ['float_input']


In [189]:
import onnxruntime as rt
sess = rt.InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
pred_onx = sess.run(None, {input_name: X_test[:5].values.astype(np.float32)})
print("predict", pred_onx[0].ravel())

predict [27.430922 21.489073 25.461346 12.248466 10.486633]


In [190]:
with open(f"Onnx Models/{req_data['city']}_XGBoost.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [42]:
import onnxruntime as rt
sess = rt.InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
pred_onx = sess.run(None, {input_name: X_test[:5].values.astype(np.float32)})
print("predict", pred_onx[0].ravel())

predict [27.430922 21.489073 25.461346 12.248466 10.486633]


In [43]:
onnx_model = onnx.load(f"Onnx Models/{req_data['city']}_{req_data['model']}.onnx")
sess = rt.InferenceSession(onnx_model.SerializeToString(), providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
pred_onx = sess.run(None, {input_name: X_test.values.astype(np.float32)})
print("predict ", pred_onx[0].ravel())

predict  [ 27.43093   21.489077  25.461342 ...  29.809853  23.666615 336.1273  ]


Conversion of LightGBM

In [191]:
from lightgbm import LGBMRegressor
from skl2onnx.common.shape_calculator import (
    calculate_linear_regressor_output_shapes,
)  # noqa
from onnxmltools import __version__ as oml_version
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import (
    convert_lightgbm,
)  # noqa

In [192]:
if req_data['city'] == 'New_York':
    model = LGBMRegressor(force_col_wise=True, n_estimators=240, early_stopping_rounds=10,
                     max_depth=15, learning_rate =0.01, num_leaves=30, verbose=-1)
                            
if req_data['city'] == 'Chicago':
     model = LGBMRegressor(force_col_wise=True, n_estimators=400,
                          max_depth=14,
                          learning_rate =0.01, num_leaves=150, verbose=-1)
np.random.seed(0)
model.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_train, y_train)])
      

In [111]:
X_test.tail(20)

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-12-31 04:00:00,4,6,4,12,2023,365,31,52,3,0
2023-12-31 05:00:00,5,6,4,12,2023,365,31,52,3,0
2023-12-31 06:00:00,6,6,4,12,2023,365,31,52,3,0
2023-12-31 07:00:00,7,6,4,12,2023,365,31,52,3,0
2023-12-31 08:00:00,8,6,4,12,2023,365,31,52,3,0
2023-12-31 09:00:00,9,6,4,12,2023,365,31,52,3,0
2023-12-31 10:00:00,10,6,4,12,2023,365,31,52,3,0
2023-12-31 11:00:00,11,6,4,12,2023,365,31,52,3,0
2023-12-31 12:00:00,12,6,4,12,2023,365,31,52,3,0
2023-12-31 13:00:00,13,6,4,12,2023,365,31,52,3,0


In [193]:
model.score(X_test, y_test)

0.5503283010764283

In [194]:
model.predict(X_test[:5].values)

array([29.73603472, 24.5048727 , 27.82722156, 14.95360639, 12.56081023])

In [196]:
def skl2onnx_convert_lightgbm(scope, operator, container):
    options = scope.get_options(operator.raw_operator)
    if "split" in options:
        if pv.Version(oml_version) < pv.Version("1.9.2"):
            warnings.warn(
                "Option split was released in version 1.9.2 but %s is "
                "installed. It will be ignored." % oml_version
            )
        operator.split = options["split"]
    else:
        operator.split = None
    convert_lightgbm(scope, operator, container)


update_registered_converter(
    LGBMRegressor,
    "LightGbmLGBMRegressor",
    calculate_linear_regressor_output_shapes,
    skl2onnx_convert_lightgbm,
    options={"split": None},
)

In [197]:
initial_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]
onx = to_onnx(
    model, initial_types = initial_type, target_opset={"": 14, "ai.onnx.ml": 2}
)

In [198]:
import onnxruntime as rt
sess = rt.InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
pred_onx = sess.run(None, {input_name: X_test[:5].values.astype(np.float32)})
print("predict", pred_onx[0].ravel())
# And save.

predict [29.736048 24.504864 27.827211 14.953598 12.56081 ]


In [199]:
with open(f"Onnx Models/{req_data['city']}_Light_GBM.onnx", "wb") as f:
    f.write(onx.SerializeToString())

Convert to Onnx function

In [200]:
def Convert_to_onnx_model(reg_model):
    initial_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]
    onx_model = to_onnx(reg_model, initial_types=initial_type, target_opset={"": 14, "ai.onnx.ml": 2})
    return onx_model

Function to Predict using Onnx Model

In [201]:
def onnx_predict(onx_model):
    import onnxruntime as rt
    sess = rt.InferenceSession(onx_model.SerializeToString(), providers=["CPUExecutionProvider"])
    input_name = sess.get_inputs()[0].name
    pred_onx = sess.run(None, {input_name: X_test[:5].values.astype(np.float32)})
    return pred_onx

In [202]:
def save_onnx_model(onx_model, city, model):
    with open(f"Onnx Models/{city}_{model}.onnx", "wb") as f:
      f.write(onx_model.SerializeToString())

In [203]:
def onnx_load_runtime(city, model):
    onnx_model = onnx.load(f"Onnx Models/{city}_{model}.onnx")
    sess = rt.InferenceSession(onnx_model.SerializeToString(), providers=["CPUExecutionProvider"])
    input_name = sess.get_inputs()[0].name
    pred_onx = sess.run(None, {input_name: X_test.values.astype(np.float32)})
    print("predict ", pred_onx[0].ravel())

Decision Tree

In [213]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression


In [214]:
if req_data['city'] == 'New_York':
   model = DecisionTreeRegressor(max_features=0.7,
                          max_depth=20, min_samples_leaf=30)

if req_data['city'] == 'Chicago':
   model = DecisionTreeRegressor(max_features=0.7,
                          max_depth=13, min_samples_leaf=20)
                          
np.random.seed(0)
model.fit(X_train, y_train)

In [215]:
model.score(X_test,y_test)

0.5170570023916486

In [216]:
model.predict(X_test[:5].values.astype(np.float32))



array([28.9047619 , 22.55882353, 24.63333333, 14.88235294, 10.92934783])

In [217]:
onx = Convert_to_onnx_model(model)
onnx_predict(onx)[0].ravel()


array([28.904762, 22.558823, 24.633333, 14.882353, 10.929348],
      dtype=float32)

In [218]:
save_onnx_model(onx, req_data['city'], 'Decision_Tree')

Random Forest

In [219]:
if req_data['city'] == 'New_York':
   model = RandomForestRegressor( max_features=0.7,
                          n_estimators=1000,
                          max_depth=15, min_samples_leaf=30
                      )
if req_data['city'] == 'Chicago':
   model = RandomForestRegressor( max_features=0.7,
                n_estimators=346,
                     max_depth=13, min_samples_leaf=20
                     )
np.random.seed(0)
model.fit(X_train, y_train)

In [220]:
model.score(X_test,y_test)

0.5631832363173775

In [221]:
model.predict(X_test[:5].values)



array([28.91701496, 24.23446707, 25.03036903, 12.75430781, 10.77710911])

In [222]:
onx = Convert_to_onnx_model(model)
onnx_predict(onx)[0].ravel()

array([28.917015, 24.23448 , 25.03037 , 12.75431 , 10.777109],
      dtype=float32)

In [223]:
save_onnx_model(onx, req_data['city'], 'Random_Forest')

Linear Regression

In [228]:
model = LinearRegression()
model.fit(X_train, y_train)
model.score(X_test, y_test)

-0.2382545270428813

In [229]:
model.predict(X_test[:5].values)



array([41.31912462, 42.89357478,  7.75483443,  9.3292846 , 10.90373476])

In [230]:
onx = Convert_to_onnx_model(model)
onnx_predict(onx)[0].ravel()

array([41.318848, 42.89331 ,  7.754883,  9.329346, 10.903809],
      dtype=float32)

In [231]:
save_onnx_model(onx, req_data['city'], 'Linear')