### Deng AI Poisson Regression Notebook
Notebook to test Poisson regression for the Deng AI Competition

#### Prep Environment and Data

In [1]:
#Run the utils notebook
%run DengAI-Utils.ipynb

Environment is: keras


In [2]:
#pre-process the data
df_x_train_sj, df_y_train_sj, df_x_test_sj, df_y_test_sj, df_x_valid_sj, df_y_valid_sj, df_x_pred_sj, df_y_pred_sj=pre_process_data(
    city='sj',
    train_split=0.8,
    test_split=0.2,
    valid_split=0.0,
    xy_split=0.0,
    time_series_split=True)
df_x_train_iq, df_y_train_iq, df_x_test_iq, df_y_test_iq, df_x_valid_iq, df_y_valid_iq, df_x_pred_iq, df_y_pred_iq=pre_process_data(
    city='iq',
    train_split=0.8,
    test_split=0.2,
    valid_split=0.0,
    xy_split=0.0,
    time_series_split=True)

#set target field
target='total_cases'

#### Linear Model for Comparison

In [4]:
from sklearn.linear_model import LinearRegression

#create and fit the model. Use the scaled training and test feature data against the unscaled target
lr_sj=LinearRegression().fit(df_x_train_sj,df_y_train_sj[target])

#Look at scores
print('Training set score sj: {:.2f}'.format(lr_sj.score(df_x_train_sj,df_y_train_sj[target])))
print('Test set score sj: {:.2f}'.format(lr_sj.score(df_x_test_sj,df_y_test_sj[target])))

#create and fit the model. Use the scaled training and test feature data against the unscaled target
lr_iq=LinearRegression().fit(df_x_train_iq,df_y_train_iq[target])

#Look at scores
print('Training set score iq: {:.2f}'.format(lr_iq.score(df_x_train_iq,df_y_train_iq[target])))
print('Test set score iq: {:.2f}'.format(lr_iq.score(df_x_test_iq,df_y_test_iq[target])))
print(target)
evaluate_results(lr_sj,lr_iq,df_x_test_sj,df_x_test_iq,df_y_test_sj,df_y_test_iq,target)


Training set score sj: 0.71
Test set score sj: -7.14
Training set score iq: 1.00
Test set score iq: -11.17
total_cases
MAE of SJ: 74.40815575842913
MAE of IQ: 31.143989009513533
MAE of Combined: 35.017590860263134


#### Poisson Model

In [3]:
from sklearn.linear_model import PoissonRegressor

sj_samples=df_x_train_sj.shape[0]
iq_samples=df_x_train_iq.shape[0]

alpha_sj=1e-6/sj_samples
alpha_iq=1e-6/iq_samples

pr_sj=PoissonRegressor(alpha=alpha_sj,max_iter=300)
pr_sj.fit(df_x_train_sj,df_y_train_sj[target])

#Look at scores
print('Training set score sj: {:.2f}'.format(pr_sj.score(df_x_train_sj,df_y_train_sj[target])))
print('Test set score sj: {:.2f}'.format(pr_sj.score(df_x_test_sj,df_y_test_sj[target])))

pr_iq=PoissonRegressor(alpha=alpha_iq,max_iter=300)
pr_iq.fit(df_x_train_iq,df_y_train_iq[target])

#Look at scores
print('Training set score sj: {:.2f}'.format(pr_iq.score(df_x_train_iq,df_y_train_iq[target])))
print('Test set score sj: {:.2f}'.format(pr_iq.score(df_x_test_iq,df_y_test_iq[target])))

#Evaluate results
evaluate_results(pr_sj,pr_iq,df_x_test_sj,df_x_test_iq,df_y_test_sj,df_y_test_iq,target)


Training set score sj: 0.85
Test set score sj: -0.17
Training set score sj: 0.93
Test set score sj: -2.03
MAE of SJ: 19.597102355337448
MAE of IQ: 7.492804995513625
MAE of Combined: 15.45494736528367


In [27]:
'''Create the submission file'''
#Use the trained model to make predictions on the holdout set
y_pred_sj=pr_sj.predict(df_x_pred_sj)
y_pred_iq=pr_iq.predict(df_x_pred_iq)

#Create the holdout file
create_submit_file(y_pred_sj,y_pred_iq)

#### Grid Search for Poisson Model

In [4]:
#Start timer
from datetime import datetime
s_time=datetime.now()

sj_samples=df_x_train_sj.shape[0]
iq_samples=df_x_train_iq.shape[0]

alpha_sj=1e-6/sj_samples
alpha_iq=1e-6/iq_samples

#Define param grid
p_grid={
    'alpha':[alpha_sj,alpha_iq,0.0001,0.001,0.01,0.1,1,10],
    'max_iter':[100,200,300,500],
    'fit_intercept':[True,False],
    'tol':[0,1,5,10],
    'warm_start':[True,False],
}

from sklearn.linear_model import PoissonRegressor

#run param grid for sj
grid_sj=GridSearchCV(PoissonRegressor(),p_grid,cv=5,return_train_score=True)
grid_sj.fit(df_x_train_sj,df_y_train_sj[target])

#run param grid for sj
grid_iq=GridSearchCV(PoissonRegressor(),p_grid,cv=5,return_train_score=True)
grid_iq.fit(df_x_train_iq,df_y_train_iq[target])

#Evaluate results
print('Score of SJ train: ' + str(grid_sj.score(df_x_train_sj,df_y_train_sj[target])))
print('Score of SJ test: ' + str(grid_sj.score(df_x_test_sj,df_y_test_sj[target])))
print('Score of IQ train: ' + str(grid_iq.score(df_x_train_iq,df_y_train_iq[target])))
print('Score of IQ test: ' + str(grid_iq.score(df_x_test_iq,df_y_test_iq[target])))
print('-'*25)
evaluate_results(grid_sj,grid_iq,df_x_test_sj,df_x_test_iq,df_y_test_sj,df_y_test_iq,target='total_cases')
print('-'*25)
#end timer
e_time=datetime.now()
exec_time=e_time-s_time
print('Execution time:' + str(exec_time))


Score of SJ train: 0.7199614903102085
Score of SJ test: 0.14438594572062768
Score of IQ train: 0.647266386054991
Score of IQ test: 0.589192119535564
-------------------------
MAE of SJ: 16.310249927492933
MAE of IQ: 4.838453801830189
MAE of Combined: 12.384540226695796
-------------------------
Execution time:0:01:44.670367


In [12]:
print('Best params for sj: ' + str(grid_sj.best_params_))
print('Best params for iq: ' + str(grid_iq.best_params_))

Best params for sj: {'alpha': 10, 'fit_intercept': True, 'max_iter': 100, 'tol': 1, 'warm_start': True}
Best params for iq: {'alpha': 0.1, 'fit_intercept': True, 'max_iter': 100, 'tol': 1, 'warm_start': True}


In [13]:
#create the submission file from best model
sj_pred=grid_sj.predict(df_x_pred_sj)
iq_pred=grid_iq.predict(df_x_pred_iq)

#create submit file
create_submit_file(sj_pred,iq_pred)


#### Results
The poisson regression using grid search over the features selected for their strongest correlation with total_cases yielded a combined MAE of 37.60 - far worse than the baseline model. This is not surprising given the models assumptions.