## **DengAI Regression Notebook**
This notebook contains regression-based models for the DengAI competition.

### Environment and Data Prep

In [1]:
#Run the utils notebook inside this kernel so that everything it defines becomes available here.
#NOTE(review): the later cells call pre_process_data, evaluate_results, create_submit_file, np and
#mean_absolute_error without importing them -- presumably they are all provided by this %run; confirm.
%run DengAI-Utils.ipynb

Environment is: keras


In [None]:
#Build the per-city data frames. Each pre_process_data call returns eight frames, unpacked as
#x/y pairs for the train, test, valid and pred partitions. Both cities use the same arguments:
#train_split=0.8, test_split=0.2, valid_split=0.0, xy_split=0.0, time_series_split=True.
#NOTE(review): the exact meaning of xy_split and time_series_split lives in pre_process_data,
#which is not visible in this notebook -- confirm against the utils notebook.

#City 'sj'
(
    df_x_train_sj, df_y_train_sj,
    df_x_test_sj, df_y_test_sj,
    df_x_valid_sj, df_y_valid_sj,
    df_x_pred_sj, df_y_pred_sj,
) = pre_process_data(
    city='sj', train_split=0.8, test_split=0.2, valid_split=0.0, xy_split=0.0, time_series_split=True
)

#City 'iq'
(
    df_x_train_iq, df_y_train_iq,
    df_x_test_iq, df_y_test_iq,
    df_x_valid_iq, df_y_valid_iq,
    df_x_pred_iq, df_y_pred_iq,
) = pre_process_data(
    city='iq', train_split=0.8, test_split=0.2, valid_split=0.0, xy_split=0.0, time_series_split=True
)

#Column of the y frames that every model below is fitted against
target='total_cases'


#### Model Baseline
Use a basic linear regression to establish the baseline model performance

In [3]:
from sklearn.linear_model import LinearRegression

#Baseline for city sj: plain least-squares fit of the training features against the target column
lr_sj=LinearRegression()
lr_sj.fit(df_x_train_sj,df_y_train_sj[target])

#score() on a scikit-learn regressor is the R^2 of the predictions on the given data
r2_train_sj=lr_sj.score(df_x_train_sj,df_y_train_sj[target])
r2_test_sj=lr_sj.score(df_x_test_sj,df_y_test_sj[target])
print('Training set score sj: {:.2f}'.format(r2_train_sj))
print('Test set score sj: {:.2f}'.format(r2_test_sj))

#Same baseline for city iq
lr_iq=LinearRegression()
lr_iq.fit(df_x_train_iq,df_y_train_iq[target])

r2_train_iq=lr_iq.score(df_x_train_iq,df_y_train_iq[target])
r2_test_iq=lr_iq.score(df_x_test_iq,df_y_test_iq[target])
print('Training set score iq: {:.2f}'.format(r2_train_iq))
print('Test set score iq: {:.2f}'.format(r2_test_iq))

#Echo the target column name, then let the shared helper report the test-set MAE figures
print(target)
evaluate_results(lr_sj,lr_iq,df_x_test_sj,df_x_test_iq,df_y_test_sj,df_y_test_iq,target)


Training set score sj: 0.22
Test set score sj: -0.37
Training set score iq: 0.30
Test set score iq: 0.08
total_cases
MAE of SJ: 22.93956851413918
MAE of IQ: 8.304594216035644
MAE of Combined: 17.602800442341028


In [5]:
#Create the submission file from the linear-regression baselines.
#Predict on the pred (holdout) feature frames of both cities in one step...
y_pred_sj,y_pred_iq=lr_sj.predict(df_x_pred_sj),lr_iq.predict(df_x_pred_iq)

#...and hand both prediction arrays to the helper that writes the holdout file
create_submit_file(y_pred_sj,y_pred_iq)


**Use ridge regression as a comparative baseline**

In [122]:
from sklearn.linear_model import Ridge

#Ridge regression baseline: one model per city, both fitted with alpha=150 on the training
#features against the target column.
rr_sj=Ridge(alpha=150).fit(df_x_train_sj,df_y_train_sj[target])

#score() on a scikit-learn regressor is the R^2 of the predictions on the given data
print('Training set score sj: {:.2f}'.format(rr_sj.score(df_x_train_sj,df_y_train_sj[target])))
print('Test set score sj: {:.2f}'.format(rr_sj.score(df_x_test_sj,df_y_test_sj[target])))

rr_iq=Ridge(alpha=150).fit(df_x_train_iq,df_y_train_iq[target])

print('Training set score iq: {:.2f}'.format(rr_iq.score(df_x_train_iq,df_y_train_iq[target])))
print('Test set score iq: {:.2f}'.format(rr_iq.score(df_x_test_iq,df_y_test_iq[target])))

#Test MAE per city and combined.
#Reuse the evaluate_results helper that the linear-regression baseline cell already calls instead of
#a hand-copied MAE block, so every model is evaluated by the same code path.
#NOTE(review): the baseline cell's output shows the helper printing 'MAE of SJ', 'MAE of IQ' and
#'MAE of Combined'; confirm in the utils notebook that it does nothing beyond that.
evaluate_results(rr_sj,rr_iq,df_x_test_sj,df_x_test_iq,df_y_test_sj,df_y_test_iq,target)


Training set score sj: 0.13
Test set score sj: -0.19
Training set score iq: 0.09
Test set score iq: -0.02
MAE of SJ: 24.90159410921486
MAE of IQ: 7.650982091168224
MAE of Combined: 18.813142808727815


In [123]:
from sklearn.linear_model import Lasso

#Lasso regression baseline: one model per city, both fitted with alpha=150 on the training
#features against the target column.
#NOTE(review): the recorded output shows a training score of 0.00 for both cities, which suggests
#alpha=150 is strong enough to leave the model with no useful coefficients -- consider a much smaller
#alpha (or a cross-validated search) before treating this as a meaningful baseline.
lasso_sj=Lasso(alpha=150).fit(df_x_train_sj,df_y_train_sj[target])

#score() on a scikit-learn regressor is the R^2 of the predictions on the given data
print('Training set score sj: {:.2f}'.format(lasso_sj.score(df_x_train_sj,df_y_train_sj[target])))
print('Test set score sj: {:.2f}'.format(lasso_sj.score(df_x_test_sj,df_y_test_sj[target])))

lasso_iq=Lasso(alpha=150).fit(df_x_train_iq,df_y_train_iq[target])

print('Training set score iq: {:.2f}'.format(lasso_iq.score(df_x_train_iq,df_y_train_iq[target])))
print('Test set score iq: {:.2f}'.format(lasso_iq.score(df_x_test_iq,df_y_test_iq[target])))

#Test MAE per city and combined.
#Reuse the evaluate_results helper that the linear-regression baseline cell already calls instead of
#a hand-copied MAE block, so every model is evaluated by the same code path.
#NOTE(review): the baseline cell's output shows the helper printing 'MAE of SJ', 'MAE of IQ' and
#'MAE of Combined'; confirm in the utils notebook that it does nothing beyond that.
evaluate_results(lasso_sj,lasso_iq,df_x_test_sj,df_x_test_iq,df_y_test_sj,df_y_test_iq,target)

Training set score sj: 0.00
Test set score sj: -0.48
Training set score iq: 0.00
Test set score iq: -0.08
MAE of SJ: 28.80807828676783
MAE of IQ: 7.877219047619047
MAE of Combined: 21.420716202362375


**Baseline using a Linear SVM Model**

In [124]:
from sklearn.svm import LinearSVR

#Hyper-parameters shared by both city models: C is passed as LinearSVR's C, e as its epsilon
c=0.1
e=30.0

#Linear SVR for city sj; random_state is fixed so the fit is repeatable
svm_reg_sj=LinearSVR(epsilon=e, C=c, random_state=42)
svm_reg_sj.fit(df_x_train_sj,df_y_train_sj[target])

#Look at scores. The labels name the city (as the other baseline cells do) so the two pairs of
#lines in the output can be told apart.
print('Training set score sj: {:.2f}'.format(svm_reg_sj.score(df_x_train_sj,df_y_train_sj[target])))
print('Test set score sj: {:.2f}'.format(svm_reg_sj.score(df_x_test_sj,df_y_test_sj[target])))

#Linear SVR for city iq with the same settings
svm_reg_iq=LinearSVR(epsilon=e, C=c,random_state=42)
svm_reg_iq.fit(df_x_train_iq,df_y_train_iq[target])

print('Training set score iq: {:.2f}'.format(svm_reg_iq.score(df_x_train_iq,df_y_train_iq[target])))
print('Test set score iq: {:.2f}'.format(svm_reg_iq.score(df_x_test_iq,df_y_test_iq[target])))

#Test MAE per city and combined.
#Reuse the evaluate_results helper that the linear-regression baseline cell already calls instead of
#a hand-copied MAE block, so every model is evaluated by the same code path.
#NOTE(review): the baseline cell's output shows the helper printing 'MAE of SJ', 'MAE of IQ' and
#'MAE of Combined'; confirm in the utils notebook that it does nothing beyond that.
evaluate_results(svm_reg_sj,svm_reg_iq,df_x_test_sj,df_x_test_iq,df_y_test_sj,df_y_test_iq,target)

#Use the trained model to make predictions on the holdout set
y_pred_sj=svm_reg_sj.predict(df_x_pred_sj)
y_pred_iq=svm_reg_iq.predict(df_x_pred_iq)

#Create the holdout file
create_submit_file(y_pred_sj,y_pred_iq)

print('-'*25)
print('Submission File Creation complete')


Training set score: 0.07
Test set score: 0.18
Training set score: 0.04
Test set score: -0.08
MAE of SJ: 18.31039982659118
MAE of IQ: 7.537959468706822
MAE of Combined: 14.508362053220232
-------------------------
Submission File Creation complete


**Baseline Model Using Nonlinear SVM**

In [125]:
from sklearn.svm import SVR

#Kernel SVR for city sj: degree-2 polynomial kernel, C=100, epsilon=0
svr_reg_sj=SVR(kernel='poly', degree=2, C=100, epsilon=0)
svr_reg_sj.fit(df_x_train_sj,df_y_train_sj[target])

#Look at scores. The labels name the city (as the other baseline cells do) so the two pairs of
#lines in the output can be told apart.
print('Training set score sj: {:.2f}'.format(svr_reg_sj.score(df_x_train_sj,df_y_train_sj[target])))
print('Test set score sj: {:.2f}'.format(svr_reg_sj.score(df_x_test_sj,df_y_test_sj[target])))

#Kernel SVR for city iq with the same settings
svr_reg_iq=SVR(kernel='poly', degree=2, C=100, epsilon=0)
svr_reg_iq.fit(df_x_train_iq,df_y_train_iq[target])

print('Training set score iq: {:.2f}'.format(svr_reg_iq.score(df_x_train_iq,df_y_train_iq[target])))
print('Test set score iq: {:.2f}'.format(svr_reg_iq.score(df_x_test_iq,df_y_test_iq[target])))

#Test MAE per city and combined.
#Reuse the evaluate_results helper that the linear-regression baseline cell already calls instead of
#a hand-copied MAE block, so every model is evaluated by the same code path.
#NOTE(review): the baseline cell's output shows the helper printing 'MAE of SJ', 'MAE of IQ' and
#'MAE of Combined'; confirm in the utils notebook that it does nothing beyond that.
evaluate_results(svr_reg_sj,svr_reg_iq,df_x_test_sj,df_x_test_iq,df_y_test_sj,df_y_test_iq,target)

Training set score: 0.10
Test set score: 0.15
Training set score: 0.35
Test set score: -0.11
MAE of SJ: 17.823760292154823
MAE of IQ: 7.743770861261916
MAE of Combined: 14.266116963604384


In [93]:
#Create the submission file.
#The original cell predicted with `model_sj` / `model_iq`, names that no cell in this notebook
#assigns, so it only worked on leftover kernel state. It now uses the SVR models fitted in the
#cell directly above.
#NOTE(review): confirm the nonlinear SVR models are the ones intended for this submission.

#Use the trained model to make predictions on the holdout set
y_pred_sj=svr_reg_sj.predict(df_x_pred_sj)
y_pred_iq=svr_reg_iq.predict(df_x_pred_iq)

#Create the holdout file
create_submit_file(y_pred_sj,y_pred_iq)

print('Creation complete')

Creation complete


In [None]:
#Create a random forest regressor to get feature importances and select a set of features
from sklearn.feature_selection import SelectFromModel,RFE
from sklearn.ensemble import RandomForestRegressor

#Recursive feature elimination driven by a 100-tree random forest (seeded for repeatability),
#asked to keep 50 features.
#NOTE(review): `df_x_train_sj_sc`, `df_x_test_sj_sc` and `training_feature_list` are not created
#in any visible cell of this notebook -- presumably they come from the utils notebook or an earlier
#revision of the pre-processing; confirm before running this on a fresh kernel.
rfe_forest=RandomForestRegressor(n_estimators=100,random_state=1066)
select=RFE(rfe_forest,n_features_to_select=50)

#Fit the selector on the sj training features, then reduce train and test to the kept columns
rfe_train_input=df_x_train_sj_sc[training_feature_list]
select.fit(rfe_train_input,df_y_train_sj[target])
df_x_train_sj_sc_l1=select.transform(df_x_train_sj_sc[training_feature_list])
df_x_test_sj_sc_l1=select.transform(df_x_test_sj_sc[training_feature_list])

#Sanity check: (rows, selected features) of the reduced training matrix
print(df_x_train_sj_sc_l1.shape)

#### Results
Several of the regression models yielded decent results for train and test. However, when they were applied to the holdout data, they did not generalize well. Presumably this is because the holdout cases involve severe outbreaks.