In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import seaborn as sns
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from sklearn.model_selection import GridSearchCV

# from .. import data_preprocess

# Show every column and row when a DataFrame is displayed.
# The fully-qualified option names are required: on pandas >= 1.4 the bare
# 'max_columns' / 'max_rows' patterns also match 'styler.render.max_columns' /
# 'styler.render.max_rows', and set_option raises
# OptionError("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Common Code

In [34]:
# Files supplied by the competition for model training.
# X_train holds the feature columns. y_train is read with usecols=['total_cases'],
# so it is a one-column DataFrame (not a 1-D Series) containing only that column.
X_train = pd.read_csv('../../data/dengue_features_train.csv')
y_train = pd.read_csv('../../data/dengue_labels_train.csv', usecols=['total_cases'])

In [35]:
# Files supplied by the competition for submission.
# X_test holds the features to predict on; y_test is the submission-format frame
# whose 'total_cases' column is set by create_submission_file() before it is saved.
X_test = pd.read_csv('../../data/dengue_features_test.csv')
y_test = pd.read_csv('../../data/submission_format.csv')

In [36]:
# Summary statistics for the training rows whose city code is 'sj'.
X_train.loc[X_train['city'] == 'sj'].describe()

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
count,936.0,936.0,745.0,887.0,917.0,917.0,927.0,930.0,930.0,930.0,930.0,930.0,930.0,930.0,927.0,930.0,930.0,930.0,930.0,930.0,930.0,930.0
mean,1998.826923,26.503205,0.057925,0.067469,0.177655,0.165956,35.470809,299.163653,299.27692,295.109519,301.398817,297.301828,30.465419,78.568181,35.470809,16.552409,2.516267,27.006528,6.757373,31.607957,22.600645,26.785484
std,5.212076,15.021909,0.107153,0.092479,0.057166,0.056073,44.606137,1.236429,1.218637,1.569943,1.258927,1.294705,35.628055,3.389488,44.606137,1.560923,0.498892,1.415473,0.835993,1.717297,1.506277,29.325811
min,1990.0,1.0,-0.40625,-0.4561,-0.015533,-0.063457,0.0,295.938571,296.114286,289.642857,297.8,292.6,0.0,66.735714,0.0,11.715714,1.357143,22.842857,4.528571,26.7,17.8,0.0
25%,1994.0,13.75,0.0045,0.016425,0.139283,0.129157,0.0,298.195,298.3,293.847857,300.4,296.3,10.825,76.246071,0.0,15.236429,2.157143,25.842857,6.2,30.6,21.7,6.825
50%,1999.0,26.5,0.0577,0.068075,0.177186,0.165971,20.8,299.254286,299.378571,295.464286,301.5,297.5,21.3,78.667857,20.8,16.845714,2.457143,27.228571,6.757143,31.7,22.8,17.75
75%,2003.0,39.25,0.1111,0.1152,0.212557,0.202771,52.18,300.132857,300.228571,296.418929,302.4,298.4,37.0,80.963214,52.18,17.858571,2.8,28.185714,7.285714,32.8,23.9,35.45
max,2008.0,53.0,0.4934,0.4371,0.393129,0.38142,390.6,302.2,302.164286,297.795714,304.3,299.9,570.5,87.575714,390.6,19.44,4.428571,30.071429,9.914286,35.6,25.6,305.9


In [37]:
# Summary statistics for the training rows whose city code is 'iq'.
X_train.loc[X_train['city'] == 'iq'].describe()

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
count,520.0,520.0,517.0,517.0,517.0,517.0,516.0,516.0,516.0,516.0,516.0,516.0,516.0,516.0,516.0,516.0,516.0,483.0,483.0,506.0,512.0,504.0
mean,2005.0,26.503846,0.263869,0.238783,0.250126,0.266779,64.245736,297.869538,299.133043,295.492982,307.082752,292.866667,57.609864,88.639117,64.245736,17.09611,9.206783,27.530933,10.566197,34.004545,21.19668,62.467262
std,2.918283,15.02945,0.08137,0.076751,0.077354,0.086345,35.218995,1.170997,1.332073,1.417229,2.38298,1.663069,50.286555,7.583889,35.218995,1.445769,2.448525,0.921769,1.535496,1.325261,1.260327,63.245958
min,2000.0,1.0,0.061729,0.03586,0.02988,0.064183,0.0,294.635714,294.892857,290.088571,300.0,286.9,0.0,57.787143,0.0,12.111429,3.714286,21.4,5.2,30.1,14.7,0.0
25%,2002.75,13.75,0.2,0.17954,0.194743,0.204129,39.105,297.0925,298.221429,294.593929,305.2,291.975,24.065,84.295,39.105,16.102857,7.371429,27.0,9.5,33.2,20.6,17.2
50%,2005.0,26.5,0.263643,0.232971,0.2498,0.262143,60.47,297.822857,299.121429,295.852143,307.05,293.05,46.44,90.917143,60.47,17.428571,8.964286,27.6,10.625,34.0,21.3,45.3
75%,2007.25,39.25,0.319971,0.293929,0.3023,0.32515,85.7575,298.649286,300.123214,296.548571,308.7,294.2,71.0725,94.563929,85.7575,18.180357,11.014286,28.1,11.655,34.9,22.0,85.95
max,2010.0,53.0,0.508357,0.454429,0.538314,0.546017,210.83,301.637143,302.928571,298.45,314.0,296.0,362.03,98.61,210.83,20.461429,16.028571,30.8,15.8,42.2,24.2,543.3


In [24]:
# Summary statistics for the submission feature set (no per-city filter applied).
X_test.describe()

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
count,416.0,416.0,373.0,405.0,415.0,415.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0,404.0,404.0,413.0,407.0,411.0
mean,2010.766827,26.439904,0.12605,0.126803,0.207702,0.201721,38.354324,298.818295,299.353071,295.419179,303.62343,295.743478,42.171135,82.49981,38.354324,16.927088,5.124569,27.369587,7.810991,32.534625,22.36855,34.278589
std,1.434835,14.978257,0.164353,0.14142,0.079102,0.092028,35.171126,1.469501,1.306233,1.523099,3.101817,2.761109,48.909514,7.378243,35.171126,1.557868,3.54287,1.232608,2.449718,1.920429,1.731437,34.655966
min,2008.0,1.0,-0.4634,-0.2118,0.0062,-0.014671,0.0,294.554286,295.235714,290.818571,298.2,286.2,0.0,64.92,0.0,12.537143,1.485714,24.157143,4.042857,27.2,14.2,0.0
25%,2010.0,13.75,-0.0015,0.015975,0.14867,0.134079,8.175,297.751429,298.323214,294.335714,301.425,293.5,9.43,77.397143,8.175,15.792857,2.446429,26.514286,5.928571,31.1,21.2,9.1
50%,2011.0,26.0,0.1101,0.0887,0.204171,0.186471,31.455,298.547143,299.328571,295.825,302.75,296.3,25.85,80.33,31.455,17.337143,2.914286,27.483333,6.642857,32.8,22.2,23.6
75%,2012.0,39.0,0.263329,0.2424,0.254871,0.253243,57.7725,300.240357,300.521429,296.643571,305.8,298.275,56.475,88.328929,57.7725,18.174643,8.171429,28.319048,9.8125,33.9,23.3,47.75
max,2013.0,53.0,0.5004,0.649,0.453043,0.529043,169.34,301.935714,303.328571,297.794286,314.1,299.7,301.4,97.982857,169.34,19.598571,14.485714,30.271429,14.725,38.4,26.7,212.0


In [38]:
def data_preprocess(df):
    """Return a copy of a feature frame with its non-numeric columns handled.

    The 'week_start_date' column is dropped and 'city' is replaced by an
    integer flag: 1 where the value equals 'iq', 0 for anything else.
    The frame passed in is left untouched.
    """
    def _city_flag(city):
        # 'iq' maps to 1, every other value maps to 0.
        if city == 'iq':
            return 1
        return 0

    without_date = df.drop(columns='week_start_date')
    without_date['city'] = without_date['city'].apply(_city_flag)
    return without_date

In [39]:
def create_submission_file(pipeline, filename_comment):
    """Predict on the competition test set and save a numbered submission CSV.

    Uses the notebook-level ``X_test`` (features) and ``y_test`` (submission
    template). ``y_test['total_cases']`` is overwritten in place with the
    rounded integer predictions before the frame is written to disk.

    Parameters
    ----------
    pipeline : fitted object exposing ``predict``
        Model used to predict on the preprocessed test features.
    filename_comment : str
        Free-text suffix appended to the submission file name.

    Returns
    -------
    tuple
        ``(rounded predictions as returned by np.rint, path of the CSV written)``.
    """
    submission_id = generate_next_submission_fileid()

    # Round to the nearest whole number of cases; np.rint keeps a float dtype.
    rounded_predictions = np.rint(pipeline.predict(data_preprocess(X_test)))

    # Store first, then cast the column to int for the CSV.
    y_test['total_cases'] = rounded_predictions
    y_test['total_cases'] = y_test['total_cases'].astype(int)

    output_path = f'../../data/dengue_submission_{submission_id}_{filename_comment}.csv'
    y_test.to_csv(output_path, index=False)

    return rounded_predictions, output_path

In [40]:
def generate_next_submission_fileid(data_dir="../../data"):
    """Return the next free submission id as a zero-padded string.

    Scans ``data_dir`` for files named ``dengue_submission_<id>...`` and returns
    the largest numeric ``<id>`` plus one, padded to at least two digits
    (e.g. ``'07'``). If no numbered submission file exists yet, ``'01'`` is
    returned instead of raising.

    Parameters
    ----------
    data_dir : str, optional
        Directory that holds the submission files. Defaults to the
        notebook's data folder.

    Returns
    -------
    str
        The next id, formatted with ``:02``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``os.listdir`` when ``data_dir`` does not exist.
    """
    prefix = "dengue_submission_"
    ids_found = []
    for file_name in os.listdir(data_dir):
        if not file_name.startswith(prefix):
            continue
        # Take the run of digits that directly follows the prefix. This avoids
        # the fixed [18:20] slice, so ids above 99 are read in full.
        remainder = file_name[len(prefix):]
        id_digits = remainder[:len(remainder) - len(remainder.lstrip("0123456789"))]
        # Files without a numeric id are ignored rather than crashing int().
        if id_digits:
            ids_found.append(int(id_digits))
    # Compare ids numerically; default=0 makes the first submission '01'.
    return f'{max(ids_found, default=0) + 1:02}'

## Notebook-specific code

### Other Estimators to try:
https://www.analyticsvidhya.com/blog/2021/01/a-quick-overview-of-regression-algorithms-in-machine-learning/ <br>
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble <br>
- AdaBoost
- XGBoost
- SVM
- KNN
- Linear Regression (incl L1 reg)
- Time Series (ARIMA, etc)

In [7]:
from sklearn.ensemble import AdaBoostRegressor

In [8]:
def cross_validate(X, y, estimator, cv, scaler=StandardScaler(), imputer=KNNImputer(n_neighbors = 5), dim_reduction=PCA(n_components = 9)):
    """Fit a scale -> impute -> reduce -> estimate pipeline on each CV fold.

    For every split produced by ``cv.split(X, y)`` the pipeline is refitted on
    the training rows, and the mean absolute error on both the training and
    the validation rows is printed. The mean of each list is printed at the end.

    Parameters
    ----------
    X, y : pandas objects
        Features and target; rows are selected positionally with ``.iloc``.
    estimator : estimator object
        Final step of the pipeline.
    cv : splitter exposing ``split(X, y)``
        Yields ``(train_indices, validation_indices)`` pairs.
    scaler, imputer, dim_reduction : transformer objects, optional
        First three pipeline steps. The default instances are created once,
        when the function is defined, so calls relying on the defaults share
        the same objects.

    Returns
    -------
    Pipeline
        The pipeline in the state left by the last fold's ``fit`` (it is not
        refitted on the full data).
    """
    pipeline = Pipeline(steps=[
        ('scaler', scaler),
        ('imputer', imputer),
        ('dim_reduction', dim_reduction),
        ('estimator', estimator),
    ])

    train_scores = []
    val_scores = []

    for fold_train, fold_val in cv.split(X, y):
        # Fit on this fold's training rows and score on those same rows.
        X_fit, y_fit = X.iloc[fold_train], y.iloc[fold_train]
        pipeline.fit(X_fit, y_fit)
        train_mae = mean_absolute_error(y_fit, pipeline.predict(X_fit))
        print(f'Train MAE = {train_mae}')
        train_scores.append(train_mae)

        # Score on the rows held out from this fold.
        X_holdout, y_holdout = X.iloc[fold_val], y.iloc[fold_val]
        val_mae = mean_absolute_error(y_holdout, pipeline.predict(X_holdout))

        print(f'Validation MAE = {val_mae}')
        val_scores.append(val_mae)

    print(f'MAE Train Mean: {np.mean(train_scores)}')
    print(f'MAE Val Mean: {np.mean(val_scores)}')

    return pipeline

In [9]:
def tune_model(X_processed, y_train, pipe, params):
    """Grid-search ``params`` over ``pipe`` and return the fitted search object.

    The search is scored with 'neg_mean_absolute_error' and runs on all
    available cores (``n_jobs=-1``). After fitting, the MAE of the search
    object on the data it was fitted with and the best parameter combination
    are printed.

    Parameters
    ----------
    X_processed : features passed to ``GridSearchCV.fit``
    y_train : target passed to ``GridSearchCV.fit``
    pipe : estimator or Pipeline to tune
    params : dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # The scorer is the negated MAE, so a higher score means a lower error.
    search = GridSearchCV(
        estimator=pipe,
        param_grid=params,
        scoring='neg_mean_absolute_error',
        verbose=2,
        n_jobs=-1,
    )
    search.fit(X_processed, y_train)

    # score() uses the same negated scorer; flip the sign to report a plain MAE.
    training_mae = -search.score(X_processed, y_train)
    print(f'Score with entire training dataset:{training_mae}')

    print(search.best_params_)

    return search

In [15]:
# Drop/encode the non-numeric columns before modelling.
X_processed = data_preprocess(X_train)

# define pipeline
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', KNNImputer()),
    ('dim_reduction', PCA()),
    ('estimator', AdaBoostRegressor())
])

# Define the parameter ranges in a dict.
# Use a double underscore to link a pipeline step with one of its parameters:
# the text left of the '__' is the step label given when defining the pipe,
# the text right of it is the parameter name.
params = {
    'imputer__n_neighbors' : np.arange(8,15,1),
    
    'dim_reduction__n_components' : np.arange(4,8,1),
    
    'estimator__n_estimators' : np.arange(20,41,5),
    'estimator__learning_rate' : np.arange(.0001, .01, .001)
    #'estimator__loss' : ['linear', 'square', 'exponential']
}

# y_train is a one-column DataFrame; pass the 'total_cases' column itself so the
# estimator receives a 1-D target and scikit-learn does not emit a
# DataConversionWarning (column-vector y) on every fit.
grid_pipe = tune_model(X_processed, y_train['total_cases'], pipe, params)

Fitting 5 folds for each of 1400 candidates, totalling 7000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   52.2s
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed:  4.3min


Score with entire training dataset:19.062854311366802
{'dim_reduction__n_components': 6, 'estimator__learning_rate': 0.0051, 'estimator__n_estimators': 20, 'imputer__n_neighbors': 11}


[Parallel(n_jobs=-1)]: Done 7000 out of 7000 | elapsed:  4.5min finished
  return f(**kwargs)


In [21]:
# A fitted GridSearchCV object exposes .predict(), so it can be passed to
# create_submission_file() in place of a plain pipeline.
y_pred_sub, filename = create_submission_file(grid_pipe, "adaboost_gridsearch_refined2")

# Read the written file back under its own name so the prediction array in
# y_pred_sub is not overwritten by a DataFrame.
y_sub = pd.read_csv(filename)
y_sub.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,18
1,sj,2008,19,18
2,sj,2008,20,18
3,sj,2008,21,18
4,sj,2008,22,18


### Before writing the `tune_model` function
Original inline version of the grid search. Delete this section once the new `tune_model` process is confirmed to work as expected.

In [None]:
# Drop/encode the non-numeric columns before modelling.
X_processed = data_preprocess(X_train)

# define pipeline
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', KNNImputer()),
    ('dim_reduction', PCA()),
    ('estimator', AdaBoostRegressor())
])

# Define the parameter ranges in a dict.
# Use a double underscore to link a pipeline step with one of its parameters:
# the text left of the '__' is the step label given when defining the pipe,
# the text right of it is the parameter name.
params = {
    'imputer__n_neighbors' : np.arange(1,10,2),
    'dim_reduction__n_components' : np.arange(2,10,2),
    # Start at 25, not 0: AdaBoostRegressor rejects n_estimators=0, so every
    # candidate using it would fail to fit and only waste grid-search time.
    'estimator__n_estimators' : np.arange(25,101,25),
    'estimator__learning_rate' : np.arange(.02, 1.01, .1)
    #'estimator__loss' : ['linear', 'square', 'exponential']
}

# do gridsearch being sure to set scoring to MAE
# y_train is a one-column DataFrame; pass the 'total_cases' column so the
# estimator receives a 1-D target (avoids the column-vector DataConversionWarning).
gridsearch = GridSearchCV(estimator=pipe, param_grid=params, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1).fit(X_processed, y_train['total_cases'])
print(f'Score with entire training dataset:{-gridsearch.score(X_processed, y_train["total_cases"])}')

best_params = gridsearch.best_params_
print(best_params)

In [None]:
# A fitted GridSearchCV object exposes .predict(), so the search object itself
# can be passed to create_submission_file() in place of a plain pipeline.
# Returns the rounded predictions and the path of the CSV that was written.
y_pred_sub, filename = create_submission_file(gridsearch, "adaboost_gridsearch_2")

In [None]:
# Show the path of the submission file written by the previous cell.
# (`sub_file` is not defined anywhere in this notebook; the path returned by
# create_submission_file() is stored in `filename`.)
filename

In [None]:
# Read the submission CSV back from disk and preview its first rows to check
# what was actually written.
y_sub = pd.read_csv(filename)
y_sub.head()