In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import seaborn as sns
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from sklearn.model_selection import GridSearchCV

# from .. import data_preprocess

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

## Common Code

In [2]:
# Files supplied by the competition for model training
X_train = pd.read_csv('../../data/dengue_features_train.csv')
y_train = pd.read_csv('../../data/dengue_labels_train.csv', usecols=['total_cases'])

In [3]:
# Files supplied by the competition for submission
X_test = pd.read_csv('../../data/dengue_features_test.csv')
y_test = pd.read_csv('../../data/submission_format.csv')

In [4]:
def data_preprocess(df):
    # drop or encode categorical cols
    df_processed = df.drop('week_start_date', axis=1)
    df_processed['city'] = df_processed['city'].apply(lambda x : 1 if x=='iq' else 0)
    return df_processed

In [61]:
def create_submission_file(pipeline, filename_comment):
    next_file_id = generate_next_submission_fileid()
    X_test_processed = data_preprocess(X_test)
    y_submit_pred = np.rint(pipeline.predict(X_test_processed))
    y_test['total_cases'] = y_submit_pred
    y_test['total_cases'] = y_test['total_cases'].astype(int)
    filename = f'../../data/dengue_submission_{next_file_id}_{filename_comment}.csv'
    y_test.to_csv(filename, index = False)
    
    return y_submit_pred, filename

In [6]:
def generate_next_submission_fileid():
    files_found = []
    for file in os.listdir("../../data"):
        if file.startswith("dengue_submission"):
            files_found.append(file[18:20])
    return f'{int(sorted(files_found).pop()) + 1 :02}'

## Notebook-specific code

### Other Estimators to try:
https://www.analyticsvidhya.com/blog/2021/01/a-quick-overview-of-regression-algorithms-in-machine-learning/ <br>
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble <br>
- AdaBoost
- XGBoost
- SVM
- KNN
- Linear Regression (incl L1 reg)
- Time Series (ARIMA, etc)

In [7]:
from sklearn.ensemble import AdaBoostRegressor

In [8]:
def cross_validate(X, y, estimator, cv, scaler=StandardScaler(), imputer=KNNImputer(n_neighbors = 5), dim_reduction=PCA(n_components = 9)):
    pipeline = Pipeline(steps=[
        ('scaler', scaler),
        ('imputer', imputer),
        ('dim_reduction', dim_reduction),
        ('estimator', estimator)
    ])
    
    #X_train, y_train, X_val, y_val = train_test_split(X, y, test_size=.2, random_state=42)
    mae_list_train = []
    mae_list_val = []
    
    for train_idxs, val_idxs in cv.split(X, y):
        X_train, y_train = X.iloc[train_idxs], y.iloc[train_idxs]
        pipeline.fit(X_train, y_train)
        y_pred_train = pipeline.predict(X_train)
        print(f'Train MAE = {mean_absolute_error(y_train, y_pred_train)}')
        mae_list_train.append(mean_absolute_error(y_train, y_pred_train))
        
        X_val, y_val = X.iloc[val_idxs], y.iloc[val_idxs]
        y_pred_val = pipeline.predict(X_val)

        print(f'Validation MAE = {mean_absolute_error(y_val, y_pred_val)}')
        mae_list_val.append(mean_absolute_error(y_val, y_pred_val))
    
    print(f'MAE Train Mean: {np.mean(mae_list_train)}')
    print(f'MAE Val Mean: {np.mean(mae_list_val)}')

    return pipeline    

In [65]:
def tune_model(X_processed, y_train, pipe, param_grid):
    

    # do gridsearch being sure to set scoring to MAE
    gridsearch = GridSearchCV(estimator=pipe, param_grid=params, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1).fit(X_processed, y_train)
    print(f'Score with entire training dataset:{-gridsearch.score(X_processed, y_train)}')

    best_params = gridsearch.best_params_
    print(best_params)
    
    return gridsearch

In [67]:
X_processed = data_preprocess(X_train)

# define pipeline
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', KNNImputer()),
    ('dim_reduction', PCA()),
    ('estimator', AdaBoostRegressor())
])

# define parameter ranges in dict
# use dounble underscore to link pipline object with param name -
# - use the label created when defining the pipe for the test left of the '__'
params = {
    'imputer__n_neighbors' : np.arange(1,10,2),
    'dim_reduction__n_components' : np.arange(2,10,2),
    'estimator__n_estimators' : np.arange(0,100,25),
    'estimator__learning_rate' : np.arange(.02, 1.01, .1)
    #'estimator__loss' : ['linear', 'square', 'exponential']
}

grid_pipe = tune_model(X_processed, y_train, pipe, params)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 554 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 1120 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done 1850 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done 2740 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 3794 tasks      | elapsed:  1.5min


Score with entire training dataset:19.01419401445714
{'dim_reduction__n_components': 6, 'estimator__learning_rate': 0.02, 'estimator__n_estimators': 25, 'imputer__n_neighbors': 3}


[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  1.6min finished
  return f(**kwargs)


In [68]:
# GridSearch returns the estimator so we can call .predict() on it!
# so just pass the gridsearch object to create_submission_file()
y_pred_sub, filename = create_submission_file(grid_pipe, "adaboost_gridsearch_test")
y_pred_sub = pd.read_csv(filename)
y_pred_sub.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,20
1,sj,2008,19,19
2,sj,2008,20,19
3,sj,2008,21,19
4,sj,2008,22,20


### Before writing tune_model function
So delete when confirm new process works as expected

In [29]:
X_processed = data_preprocess(X_train)

# define pipeline
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', KNNImputer()),
    ('dim_reduction', PCA()),
    ('estimator', AdaBoostRegressor())
])

# define parameter ranges in dict
# use dounble underscore to link pipline object with param name -
# - use the label created when defining the pipe for the test left of the '__'
params = {
    'imputer__n_neighbors' : np.arange(1,10,2),
    'dim_reduction__n_components' : np.arange(2,10,2),
    'estimator__n_estimators' : np.arange(0,100,25),
    'estimator__learning_rate' : np.arange(.02, 1.01, .1)
    #'estimator__loss' : ['linear', 'square', 'exponential']
}

# do gridsearch being sure to set scoring to MAE
gridsearch = GridSearchCV(estimator=pipe, param_grid=params, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1).fit(X_processed, y_train)
print(f'Score with entire training dataset:{-gridsearch.score(X_processed, y_train)}')

best_params = gridsearch.best_params_
print(best_params)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 554 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 1120 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 1850 tasks      | elapsed:   38.9s
[Parallel(n_jobs=-1)]: Done 2740 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 3794 tasks      | elapsed:  1.5min


Score with entire training dataset:19.156325813099276
{'dim_reduction__n_components': 6, 'estimator__learning_rate': 0.02, 'estimator__n_estimators': 25, 'imputer__n_neighbors': 9}


[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  1.6min finished
  return f(**kwargs)


In [62]:
# GridSearch returns the estimator so we can call .predict() on it!
# so just pass the gridsearch object to create_submission_file()
y_pred_sub, filename = create_submission_file(gridsearch, "adaboost_gridsearch_2")

In [63]:
sub_file

array([19., 19., 19., 19., 19., 29., 19., 32., 32., 32., 19., 32., 45.,
       38., 40., 44., 46., 45., 44., 40., 48., 37., 32., 31., 29., 20.,
       19., 32., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19.,
       19., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19.,
       19., 19., 19., 19., 19., 19., 19., 31., 31., 38., 38., 42., 40.,
       42., 42., 42., 44., 44., 43., 38., 42., 42., 44., 49., 59., 42.,
       35., 32., 29., 32., 31., 19., 19., 19., 19., 19., 19., 19., 19.,
       19., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19.,
       31., 29., 19., 27., 31., 44., 44., 46., 32., 38., 42., 44., 42.,
       32., 42., 44., 44., 55., 44., 44., 44., 44., 38., 44., 38., 43.,
       32., 29., 32., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19.,
       19., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19., 19.,
       19., 19., 19., 17., 19., 31., 19., 33., 32., 42., 38., 43., 32.,
       44., 32., 42., 32., 42., 42., 42., 44., 32., 46., 46., 32

In [64]:
y_sub = pd.read_csv(filename)
y_sub.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,19
1,sj,2008,19,19
2,sj,2008,20,19
3,sj,2008,21,19
4,sj,2008,22,19
