In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import seaborn as sns
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from sklearn.model_selection import GridSearchCV

# from .. import data_preprocess

# Show every column and every row when a DataFrame is displayed.
# The fully qualified `display.*` keys are used on purpose: the short forms
# ('max_columns', 'max_rows') are treated as patterns, and on newer pandas they
# match more than one option and raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Common Code

In [21]:
# Files supplied by the competition for model training.
# X_train holds the full feature table; y_train keeps only the `total_cases`
# column of the labels file, so it is a one-column DataFrame (not a Series).
X_train = pd.read_csv('../../data/dengue_features_train.csv')
y_train = pd.read_csv('../../data/dengue_labels_train.csv', usecols=['total_cases'])

In [22]:
# Files supplied by the competition for submission.
# X_test holds the features to predict on. Despite its name, y_test is loaded
# from submission_format.csv: it is the template that predictions are written
# into, not a set of known labels.
X_test = pd.read_csv('../../data/dengue_features_test.csv')
y_test = pd.read_csv('../../data/submission_format.csv')

In [23]:
def data_preprocess(df):
    """Return a copy of ``df`` with its non-numeric columns dropped or encoded.

    The ``week_start_date`` column is removed, and ``city`` is replaced by a
    binary flag: 1 where the value equals ``'iq'``, 0 for anything else.
    The input frame itself is left unchanged.
    """
    def _city_flag(city_code):
        # 'iq' -> 1, every other value -> 0
        if city_code == 'iq':
            return 1
        return 0

    features = df.drop(columns=['week_start_date'])
    features['city'] = features['city'].apply(_city_flag)
    return features

In [24]:
def create_submission_file(pipeline, filename_comment):
    """Predict on the competition test set and write a numbered submission CSV.

    Relies on the notebook-level ``X_test`` (test features) and ``y_test``
    (submission template) frames. ``y_test['total_cases']`` is overwritten in
    place with the integer predictions before the frame is saved.

    Parameters
    ----------
    pipeline : fitted object exposing ``predict``.
    filename_comment : str
        Free-text suffix appended to the output file name.

    Returns
    -------
    The predictions rounded to the nearest integer value (as returned by
    ``np.rint``, i.e. before the integer cast applied to the saved column).
    """
    # Reserve the id first so a failure here leaves y_test untouched.
    submission_id = generate_next_submission_fileid()

    test_features = data_preprocess(X_test)
    rounded_predictions = np.rint(pipeline.predict(test_features))

    y_test['total_cases'] = rounded_predictions
    y_test['total_cases'] = y_test['total_cases'].astype(int)

    output_path = f'../../data/dengue_submission_{submission_id}_{filename_comment}.csv'
    y_test.to_csv(output_path, index=False)
    return rounded_predictions

In [25]:
def generate_next_submission_fileid(data_dir="../../data"):
    """Return the next unused submission id as a zero-padded string.

    Scans ``data_dir`` for files named ``dengue_submission_<digits>...`` and
    returns the highest id found plus one, padded to at least two digits
    (e.g. ``'05'`` after ``dengue_submission_04_foo.csv``).

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the submission files. Defaults to the notebook's
        data directory.

    Returns
    -------
    str
        ``'01'`` when no numbered submission file exists yet; otherwise the
        largest existing id + 1.

    Raises
    ------
    OSError
        If ``data_dir`` cannot be listed.
    """
    import re  # only needed here; kept local so the cell is self-contained

    id_pattern = re.compile(r'dengue_submission_(\d+)')
    used_ids = []
    for file in os.listdir(data_dir):
        match = id_pattern.match(file)
        # Files without a numeric id (e.g. notes) are ignored instead of
        # crashing the int() conversion.
        if match:
            used_ids.append(int(match.group(1)))

    # Compare ids numerically (not as strings) so ids above 99 keep working,
    # and fall back to 0 when the directory has no submissions yet.
    return f'{max(used_ids, default=0) + 1:02}'

## Notebook-specific code

### Other Estimators to try:
https://www.analyticsvidhya.com/blog/2021/01/a-quick-overview-of-regression-algorithms-in-machine-learning/ <br>
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble <br>
- AdaBoost
- XGBoost
- SVM
- KNN
- Linear Regression (incl. L1 regularization)
- Time series models (ARIMA, etc.)

In [30]:
from sklearn.ensemble import AdaBoostRegressor

In [35]:
def cross_validate(X, y, estimator, cv, scaler=StandardScaler(), imputer=KNNImputer(n_neighbors = 5), dim_reduction=PCA(n_components = 9)):
    """Cross-validate a scaler -> imputer -> dim-reduction -> estimator pipeline.

    For every split produced by ``cv`` a fresh copy of the pipeline is fitted
    on the training part, and the mean absolute error on both the training and
    validation parts is printed. After the loop the mean of each is printed,
    and a pipeline refitted on *all* of ``X``/``y`` is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected with ``.iloc``.
    y : array-like
        Target values. A single-column 2-D target (e.g. a one-column
        DataFrame) is flattened to 1-D.
    estimator : scikit-learn regressor
        Final step of the pipeline.
    cv : object with a ``split(X, y)`` method
        Yields ``(train_indices, validation_indices)`` pairs.
    scaler, imputer, dim_reduction : scikit-learn transformers
        Preprocessing steps, applied in that order before the estimator.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A pipeline fitted on the complete data set. The objects passed in
        (including the default instances) are only used as templates and are
        left unfitted; use the returned pipeline for predictions.
    """
    # scikit-learn is already a dependency of this notebook; the import is
    # local so this cell does not rely on another cell having been edited.
    from sklearn.base import clone

    template = Pipeline(steps=[
        ('scaler', scaler),
        ('imputer', imputer),
        ('dim_reduction', dim_reduction),
        ('estimator', estimator)
    ])

    # A one-column 2-D target makes scikit-learn emit a conversion warning on
    # every fit; flatten it once here instead.
    y_values = np.asarray(y)
    if y_values.ndim == 2 and y_values.shape[1] == 1:
        y_values = y_values.ravel()

    mae_list_train = []
    mae_list_val = []

    for train_idxs, val_idxs in cv.split(X, y_values):
        # Clone per fold: the default step instances are created once at
        # function-definition time, so fitting them directly would let
        # separate calls overwrite each other's fitted state.
        fold_pipeline = clone(template)

        X_fold_train, y_fold_train = X.iloc[train_idxs], y_values[train_idxs]
        X_fold_val, y_fold_val = X.iloc[val_idxs], y_values[val_idxs]

        fold_pipeline.fit(X_fold_train, y_fold_train)

        mae_train = mean_absolute_error(y_fold_train, fold_pipeline.predict(X_fold_train))
        print(f'Train MAE = {mae_train}')
        mae_list_train.append(mae_train)

        mae_val = mean_absolute_error(y_fold_val, fold_pipeline.predict(X_fold_val))
        print(f'Validation MAE = {mae_val}')
        mae_list_val.append(mae_val)

    print(f'MAE Train Mean: {np.mean(mae_list_train)}')
    print(f'MAE Val Mean: {np.mean(mae_list_val)}')

    # Refit on every available row. Returning the last fold's model would
    # silently leave that fold's validation rows out of the model that is
    # later used to produce the submission.
    final_pipeline = clone(template)
    final_pipeline.fit(X, y_values)
    return final_pipeline

In [41]:
# Encode `city` and drop the date column before modelling.
X_processed = data_preprocess(X_train)

# The base learner is left at AdaBoost's default. The old `base_estimator`
# keyword is not passed explicitly because newer scikit-learn releases renamed
# it to `estimator` and then removed it; omitting it works on every version.
estimator = AdaBoostRegressor(n_estimators=100,
                              learning_rate=0.01,
                              loss='linear',
                              random_state=42)

# Plain (unshuffled) 5-fold split. `random_state` only has an effect together
# with shuffle=True, and newer scikit-learn releases reject it otherwise, so it
# is dropped; the folds are the same as before.
cv = KFold(n_splits=5)
pipe = cross_validate(X_processed, y_train, estimator, cv)

  y = column_or_1d(y, warn=True)


Train MAE = 12.346005164001712
Validation MAE = 38.62703779586685


  y = column_or_1d(y, warn=True)


Train MAE = 16.561957878050077
Validation MAE = 23.09315873582804


  y = column_or_1d(y, warn=True)


Train MAE = 19.54743226760971
Validation MAE = 29.64050657364647


  y = column_or_1d(y, warn=True)


Train MAE = 20.795974186897507
Validation MAE = 13.395170483785469


  y = column_or_1d(y, warn=True)


Train MAE = 21.580445229141468
Validation MAE = 8.979725135424621
MAE Train Mean: 18.166362945140094
MAE Val Mean: 22.747119744910286


In [37]:
# Predict on the test set with the fitted pipeline and write the next numbered
# submission CSV. Note: despite its name, `sub_file` receives the rounded
# predictions returned by create_submission_file, not a file path or handle.
sub_file = create_submission_file(pipe, "AdaBoost_initial")

In [None]:
# Load a previously written submission file and preview its first rows.
# NOTE(review): the file name is hard-coded to submission 04, so this does not
# show the "AdaBoost_initial" file written above — confirm this is intended.
y_sub = pd.read_csv('../../data/dengue_submission_04_testing_submission_script_2.csv')
y_sub.head()