In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import seaborn as sns
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from sklearn.model_selection import GridSearchCV

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

  import pandas.util.testing as tm


## Data Set Nomenclature
Train<br>
Validation - split off from training data to validate model<br>
Test - new data - the data that is used for competition submission<br>

In [2]:
# Files supplied by the competition for model training
X_train = pd.read_csv('../../data/dengue_features_train.csv')
y_train = pd.read_csv('../../data/dengue_labels_train.csv', usecols=['total_cases'])

In [3]:
# Files supplied by the competition for submission
X_test = pd.read_csv('../../data/dengue_features_test.csv')
y_test = pd.read_csv('../../data/submission_format.csv')

In [4]:
def data_preprocess(df):
    # drop or encode categorical cols
    df_processed = df.drop('week_start_date', axis=1)
    df_processed['city'] = df_processed['city'].apply(lambda x : 1 if x=='iq' else 0)
    return df_processed

In [5]:
def cross_validate(X, y, estimator, cv, scaler=StandardScaler(), imputer=KNNImputer(n_neighbors = 5), dim_reduction=PCA(n_components = 9)):
    pipeline = Pipeline(steps=[
        ('scaler', scaler),
        ('imputer', imputer),
        ('dim_reduction', dim_reduction),
        ('estimator', estimator)
    ])
    
    #X_train, y_train, X_val, y_val = train_test_split(X, y, test_size=.2, random_state=42)
    
    for train_idxs, val_idxs in cv.split(X, y):
        X_train, y_train = X.iloc[train_idxs], y.iloc[train_idxs]
        pipeline.fit(X_train, y_train)
        y_pred_train = pipeline.predict(X_train)
        print(f'Train MAE = {mean_absolute_error(y_train, y_pred_train)}')
        
        X_val, y_val = X.iloc[val_idxs], y.iloc[val_idxs]
        y_pred_val = pipeline.predict(X_val)

        print(f'Validation MAE = {mean_absolute_error(y_val, y_pred_val)}')

    return pipeline        

In [6]:
def create_submission_file(pipeline, filename_comment):
    next_file_id = generate_next_submission_filename()
    X_test_processed = data_preprocess(X_test)
    y_submit_pred = np.rint(pipeline.predict(X_test_processed))
    y_test['total_cases'] = y_submit_pred
    y_test['total_cases'] = y_test['total_cases'].astype(int)
    y_test.to_csv(f'../../data/dengue_submission_{next_file_id}_{filename_comment}.csv', index = False)
    return y_submit_pred

In [7]:
def generate_next_submission_filename():
    files_found = []
    for file in os.listdir("../../data"):
        if file.startswith("dengue_submission"):
            files_found.append(file[18:20])
    return f'{int(sorted(files_found).pop()) + 1 :02}'

In [10]:
X_processed = data_preprocess(X_train)

estimator = RandomForestRegressor(random_state=42,n_estimators=51)
cv = KFold(random_state = 42)
pipe = cross_validate(X_processed, y_train, estimator, cv)

  self._final_estimator.fit(Xt, y, **fit_params)


Train MAE = 4.781854322485008
Validation MAE = 41.75866236905721


  self._final_estimator.fit(Xt, y, **fit_params)


Train MAE = 7.0943027854918785
Validation MAE = 27.457988006199045


  self._final_estimator.fit(Xt, y, **fit_params)


Train MAE = 8.021543381301017
Validation MAE = 32.59369314736204


  self._final_estimator.fit(Xt, y, **fit_params)


Train MAE = 8.27785912648321
Validation MAE = 14.049659726433529


  self._final_estimator.fit(Xt, y, **fit_params)


Train MAE = 8.438357317175798
Validation MAE = 9.13476180850347


In [11]:
sub_file = create_submission_file(pipe, "testing_submission_script_2")

In [12]:
y_sub = pd.read_csv('../../data/dengue_submission_04_testing_submission_script_2.csv')
y_sub.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,6
1,sj,2008,19,7
2,sj,2008,20,8
3,sj,2008,21,9
4,sj,2008,22,9
