In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

"dengue_features_train.csv" - split into 80 ("train") / 20 ("test")
"dengue_features_test.csv" - data for scoring on the DrivenData challende ("Hold out")

In [2]:
X = pd.read_csv('../../data/dengue_features_train.csv')
y = pd.read_csv('../../data/dengue_labels_train.csv', usecols=['total_cases'])

In [3]:
def data_preprocess(df):
    # drop or encode categorical cols
    df_processed = df.drop('week_start_date', axis=1)
    df_processed['city'] = df_processed['city'].apply(lambda x : 1 if x=='iq' else 0)
    return df_processed

In [4]:
X_processed = data_preprocess(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size = 0.20, random_state = 42, stratify = X_processed['city'])

In [31]:
n_neighbors = 5
n_estimators = 51
n_components = 9

In [7]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer',KNNImputer(n_neighbors = n_neighbors)),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators = n_estimators))
])

In [8]:
pipe.fit(X_train, y_train)
print('Testing score: ', pipe.score(X_test, y_test))

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Testing score:  0.5597929445295612


In [9]:
y_pred = pipe.predict(X_test)
print(f'MAE = {mean_absolute_error(y_test, y_pred)}')

MAE = 12.403082191780822


### Working first pipeline!  Scores match MAE from 'modeling.ipynb'

## Adding in PCA to Pipeline

In [32]:
pipe_2 = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer',KNNImputer(n_neighbors = n_neighbors)),
    ('dim_reduction', PCA(n_components = n_components)),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators = n_estimators))
])

In [33]:
pipe_2.fit(X_train, y_train)
y_pred_2 = pipe_2.predict(X_test)
print(f'MAE = {mean_absolute_error(y_test, y_pred_2)}')

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


MAE = 17.57299221058286


n_neighbors = 2
n_estimators = 30
n_components = 2
MAE = 19.86187214611872

n_neighbors = 5
n_estimators = 51
n_components = 9
MAE = 17.57299221058286

## Adding in GridSearchCV

In [34]:
# Define range for PCA n_components
n_components_to_test = np.arange(1,15, 2)
# define n_estimators for RandomForestRegressor
n_estimators_to_test = np.arange(1,202,25)
# define n_neighbors for KNNImputer
n_neighbors_to_test = np.arange(1,10, 2)

In [35]:
# use dounble underscore to link pipline object with param name -
# - use the label created when defining the pipe for the test left of the '__'
params = {'imputer__n_neighbors' : n_neighbors_to_test,
          'dim_reduction__n_components' : n_components_to_test,
          'regressor__n_estimators' : n_estimators_to_test}

In [36]:
gridsearch = GridSearchCV(pipe_2, params, verbose=2, n_jobs=-1).fit(X_train, y_train)
print(f'Final score is: {gridsearch.score(X_test, y_test)}')

Fitting 5 folds for each of 315 candidates, totalling 1575 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   52.1s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 1575 out of 1575 | elapsed:  3.2min finished
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Final score is: 0.11121885848683244


In [39]:
gridsearch.best_params_

{'dim_reduction__n_components': 9,
 'imputer__n_neighbors': 5,
 'regressor__n_estimators': 51}