In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

## Definitions
- "dengue_features_train.csv" - split into 80 ("train") / 20 ("test")
- "dengue_features_test.csv" - data for scoring on the DrivenData challende ("Hold out")

In [7]:
X = pd.read_csv('../../data/dengue_features_train.csv')
y = pd.read_csv('../../data/dengue_labels_train.csv', usecols=['total_cases'])

In [8]:
X_holdout = pd.read_csv('../../data/dengue_features_test.csv')
y_holdout = pd.read_csv('../../data/submission_format.csv')

In [9]:
def data_preprocess(df):
    # drop or encode categorical cols
    df_processed = df.drop('week_start_date', axis=1)
    df_processed['city'] = df_processed['city'].apply(lambda x : 1 if x=='iq' else 0)
    return df_processed

In [10]:
X_processed = data_preprocess(X)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size = 0.20, random_state = 42, stratify = X_processed['city'])

In [28]:
n_neighbors = 5
n_estimators = 51
n_components = 9

In [29]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer',KNNImputer(n_neighbors = n_neighbors)),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators = n_estimators))
])

In [30]:
pipe.fit(X_train, y_train)
print('Testing score: ', pipe.score(X_test, y_test))

  self._final_estimator.fit(Xt, y, **fit_params)


Testing score:  0.47389916261932774


In [31]:
y_pred = pipe.predict(X_test)
print(f'MAE = {mean_absolute_error(y_test, y_pred)}')

MAE = 12.830781627719581


### Working first pipeline!  Scores match MAE from 'modeling.ipynb'

## Adding in PCA to Pipeline

In [32]:
pipe_2 = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer',KNNImputer(n_neighbors = n_neighbors)),
    ('dim_reduction', PCA(n_components = n_components)),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators = n_estimators))
])

In [33]:
pipe_2.fit(X_train, y_train)
y_pred_2 = pipe_2.predict(X_test)
print(f'MAE = {mean_absolute_error(y_test, y_pred_2)}')

  self._final_estimator.fit(Xt, y, **fit_params)


MAE = 17.57299221058286


## MAE Scoring History
### 1) Initial Pipeline with PCA
- n_neighbors = 2
- n_estimators = 30
- n_components = 2
- **MAE = 19.86187214611872**

### 2) With GridSearchCV #1
- n_neighbors = 5
- n_estimators = 51
- n_components = 9
- **MAE = 17.57299221058286**

### 3) With GridSearchCV #2
- n_neighbors = 5 (1,10,1)
- n_estimators = 51 (1,202,10)
- n_components = 9 (1,15,1)
- **MAE = 17.57299221058286**

## Adding in GridSearchCV

In [13]:
# Define range for PCA n_components
n_components_to_test = np.arange(1,15, 1)
# define n_estimators for RandomForestRegressor
n_estimators_to_test = np.arange(1,202,10)
# define n_neighbors for KNNImputer
n_neighbors_to_test = np.arange(1,10, 1)

In [19]:
# use dounble underscore to link pipline object with param name -
# - use the label created when defining the pipe for the test left of the '__'
params = {'imputer__n_neighbors' : n_neighbors_to_test,
          'dim_reduction__n_components' : n_components_to_test,
          'regressor__n_estimators' : n_estimators_to_test}

In [15]:
gridsearch = GridSearchCV(pipe_2, params, verbose=2, n_jobs=-1).fit(X_train, y_train)
print(f'Final score is: {gridsearch.score(X_test, y_test)}')

Fitting 5 folds for each of 2646 candidates, totalling 13230 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 3277 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 19.1min
[Parallel(n_jobs=-1)]: Done 4897 tasks      | elapsed: 24.3min
[Parallel(n_jobs=-1)]: Done 5828 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 6841 tasks      | elapsed: 37.9min
[Parallel(n_jobs=-1)]: Done 7934 tasks      | elapsed: 46.5min
[Parallel(n_jobs=-1)]: Done 9109 tasks      | 

Final score is: 0.11121885848683244


In [24]:
best_params = gridsearch.best_params_
best_params

{'dim_reduction__n_components': 9,
 'imputer__n_neighbors': 5,
 'regressor__n_estimators': 51}

### Trying a smaller GridSearchCV window

In [17]:
# Define range for PCA n_components
n_components_to_test = np.arange(1,10, 2)
# define n_estimators for RandomForestRegressor
n_estimators_to_test = np.arange(1,102,20)
# define n_neighbors for KNNImputer
n_neighbors_to_test = np.arange(1,10, 2)

In [20]:
gridsearch_2 = GridSearchCV(pipe_2, params, verbose=2, n_jobs=-1).fit(X_train, y_train)
print(f'Final score is: {gridsearch.score(X_test, y_test)}')

Fitting 5 folds for each of 150 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:   55.3s
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:  2.4min finished
  self._final_estimator.fit(Xt, y, **fit_params)


Final score is: 0.11121885848683244


In [25]:
best_params_2 = gridsearch_2.best_params_
best_params_2

{'dim_reduction__n_components': 7,
 'imputer__n_neighbors': 5,
 'regressor__n_estimators': 101}

### N_Estimators wasn't allowed to be 51...interesting that it's now 101 (the max limit)

In [35]:
#  I don't love instanciating the pipeline in the function - let's see how this can be improved.

def pipe_mae_with_best_params(best_params):
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('imputer',KNNImputer(n_neighbors = best_params['imputer__n_neighbors'])),
        ('dim_reduction', PCA(n_components = best_params['dim_reduction__n_components'])),
        ('regressor', RandomForestRegressor(random_state=42, n_estimators = best_params['regressor__n_estimators']))
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(f'MAE = {mean_absolute_error(y_test, y_pred)}')
    return pipe, mean_absolute_error(y_test, y_pred)

In [36]:
_, best_param_mae = pipe_mae_with_best_params(best_params)

  self._final_estimator.fit(Xt, y, **fit_params)


MAE = 17.57299221058286


In [37]:
_, best_param_2_mae = pipe_mae_with_best_params(best_params_2)

  self._final_estimator.fit(Xt, y, **fit_params)


MAE = 17.78438220534382


In [39]:
def predict_submission(best_params, file_rev_name):
    pipe, _ = pipe_mae_with_best_params(best_params)
    y_submit_pred = np.rint(pipe.predict(X_holdout))
    y_holdout['total_cases'] = y_submit_pred
    y_holdout['total_cases'] = y_holdout['total_cases'].astype(int)
    # y_holdout.to_csv(f'../../data/dengue_submission_{file_rev_name}.csv', index = False)
    return y_holdout

In [42]:
y_submit = predict_submission(best_params,"03_pipeline_with_PCA")

  self._final_estimator.fit(Xt, y, **fit_params)


MAE = 17.57299221058286


ValueError: could not convert string to float: 'sj'

###  Lovely error - from trying to predict on pipeline steps that hand't been transformed...
#### Looks like this article could help us...
https://towardsdatascience.com/machine-learning-pipelines-with-scikit-learn-d43c32a6aa52

In [24]:
kfold = KFold(n_splits=5, random_state=42)



In [37]:
mae = - cross_val_score(pipe_2, X_processed, y, cv=kfold, scoring = 'neg_mean_absolute_error')

  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)


In [39]:
mae.mean()

24.99895301151106