In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
rooth_path = 'data/'
deng_sj_train = pd.read_csv(rooth_path + 'deng_sj_train.csv')
deng_iq_train = pd.read_csv(rooth_path + 'deng_iq_train.csv')
deng_sj_val = pd.read_csv(rooth_path + 'deng_sj_val.csv')
deng_iq_val = pd.read_csv(rooth_path + 'deng_iq_val.csv')
test_sj = pd.read_csv(rooth_path + 'test_sj_.csv')
test_iq = pd.read_csv(rooth_path + 'test_iq_.csv')

In [3]:
def pred_regressor(model, regressor, X_train, X_val, y_train, y_val, params=None):  
    
    #Grid Search
    grid_search = GridSearchCV(estimator=regressor, param_grid=params, cv=5, n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_val)
    
    #Predictions
    predictions = {'y_pred' : y_pred}
    df_predictions = pd.DataFrame.from_dict(predictions)
    
    print ("Iquitos Test  MAE error :", mean_absolute_error(y_pred, y_val))
    
    return grid_search.best_estimator_, df_predictions

In [4]:
def rfr(X_train, X_val, y_train, y_val):
    
    model = 'Random Forest Regressor'
    regressor = RandomForestRegressor(random_state=42, n_jobs=-1)
    gs_params = {'n_estimators': [90,100,200,300], 'criterion': ['mae'], 'max_features': ['sqrt', 'auto', 'log2'],
                'min_samples_split' : [7,8,9,10,12,13], 'max_depth': [2,3,4,5]}

    regressor_rfr, pred_rfr = pred_regressor(model, regressor, X_train, X_val, y_train, y_val, params=gs_params)
    
    print('\nBest Parameters:\n',regressor_rfr)
    
    return regressor_rfr, pred_rfr

### San Juan

In [5]:
X_train_sj = deng_sj_train.drop(columns = ['week_start_date', 'city', 'year', 'weekofyear', 'total_cases'])
y_train_sj = deng_sj_train['total_cases']
X_val_sj = deng_sj_val.drop(columns = ['week_start_date', 'city', 'year', 'weekofyear', 'total_cases'])
y_val_sj = deng_sj_val['total_cases']

In [6]:
regressor_rfr_sj, pred_rfr_sj = rfr(X_train_sj, X_val_sj, y_train_sj, y_val_sj)

Iquitos Test  MAE error : 19.570322580645158

Best Parameters:
 RandomForestRegressor(criterion='mae', max_depth=3, min_samples_split=12,
                      n_jobs=-1, random_state=42)


### Iquitos

In [7]:
X_train_iq = deng_iq_train.drop(columns = ['week_start_date', 'city', 'year', 'weekofyear', 'total_cases'])
y_train_iq = deng_iq_train['total_cases']
X_val_iq = deng_iq_val.drop(columns = ['week_start_date', 'city', 'year', 'weekofyear', 'total_cases'])
y_val_iq = deng_iq_val['total_cases']

In [8]:
regressor_rfr_iq, pred_rfr_iq = rfr(X_train_iq, X_val_iq, y_train_iq, y_val_iq)

Iquitos Test  MAE error : 3.646057692307692

Best Parameters:
 RandomForestRegressor(criterion='mae', max_depth=2, max_features='log2',
                      min_samples_split=8, n_estimators=200, n_jobs=-1,
                      random_state=42)
