In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split

# Import local packages
from src.data_processing import load_csv_from_zip as lcfz
from src.data_processing import data_preprocessing as dpp

%matplotlib inline

## Import data

In [None]:
# Read train.csv from the zipped dataset and keep the first element of the reader's result.
train = lcfz.read_csv_from_zip(
    './../../data/input/bike-sharing-demand.zip',
    ['train.csv'],
)[0]

In [None]:
# NOTE: `train` is overwritten from itself, so re-running this cell feeds the
# already-processed frame back through both steps. Re-run the loading cell first.
# (presumably target_to_log log-transforms the target columns - confirm in dpp)
train = dpp.target_to_log(
    dpp.basic_prep_wrapper(train, ['temp'])
)

In [None]:
# The three target columns; every remaining column is used as a feature.
label = ['casual', 'registered', 'count']
features = train.columns.drop(label)

# Hold out 30% of the rows for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    train[features],
    train[label],
    test_size=0.3,
    random_state=42,
)

## Hyperparameter tuning of 3 models (casual, registered, count)

In [None]:
def hyperparam_tuning(X, y, target, param_grid=None, random_state=None):
    """Grid-search a random forest regressor for a single target column.

    Parameters
    ----------
    X : feature matrix handed to ``GridSearchCV.fit``.
    y : frame holding the target columns; only ``y[target]`` is used.
    target : name of the column of ``y`` to fit.
    param_grid : optional grid passed to ``GridSearchCV``. When omitted, the
        single-candidate grid ``[{"n_estimators": [900]}]`` is used. Pass e.g.
        ``[{"n_estimators": range(100, 1200, 100)}]`` to sweep the forest size.
    random_state : optional seed forwarded to ``RandomForestRegressor``.
        ``None`` (the default) leaves the forest unseeded.

    Returns
    -------
    tuple
        ``(best_estimator_, cv_results_)`` of the fitted grid search.
    """
    if param_grid is None:
        param_grid = [
            {
                "n_estimators": [900],
            }
        ]

    # Both the forest and the grid search are asked for all cores (n_jobs=-1).
    model = RandomForestRegressor(n_jobs=-1, random_state=random_state)

    gs = GridSearchCV(model, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)
    gs.fit(X, y[target].values.ravel())

    print("Best estimator is :\n")
    print(gs.best_estimator_)
    print("KFoldCV best score = {}".format(gs.best_score_))

    return gs.best_estimator_, gs.cv_results_

In [None]:
def model_performances(results):
    """Plot the mean test score against the number of estimators.

    ``results`` is converted to a DataFrame and must provide the
    ``param_n_estimators`` and ``mean_test_score`` columns.
    """
    scores = pd.DataFrame(results)
    axis = scores.plot(x='param_n_estimators', y='mean_test_score')
    axis.legend()
    plt.show()

In [None]:
def retrain_model(model, X, y, target):
    """Fit ``model`` on ``X`` against the flattened ``y[target]`` column.

    The model is fitted in place and the same object is returned.
    """
    target_values = y[target].values.ravel()
    model.fit(X, target_values)
    return model

In [None]:
def optimize_and_train(X, y, target):
    """Tune, plot and refit a model for one target column.

    Runs the grid search, plots its cross-validation results, then fits the
    selected estimator once more on ``X`` / ``y[target]`` and returns it.
    """
    best_model, cv_results = hyperparam_tuning(X, y, target)
    model_performances(cv_results)
    return retrain_model(best_model, X, y, target)

In [None]:
# One tuned model per target, trained in the order casual -> registered -> count.
mdl_casual, mdl_registered, mdl_count = [
    optimize_and_train(X_train, y_train, target_name)
    for target_name in ('casual', 'registered', 'count')
]

## Study model performances on the test set

In [None]:
def make_predictions(model, X):
    """Return ``expm1`` of the model's predictions for ``X``.

    Presumably this undoes a log1p transform applied to the targets during
    preprocessing - confirm against ``dpp.target_to_log``.
    """
    raw_predictions = model.predict(X)
    return np.expm1(raw_predictions)

In [None]:
def format_results(casual_preds, registered_preds, count_preds, y):
    """Build a results frame from the targets and the three prediction sets.

    ``expm1`` is applied to a copy of ``y``, the predictions are attached as
    the ``casual_preds``, ``registered_preds`` and ``count_preds`` columns,
    and every negative value in the frame is replaced by 0.
    """
    frame = np.expm1(y.copy())

    new_columns = (
        ('casual_preds', casual_preds),
        ('registered_preds', registered_preds),
        ('count_preds', count_preds),
    )
    for column_name, values in new_columns:
        frame[column_name] = values

    negative = frame < 0
    frame[negative] = 0
    return frame

In [None]:
def summarize_perfs(df):
    """Print RMSLE / RMSE and draw diagnostic plots for every target.

    Each of ``casual``, ``registered`` and ``count`` is compared with its
    ``<target>_preds`` column. A final "composite count" entry compares
    ``count`` with the sum of the casual and registered predictions.
    """
    def _report(name, actual, predicted):
        # Compute both metrics for one actual/predicted pair, then print them.
        rmsle = np.sqrt(mean_squared_log_error(actual, predicted))
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        print("Target is : {}".format(name))
        print("RMSLE = {}, \t RMSE = {}".format(rmsle, rmse))

    for target in ('casual', 'registered', 'count'):
        _report(target, df[target], df[target + '_preds'])
        plot_results(df, target)

    _report("composite count", df['count'], df['casual_preds'] + df['registered_preds'])
    plot_results(df, 'composite')

In [None]:
def plot_results(df, target):
    """Draw a 2x2 diagnostic figure comparing predictions with actual values.

    Parameters
    ----------
    df : frame holding the actual columns (``casual``, ``registered``,
        ``count``) and the matching ``*_preds`` columns.
    target : one of the actual column names, or ``'composite'`` to compare
        ``count`` with ``casual_preds + registered_preds``.

    Panels: actual vs predicted over the index, predicted vs actual scatter
    with the identity line, residuals over the index, residual histogram.
    The caller's frame is left untouched.
    """
    # Sort a copy by index for coherent time series plots; sorting in place
    # would reorder the caller's frame as a side effect.
    df = df.sort_index()

    if target == 'composite':
        y_preds = df['casual_preds'] + df['registered_preds']
        y = df['count']
    else:
        y_preds = df[target + '_preds']
        y = df[target]

    residuals = y_preds - y

    fig, ax = plt.subplots(2, 2, figsize=(12, 9))

    # Time series of actual values and predictions
    ax[0][0].plot(df.index, y, color='g', alpha=0.6, label='actual')
    ax[0][0].plot(df.index, y_preds, color='r', alpha=0.6, label='predicted')
    ax[0][0].set_title('{}: actual vs predicted'.format(target))
    ax[0][0].legend()

    # Predictions versus actual values, with the identity line sized to the
    # data instead of a fixed 0..800 range
    ax[0][1].plot(y, y_preds, marker='o', linewidth=0, alpha=0.6)
    upper = max(y.max(), y_preds.max())
    ax[0][1].plot([0, upper], [0, upper], 'r-')
    ax[0][1].set_title('predicted vs actual')
    ax[0][1].set_xlabel('actual')
    ax[0][1].set_ylabel('predicted')

    # Residuals over the index, with a zero reference line
    ax[1][0].plot(df.index, residuals, color='r', alpha=0.6)
    ax[1][0].plot(df.index, np.zeros(len(df)), 'b-')
    ax[1][0].set_title('residuals (predicted - actual)')

    # Histogram of the residuals
    ax[1][1].hist(x=residuals, bins=50)
    ax[1][1].set_title('residual distribution')

    plt.show()

In [None]:
import mpld3
mpld3.enable_notebook()

# Predict the held-out set with each model, in the order casual, registered, count.
casual_preds, registered_preds, count_preds = [
    make_predictions(fitted_model, X_test)
    for fitted_model in (mdl_casual, mdl_registered, mdl_count)
]

results = format_results(casual_preds, registered_preds, count_preds, y_test)

summarize_perfs(results)

### Model properties

In [None]:
# One horizontal bar chart of feature importances per model, sharing the
# feature axis so the three panels can be compared row by row.
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(12,9))

positions = np.arange(len(features))
panels = (('casual', mdl_casual), ('registered', mdl_registered), ('count', mdl_count))
for axis, (panel_name, panel_model) in zip(ax, panels):
    axis.barh(positions, panel_model.feature_importances_)
    # Label each panel so it is clear which model it belongs to
    axis.set_title(panel_name)
    axis.set_xlabel('feature importance')

ax[0].set_yticks(positions)
ax[0].set_yticklabels(features);

### Retrain the model on the entire train dataset

In [None]:
# Refit all three models on the full training frame, not just the casual one.
# Otherwise the registered and count models saved afterwards are still the
# fits on the 70% split. The models are fitted in place, so each final_*
# name refers to the same object as the matching mdl_* name.
final_casual = retrain_model(mdl_casual, train[features], train, 'casual')
final_registered = retrain_model(mdl_registered, train[features], train, 'registered')
final_count = retrain_model(mdl_count, train[features], train, 'count')

### Save the best models

In [None]:
# Persist each model; the context manager closes the file handle even if
# pickling fails.
for saved_name, saved_model in (
    ('casual', mdl_casual),
    ('registered', mdl_registered),
    ('count', mdl_count),
):
    with open('./../../models/trained/RandomForest_{}.sav'.format(saved_name), 'wb') as model_file:
        pickle.dump(saved_model, model_file)