In [None]:
%load_ext autoreload
%autoreload 2

## Common Library Definitions

# Standard library
from pathlib import Path
from pprint import pprint

# Third-party
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import sqlalchemy
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import RobustScaler,SplineTransformer,OneHotEncoder
from sklearn.svm import LinearSVR, SVR

# Custom Library Definitions
from CustomLibs.CustomFunctions import plot_corr_heatmap, plot_permutation_importance, sqlcol,save_to_file,load_from_file
from CustomLibs.CustomTransformers import filtered_transformer
from CustomLibs.MultiPipe import MultiPipe
from config import Config

## SQL Store Definition
# SQLAlchemy engine built from the project connection string; the data-loading
# cell opens its connection from this engine.
engine = sqlalchemy.create_engine(Config.CONN_STR)

# Last date (inclusive upper bound of the .loc slice) of the window used for
# fitting and cross-validation, taken from project config.
date_val_end=Config.TEST_DATE_CUTOFF
# The held-out test window starts one calendar day after that cutoff.
date_test_start=pd.to_datetime(date_val_end) + pd.DateOffset(days=1)




In [None]:
# Load the preprocessed feature table and the permutation-importance feature
# ranking from the 'Gold' schema.
with engine.connect() as conn:
    df_preproc = pd.read_sql_table('Preprocessed_Features', conn,schema='Gold')
    df_feature_ranks = pd.read_sql_table('PermutationFeatureRanks', conn,schema='Gold')

# Index by date. Rebinding (rather than inplace=True) keeps the step chainable.
df_preproc = df_preproc.set_index('Date')
# Force string column names so later column selections by name are uniform.
df_preproc.columns = [str(x) for x in df_preproc.columns]
# The label is taken to be the last column of the table.
label_field = df_preproc.columns[-1]

# Sort chronologically ONCE and cut each window ONCE, so that features and
# label of a window are guaranteed to come from the same rows (previously the
# frame was sorted four separate times).
df_preproc_sorted = df_preproc.sort_index()
df_trainval = df_preproc_sorted.loc[:date_val_end]
df_holdout = df_preproc_sorted.loc[date_test_start:]

# Training / cross-validation window: everything up to and including the cutoff.
X = df_trainval.drop(columns=label_field)
y = df_trainval[label_field]

# Held-out test window: everything from the day after the cutoff onwards.
X_test = df_holdout.drop(columns=label_field)
y_test = df_holdout[label_field]

# Project helper that holds regressors, preprocessors, the CV splitter and scores.
pds=MultiPipe()


In [None]:
# Credit: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
#
# Broad candidate ranges for the randomized hyperparameter search.
# RandomizedSearchCV is imported with the other sklearn imports in the first
# cell instead of in the middle of the notebook.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 5)]
# Number of features to consider at every split (floats = fraction of the features)
max_features = [0.6,0.8,1.0]
# Maximum number of levels in tree; None lets the trees grow without a depth cap
max_depth = [int(x) for x in np.linspace(10, 90, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree.
# Defined for reference only: it is deliberately NOT part of random_grid below.
bootstrap = [True, False]

# Create the random grid. Keys carry the 'randomforestregressor__' prefix
# because the estimator is tuned as a step of a make_pipeline(...) pipeline,
# which names steps after the lower-cased class name.
random_grid = {'randomforestregressor__n_estimators': n_estimators,
               'randomforestregressor__max_features': max_features,
               'randomforestregressor__max_depth': max_depth,
               'randomforestregressor__min_samples_split': min_samples_split,
               'randomforestregressor__min_samples_leaf': min_samples_leaf,
            #    'randomforestregressor__bootstrap': bootstrap
               }

print('\nParameters for random search:')
for k,v in random_grid.items():
    print(f'{k}: {v}')


# Display name -> scikit-learn scorer string. The error metrics use the
# negated ('neg_') scorers, so a larger score is better for every entry.
scoring = {'Mean Absolute Error': 'neg_mean_absolute_error', 'RMS Error': 'neg_root_mean_squared_error','R^2 Score':'r2'}


In [None]:

if Config.REGEN_RANDOM_SEARCH_RF:

    # Base pipeline to tune: column filter over all feature columns followed by
    # a random forest. The forest is seeded (same seed as the final models
    # registered later in the notebook) so that repeated searches score each
    # candidate identically; unseeded forests make the "best" parameters
    # depend on run-to-run noise.
    rf_reg = make_pipeline(filtered_transformer(X.columns.tolist()),RandomForestRegressor(random_state=43))
    # Randomized search: 300 sampled parameter combinations, cross-validated
    # with the splitter supplied by pds.CV, refit on the candidate with the
    # best 'RMS Error' score, using all available cores.
    rf_random = RandomizedSearchCV(estimator = rf_reg, param_distributions = random_grid, n_iter = 300, cv = pds.CV, verbose=2, scoring=scoring, refit='RMS Error', random_state=43, n_jobs = -1)
    # Fit the random search model
    rf_random.fit(X, y)

    print(f'Best parameters for RF,  (CV score={rf_random.best_score_:.3f}):',end='')
    print(rf_random.best_params_)

    # Strip the pipeline step prefix so the result can be passed directly to
    # RandomForestRegressor.set_params(**...), then cache it on disk.
    rf_random_search_parameters = {
        p.replace('randomforestregressor__',''): v
        for p,v in rf_random.best_params_.items()
    }
    save_to_file(rf_random_search_parameters,'rf_random_search_parameters')

else:
    # Reuse the parameters cached by a previous search run.
    rf_random_search_parameters = load_from_file('rf_random_search_parameters')
for k,v in rf_random_search_parameters.items():
    print(f'{k}: {v}')

In [None]:
if Config.REGEN_RANDOM_SEARCH_RF_TXX:
    # The 20 features with the lowest 'Mean Rank' in the feature-rank table.
    feat_ranks = df_feature_ranks.sort_values('Mean Rank')['Feature'].to_list()[:20]
    # Select those columns once and reuse the selection for both the column
    # filter and the fit call.
    X_t20 = X[feat_ranks]
    # Base pipeline to tune. The forest is seeded (same seed as the final
    # models registered later in the notebook) so repeated searches score each
    # candidate identically.
    rf_reg_t20 = make_pipeline(filtered_transformer(X_t20.columns.tolist()),RandomForestRegressor(random_state=43))
    # Randomized search: 300 sampled parameter combinations, cross-validated
    # with the splitter supplied by pds.CV, refit on the candidate with the
    # best 'RMS Error' score, using all available cores.
    rf_random_t20 = RandomizedSearchCV(estimator = rf_reg_t20, param_distributions = random_grid, n_iter = 300, cv = pds.CV, verbose=2, scoring=scoring, refit='RMS Error', random_state=43, n_jobs = -1)
    # Fit the random search model
    rf_random_t20.fit(X_t20, y)

    print(f'Best parameters for RF,  (CV score={rf_random_t20.best_score_:.3f}):',end='')
    print(rf_random_t20.best_params_)

    # Strip the pipeline step prefix so the result can be passed directly to
    # RandomForestRegressor.set_params(**...), then cache it on disk.
    rf_random_search_parameters_t20 = {
        p.replace('randomforestregressor__',''): v
        for p,v in rf_random_t20.best_params_.items()
    }
    save_to_file(rf_random_search_parameters_t20,'rf_random_search_parameters_t20')

else:
    # Reuse the parameters cached by a previous search run.
    rf_random_search_parameters_t20 = load_from_file('rf_random_search_parameters_t20')
for k,v in rf_random_search_parameters_t20.items():
    print(f'{k}: {v}')

In [None]:


# Narrowed candidate values for the exhaustive (grid) search. Every
# combination of the lists below is evaluated, so they are kept short.
n_estimators = [125, 275, 325, 375]      # number of trees in the forest
max_features = [0.5, 0.6, 0.7]           # features considered at every split
max_depth = [40, 50, 60, None]           # maximum number of levels in a tree
min_samples_split = [4, 5, 6, 7]         # minimum samples required to split a node
min_samples_leaf = [1, 2, 3]             # minimum samples required at each leaf
bootstrap = [True, False]                # method of selecting samples per tree

# Assemble the grid; every key is addressed to the 'randomforestregressor'
# step of the pipeline being tuned.
_rf_step_prefix = 'randomforestregressor__'
search_grid = dict(zip(
    (_rf_step_prefix + param for param in (
        'n_estimators',
        'max_features',
        'max_depth',
        'min_samples_split',
        'min_samples_leaf',
        'bootstrap',
    )),
    (
        n_estimators,
        max_features,
        max_depth,
        min_samples_split,
        min_samples_leaf,
        bootstrap,
    ),
))

print('\nParameters for exhaustive search:')
for param_name, candidates in search_grid.items():
    print(f'{param_name}: {candidates}')


In [None]:

if Config.REGEN_GRID_SEARCH_RF:
    # Base pipeline to tune: column filter over all feature columns followed by
    # a random forest. The forest is seeded (same seed as the final models
    # registered later in the notebook) so repeated searches score each
    # candidate identically.
    rf_reg = make_pipeline(filtered_transformer(X.columns.tolist()),RandomForestRegressor(random_state=43))

    # Exhaustive search over every combination in search_grid, cross-validated
    # with the splitter supplied by pds.CV and refit on the candidate with the
    # best 'RMS Error' score.
    rf_gsearch = GridSearchCV(rf_reg, search_grid, scoring=scoring, refit='RMS Error',cv=pds.CV,n_jobs=-1,verbose=10)
    rf_gsearch.fit(X, y)
    print(f'Best parameters for RF,  (CV score={rf_gsearch.best_score_:.3f}):',end='')
    print(rf_gsearch.best_params_)

    # Strip the pipeline step prefix so the result can be passed directly to
    # RandomForestRegressor.set_params(**...), then cache it on disk.
    rf_grid_search_parameters = {
        p.replace('randomforestregressor__',''): v
        for p,v in rf_gsearch.best_params_.items()
    }
    save_to_file(rf_grid_search_parameters,'rf_grid_search_parameters')

else:
    # Reuse the parameters cached by a previous search run.
    rf_grid_search_parameters = load_from_file('rf_grid_search_parameters')

for k,v in rf_grid_search_parameters.items():
    print(f'{k}: {v}')

# rf_grid_search_parameters['n_estimators']=150



In [None]:
if Config.REGEN_GRID_SEARCH_RF_TXX:
    # The 20 features with the lowest 'Mean Rank' in the feature-rank table.
    feat_ranks = df_feature_ranks.sort_values('Mean Rank')['Feature'].to_list()[:20]
    # Select those columns once and reuse the selection for both the column
    # filter and the fit call.
    X_t20 = X[feat_ranks]
    # Base pipeline to tune. The forest is seeded (same seed as the final
    # models registered later in the notebook) so repeated searches score each
    # candidate identically.
    rf_reg_t20 = make_pipeline(filtered_transformer(X_t20.columns.tolist()),RandomForestRegressor(random_state=43))
    # Exhaustive (grid) search over every combination in search_grid,
    # cross-validated with the splitter supplied by pds.CV and refit on the
    # candidate with the best 'RMS Error' score, using all available cores.
    rf_grid_t20 = GridSearchCV(rf_reg_t20, search_grid, scoring=scoring, refit='RMS Error',cv=pds.CV,n_jobs=-1,verbose=10)
    # Fit the grid search model
    rf_grid_t20.fit(X_t20, y)

    print(f'Best parameters for RF,  (CV score={rf_grid_t20.best_score_:.3f}):',end='')
    print(rf_grid_t20.best_params_)

    # Strip the pipeline step prefix so the result can be passed directly to
    # RandomForestRegressor.set_params(**...), then cache it on disk.
    rf_grid_search_parameters_t20 = {
        p.replace('randomforestregressor__',''): v
        for p,v in rf_grid_t20.best_params_.items()
    }
    save_to_file(rf_grid_search_parameters_t20,'rf_grid_search_parameters_t20')

else:
    # Reuse the parameters cached by a previous search run.
    rf_grid_search_parameters_t20 = load_from_file('rf_grid_search_parameters_t20')
for k,v in rf_grid_search_parameters_t20.items():
    print(f'{k}: {v}')

In [None]:


# Register the forest variants to compare. All share random_state=43; the
# tuned variants differ only in the parameter dict applied via set_params.
pds.Regressors = {}
# pds.Regressors['Linear Regression']=LinearRegression()
pds.Regressors['Random Forest Baseline'] = RandomForestRegressor(random_state=43)
for reg_label, tuned_params in (
    ('Random Forest RandomSearch', rf_random_search_parameters),
    ('Random Forest GridSearch', rf_grid_search_parameters),
    # ('Random Forest RandomSearch T20', rf_random_search_parameters_t20),
    ('Random Forest GridSearch T20', rf_grid_search_parameters_t20),
):
    pds.Regressors[reg_label] = RandomForestRegressor(random_state=43).set_params(**tuned_params)
# pds.Regressors['Random Forest GridSearch T20 ++']=RandomForestRegressor(random_state=43).set_params(**rf_grid_search_parameters_t20).set_params(**{'n_estimators':150})

# display(pds.Regressors['Random Forest GridSearch T20'])

# All features, ordered by ascending 'Mean Rank'.
feat_ranks = df_feature_ranks.sort_values('Mean Rank')['Feature'].to_list()

# Score every registered regressor on the top-70, top-60, ..., top-10 features.
# Each subset gets its own preprocessor key ('pp<N>') inside one shared QC set.
qc_set_name = 'RF Tuning Tests'
for top_n in range(70, 0, -10):
# for top_n in range(5,75,10):
    pp_key = f'pp{top_n}'
    top_features = feat_ranks[:top_n]
    pds.AddPreProc(filtered_transformer(top_features), pp_key)
    pds.AddQCSet(pp_key, qc_set_name)
    _ = pds.CalculateScores(qc_set_name, pp_key, f'Top {top_n}', X[top_features], y, verbose=False)






In [None]:
# Fetch the scores of the tuning runs. The returned value is not used below;
# the call is kept as in the original flow.
# NOTE(review): presumably GetScores also populates pds.active_metrics, which
# sizes the figure on the next line - confirm against MultiPipe.
temp = pds.GetScores(qc_set_keys=['RF Tuning Tests'],metric_keys=['R^2 Score','RMS Error','Mean Absolute Error'],verbose=False)
# One panel per active metric, laid out in a single row.
fig,axs=plt.subplots(1,len(pds.active_metrics),figsize=(0.5+5*len(pds.active_metrics),4))
pds.GraphScores(qc_set_key='RF Tuning Tests',axs=axs)

# Hand-tuned y-ranges per panel.
# NOTE(review): these assume three panels in a fixed metric order; verify the
# order GraphScores uses before changing the metric list.
axs[0].set_ylim(0.015,0.03)
axs[1].set_ylim(0.02,0.04)
axs[2].set_ylim(0.9,1.0)

# Lines whose label is in this set are drawn dashed, both in the plot and in
# the legend. Defined once so the two loops below cannot drift apart.
dashed_labels = {'_child8','_child6','Random Forest RandomSearch T20','Random Forest GridSearch T20'}

for ax in axs:
    for line in ax.get_lines():
        if line.get_label() in dashed_labels:
            line.set_linestyle('--')
    # get_legend() returns None when the panel has no legend; skip it then
    # instead of failing with AttributeError.
    leg=ax.get_legend()
    if leg is not None:
        for line in leg.get_lines():
            if line.get_label() in dashed_labels:
                line.set_linestyle('--')
    ax.set_xlabel('Selected Features')

fig.suptitle('Mean Feature Ranking Metrics, Random Forest Hyperparameter Tuning',fontsize=12,fontweight='bold')
fig.tight_layout()

# Create the output folder on demand so a fresh checkout does not fail on save.
output_path = Path('./Output Files/Images/Model Tuning/RF_tuning_metrics.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path,format='png',bbox_inches='tight')

In [None]:
# Summary of both tuning stages: the candidate values offered to each search
# followed by the parameters that search selected.
print('Broad Options Using Random Search:')
for k,v in random_grid.items():
    print(f'{k}: {v}')

print('\nSelected by Random Search:')
# Read the saved/loaded parameter dict rather than rf_random.best_params_:
# rf_random only exists when the random search was regenerated in this
# session, so the cached path used to fail with NameError here.
for k,v in rf_random_search_parameters.items():
    print(f'{k}: {v}')


print('\nNarrow Options Using Exhaustive (Grid) Search:')
for k,v in search_grid.items():
    print(f'{k}: {v}')

# These are the grid-search results (the header previously said "Random Search").
print('\nSelected by Grid Search:')
for k,v in rf_grid_search_parameters.items():
    print(f'{k}: {v}')