In [None]:
%load_ext autoreload
%autoreload 2

## Common Library Definitions

import sqlalchemy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.ticker as mtick

# Custom Library Definitions
from CustomLibs.CustomFunctions import plot_corr_heatmap, plot_permutation_importance, sqlcol,save_to_file,load_from_file
from config import Config
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler,SplineTransformer,OneHotEncoder

from CustomLibs.CustomTransformers import filtered_transformer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import RandomForestRegressor
from CustomLibs.MultiPipe import MultiPipe
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from pprint import pprint
import xgboost as xgb
from sklearn.metrics import mean_absolute_error,root_mean_squared_error,r2_score
import scipy.stats

## Train/validation cutoff and SQL store
# Everything up to and including the configured cutoff is used for
# training/validation; the test window starts one calendar day later.
date_val_end = Config.TEST_DATE_CUTOFF
date_test_start = pd.to_datetime(date_val_end) + pd.DateOffset(days=1)

# SQLAlchemy engine built from the connection string held in Config.
engine = sqlalchemy.create_engine(Config.CONN_STR)




In [None]:
# Read the preprocessed feature table and the permutation feature rankings
# from the 'Gold' schema over a single connection.
with engine.connect() as conn:
    df_preproc = pd.read_sql_table('Preprocessed_Features', conn, schema='Gold')
    df_feature_ranks = pd.read_sql_table('PermutationFeatureRanks', conn, schema='Gold')

# Index by date and force every column label to a string.
df_preproc = df_preproc.set_index('Date')
df_preproc.columns = list(map(str, df_preproc.columns))

# The label is taken to be the last column of the table.
label_field = df_preproc.columns[-1]

# Sort chronologically once, then slice both windows from the sorted frame.
preproc_by_date = df_preproc.sort_index()
train_val_rows = preproc_by_date.loc[:date_val_end]
test_rows = preproc_by_date.loc[date_test_start:]

X = train_val_rows.drop(columns=label_field)
y = train_val_rows[label_field]

X_test = test_rows.drop(columns=label_field)
y_test = test_rows[label_field]

# Feature names ordered by ascending 'Mean Rank'.
feat_ranks = df_feature_ranks.sort_values('Mean Rank')['Feature'].to_list()





In [None]:
# Credit:
# https://randomrealizations.com/posts/xgboost-parameter-tuning-with-optuna/
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
# https://machinelearningmastery.com/tune-number-size-decision-trees-xgboost-python/
# https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f

# Project helper; its `CV` attribute is passed to GridSearchCV as the
# cross-validation argument in every tuning stage.
pds = MultiPipe()

# Stage 1 pipeline: project feature filter followed by a default XGBoost regressor.
xgb_pipe = make_pipeline(
    filtered_transformer(feat_ranks),
    xgb.XGBRegressor(),
)

# Stage 1 grid: learning rate x number of trees, with the remaining settings
# pinned to single values. max_depth / min_child_weight are searched in stage 2.
search_grid_1 = {
    'xgbregressor__learning_rate': [0.1, 0.2, 0.3],
    'xgbregressor__n_estimators': [100, 500, 1000],
    'xgbregressor__tree_method': ['hist'],
    'xgbregressor__objective': ['reg:squarederror'],
    'xgbregressor__eval_metric': ['rmse'],
    'xgbregressor__seed': [43],
}

print(search_grid_1)

# Multi-metric scoring; the best candidate is selected (and refit) on 'RMS Error'.
scoring = {
    'Mean Absolute Error': 'neg_mean_absolute_error',
    'RMS Error': 'neg_root_mean_squared_error',
    'R^2 Score': 'r2',
}

if not Config.REGEN_GRID_SEARCH_XGB:
    # Reuse the result stored under this name by an earlier run.
    xgb_gs1_parameters = load_from_file('xgb_gs1_parameters')
else:
    xgb_gridsearch = GridSearchCV(
        xgb_pipe,
        search_grid_1,
        scoring=scoring,
        refit='RMS Error',
        cv=pds.CV,
        n_jobs=-1,
        verbose=10,
    )
    xgb_gridsearch.fit(X, y)
    # Drop the pipeline step prefix so the dict can be applied directly to a
    # bare XGBRegressor via set_params.
    xgb_gs1_parameters = {
        name.replace('xgbregressor__', ''): value
        for name, value in xgb_gridsearch.best_params_.items()
    }
    save_to_file(xgb_gs1_parameters, 'xgb_gs1_parameters')

# Show the selected stage-1 parameters, one per line.
for param_name, param_value in xgb_gs1_parameters.items():
    print(f'{param_name}: {param_value}')

In [None]:
# Stage 2 base estimator: stage-1 best parameters only.
# The stage-5 parameters must NOT be applied here: they are produced further
# down the notebook, so referencing them raises NameError on a fresh
# "Restart & Run All" and, with a stale kernel, leaks a later stage's result
# back into this search.
xgb_pipe = make_pipeline(
    filtered_transformer(feat_ranks),
    xgb.XGBRegressor(seed=43).set_params(**xgb_gs1_parameters),
)

# Stage 2 grid: tree depth, minimum child weight and gamma (0.0 .. 0.4).
search_grid_2 = {
    'xgbregressor__max_depth':  [3,5,7,9],
    'xgbregressor__min_child_weight': [1,3,5,7,9],
    'xgbregressor__gamma':[i/10.0 for i in range(0,5)]
    }

for k,v in search_grid_2.items():
    print(f'{k}: {v}')

# Multi-metric scoring; the best candidate is selected (and refit) on 'RMS Error'.
scoring = {'Mean Absolute Error': 'neg_mean_absolute_error', 'RMS Error': 'neg_root_mean_squared_error','R^2 Score':'r2'}

if Config.REGEN_GRID_SEARCH_XGB:
    xgb_gridsearch = GridSearchCV(xgb_pipe, search_grid_2, scoring=scoring, refit='RMS Error',cv=pds.CV,n_jobs=-1,verbose=10)
    xgb_gridsearch.fit(X, y)
    # Strip the pipeline step prefix so the dict can be applied directly to a
    # bare XGBRegressor via set_params.
    xgb_gs2_parameters={}
    for p,v in xgb_gridsearch.best_params_.items():
        xgb_gs2_parameters[p.replace('xgbregressor__','')]=v
    save_to_file(xgb_gs2_parameters,'xgb_gs2_parameters')

else:
    # Reuse the result stored under this name by an earlier run.
    xgb_gs2_parameters = load_from_file('xgb_gs2_parameters')

# Show the selected stage-2 parameters, one per line.
for k,v in xgb_gs2_parameters.items():
    print(f'{k}: {v}')


In [None]:
# Stage 3 base estimator: seeded regressor with the stage-1 and stage-2 best
# parameters applied in that order.
stage3_estimator = xgb.XGBRegressor(seed=43)
for earlier_best in (xgb_gs1_parameters, xgb_gs2_parameters):
    stage3_estimator.set_params(**earlier_best)
xgb_pipe = make_pipeline(filtered_transformer(feat_ranks), stage3_estimator)

# Stage 3 grid: row subsampling and per-node column subsampling.
search_grid_3 = {
    'xgbregressor__subsample': [0.9, 1],
    'xgbregressor__colsample_bynode': [0.8, 0.9, 1],
}

for grid_key, grid_values in search_grid_3.items():
    print(f'{grid_key}: {grid_values}')

# Multi-metric scoring; the best candidate is selected (and refit) on 'RMS Error'.
scoring = {
    'Mean Absolute Error': 'neg_mean_absolute_error',
    'RMS Error': 'neg_root_mean_squared_error',
    'R^2 Score': 'r2',
}

if not Config.REGEN_GRID_SEARCH_XGB:
    # Reuse the result stored under this name by an earlier run.
    xgb_gs3_parameters = load_from_file('xgb_gs3_parameters')
else:
    xgb_gridsearch = GridSearchCV(
        xgb_pipe,
        search_grid_3,
        scoring=scoring,
        refit='RMS Error',
        cv=pds.CV,
        n_jobs=-1,
        verbose=10,
    )
    xgb_gridsearch.fit(X, y)
    # Drop the pipeline step prefix from each winning parameter name.
    xgb_gs3_parameters = {
        name.replace('xgbregressor__', ''): value
        for name, value in xgb_gridsearch.best_params_.items()
    }
    save_to_file(xgb_gs3_parameters, 'xgb_gs3_parameters')

# Show the selected stage-3 parameters, one per line.
for param_name, param_value in xgb_gs3_parameters.items():
    print(f'{param_name}: {param_value}')


In [None]:
# Stage 4 base estimator: seeded regressor with the best parameters of
# stages 1-3 applied in order.
stage4_estimator = xgb.XGBRegressor(seed=43)
for earlier_best in (xgb_gs1_parameters, xgb_gs2_parameters, xgb_gs3_parameters):
    stage4_estimator.set_params(**earlier_best)
xgb_pipe = make_pipeline(filtered_transformer(feat_ranks), stage4_estimator)

# Stage 4 grid: L1 (reg_alpha) and L2 (reg_lambda) regularisation strengths.
search_grid_4 = {
    'xgbregressor__reg_alpha': [0, 0.05, 0.2, 0.5, 1, 2],
    'xgbregressor__reg_lambda': [0, 0.05, 0.2, 0.5, 1, 2],
}

for grid_key, grid_values in search_grid_4.items():
    print(f'{grid_key}: {grid_values}')

# Multi-metric scoring; the best candidate is selected (and refit) on 'RMS Error'.
scoring = {
    'Mean Absolute Error': 'neg_mean_absolute_error',
    'RMS Error': 'neg_root_mean_squared_error',
    'R^2 Score': 'r2',
}

if not Config.REGEN_GRID_SEARCH_XGB:
    # Reuse the result stored under this name by an earlier run.
    xgb_gs4_parameters = load_from_file('xgb_gs4_parameters')
else:
    xgb_gridsearch = GridSearchCV(
        xgb_pipe,
        search_grid_4,
        scoring=scoring,
        refit='RMS Error',
        cv=pds.CV,
        n_jobs=-1,
        verbose=10,
    )
    xgb_gridsearch.fit(X, y)
    # Drop the pipeline step prefix from each winning parameter name.
    xgb_gs4_parameters = {
        name.replace('xgbregressor__', ''): value
        for name, value in xgb_gridsearch.best_params_.items()
    }
    save_to_file(xgb_gs4_parameters, 'xgb_gs4_parameters')

# Show the selected stage-4 parameters, one per line.
for param_name, param_value in xgb_gs4_parameters.items():
    print(f'{param_name}: {param_value}')

In [None]:
# Stage 5 base estimator: regressor with the best parameters of stages 1-4
# applied in order.
# NOTE(review): unlike stages 2-4 the estimator is created without seed=43;
# the stage-1 grid pins 'seed', so stage-1's best parameters are expected to
# supply it - confirm against the stored stage-1 result.
stage5_estimator = xgb.XGBRegressor()
for earlier_best in (
    xgb_gs1_parameters,
    xgb_gs2_parameters,
    xgb_gs3_parameters,
    xgb_gs4_parameters,
):
    stage5_estimator.set_params(**earlier_best)
xgb_pipe = make_pipeline(filtered_transformer(feat_ranks), stage5_estimator)

# Stage 5 grid: revisit learning rate and tree count over a wider range now
# that the other parameters are fixed.
search_grid_5 = {
    'xgbregressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'xgbregressor__n_estimators': [100, 500, 1000, 2000],
}

for grid_key, grid_values in search_grid_5.items():
    print(f'{grid_key}: {grid_values}')

# Multi-metric scoring; the best candidate is selected (and refit) on 'RMS Error'.
scoring = {
    'Mean Absolute Error': 'neg_mean_absolute_error',
    'RMS Error': 'neg_root_mean_squared_error',
    'R^2 Score': 'r2',
}

if not Config.REGEN_GRID_SEARCH_XGB:
    # Reuse the result stored under this name by an earlier run.
    xgb_gs5_parameters = load_from_file('xgb_gs5_parameters')
else:
    xgb_gridsearch = GridSearchCV(
        xgb_pipe,
        search_grid_5,
        scoring=scoring,
        refit='RMS Error',
        cv=pds.CV,
        n_jobs=-1,
        verbose=10,
    )
    xgb_gridsearch.fit(X, y)
    # Drop the pipeline step prefix from each winning parameter name.
    xgb_gs5_parameters = {
        name.replace('xgbregressor__', ''): value
        for name, value in xgb_gridsearch.best_params_.items()
    }
    save_to_file(xgb_gs5_parameters, 'xgb_gs5_parameters')

# Show the selected stage-5 parameters, one per line.
for param_name, param_value in xgb_gs5_parameters.items():
    print(f'{param_name}: {param_value}')

In [None]:
# Best parameters of every tuning stage, in stage order, printed for reference.
params_list = [
    xgb_gs1_parameters,
    xgb_gs2_parameters,
    xgb_gs3_parameters,
    xgb_gs4_parameters,
    xgb_gs5_parameters,
]
print(params_list)

In [None]:



# Candidate models: an untuned baseline plus one regressor per tuning stage.
# The stage-N model carries the best parameters of stages 1..N; a parameter
# tuned in more than one stage (learning_rate / n_estimators in stages 1 and 5)
# takes the value from the latest stage.
# Building the models cumulatively in a loop fixes the previous bug where the
# 'XGBoost GS5' entry re-applied the stage-4 parameters and never received
# the stage-5 ones, making it a duplicate of 'XGBoost GS4'.
pds.Regressors={}
pds.Regressors['XGBoost Baseline'] = xgb.XGBRegressor()

stage_best_params = [
    xgb_gs1_parameters,
    xgb_gs2_parameters,
    xgb_gs3_parameters,
    xgb_gs4_parameters,
    xgb_gs5_parameters,
]
cumulative_params = {}
for stage_no, stage_best in enumerate(stage_best_params, start=1):
    cumulative_params.update(stage_best)
    pds.Regressors[f'XGBoost GS{stage_no}'] = xgb.XGBRegressor().set_params(**cumulative_params)

# Feature names ordered by ascending 'Mean Rank'.
feat_ranks = df_feature_ranks.sort_values('Mean Rank')['Feature'].to_list()

# Score every registered regressor on the top-t ranked features,
# for t = 70, 60, ..., 10.
for t in range(70,0,-10):
    k = f'Top {t}'
    preproc_key = f'pp{t}'
    tra = filtered_transformer(feat_ranks[:t])
    # Register the filter under its own key and attach it to the shared
    # 'XGB Tuning Tests' set (MultiPipe semantics are defined in CustomLibs).
    pds.AddPreProc(tra, preproc_key)
    pds.AddQCSet(preproc_key, 'XGB Tuning Tests')
    _ = pds.CalculateScores('XGB Tuning Tests', preproc_key, k, X[feat_ranks[:t]], y, verbose=False)





In [None]:

# Collect the scores for the tuning test set; the return value is not used
# further in this cell.
temp = pds.GetScores(
    qc_set_keys=['XGB Tuning Tests'],
    metric_keys=['R^2 Score', 'RMS Error', 'Mean Absolute Error'],
    verbose=False,
)

# One panel per active metric, laid out side by side.
n_panels = len(pds.active_metrics)
fig, axs = plt.subplots(1, n_panels, figsize=(0.5 + 5 * n_panels, 4))
pds.GraphScores(qc_set_key='XGB Tuning Tests', axs=axs)
fig.tight_layout()

# Fixed y-ranges for the first three panels so the curves are comparable.
panel_ylims = [(0.015, 0.035), (0.02, 0.045), (0.85, 1.0)]
for panel_idx, (y_low, y_high) in enumerate(panel_ylims):
    axs[panel_idx].set_ylim(y_low, y_high)

for ax in axs:
    ax.set_xlabel('Selected Features')

fig.suptitle(
    'Mean Feature Ranking Metrics, XGBoost Hyperparameter Tuning',
    fontsize=12,
    fontweight='bold',
)
fig.tight_layout()
fig.savefig(
    './Output Files/Images/Model Tuning/XGB_tuning_metrics.png',
    format='png',
    bbox_inches='tight',
)