## Libraries import and Definitions

In [None]:
%load_ext autoreload
%autoreload 2

## Common Library Definitions

import sqlalchemy
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
import numpy as np
import matplotlib.ticker as mtick

# Custom Library Definitions
from CustomLibs.CustomFunctions import plot_corr_heatmap, plot_permutation_importance, sqlcol
from CustomLibs.CustomTransformers import filtered_transformer
from config import Config
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.inspection import permutation_importance
from sklearn.model_selection import TimeSeriesSplit
from CustomLibs.MultiPipe import MultiPipe
import scipy.stats

from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr,pearsonr


## SQL Store Definition
# SQLAlchemy engine shared by the cells below; the connection string is read from Config.CONN_STR.
engine = sqlalchemy.create_engine(Config.CONN_STR)

# Upper bound of the label-based date slice used when X and y are built from the feature table.
date_val_end=Config.TEST_DATE_CUTOFF
# One calendar day after the cutoff. NOTE(review): not referenced by the cells visible here.
date_test_start=pd.to_datetime(date_val_end) + pd.DateOffset(days=1)

# lower = 0.0
# upper = 2
# mu = 0.95
# sigma = 0.2

# rangen=scipy.stats.truncnorm((lower-mu)/sigma,(upper-mu)/sigma,loc=mu,scale=sigma)


## Read Data from SQL

In [None]:
## Load the preprocessed feature table from the 'Gold' schema and split it into
## X (independent variables) and y (label series) for all rows dated up to date_val_end.
## The original notes describe the label as the attendance percentage.
with engine.connect() as conn:
    df_preproc = pd.read_sql_table('Preprocessed_Features', conn, schema='Gold')

# Force plain string column labels so later name-based lookups behave uniformly.
df_preproc.columns = [str(x) for x in df_preproc.columns]

df_preproc.set_index('Date', inplace=True)

# The label is taken to be the last column of the table.
# NOTE(review): relies on the column order of 'Preprocessed_Features' - confirm.
label_field = df_preproc.columns[-1]

# Sort by date once and slice once; X and y are both derived from the same frame
# instead of repeating the sort for each of them.
df_until_cutoff = df_preproc.sort_index().loc[:date_val_end]
X = df_until_cutoff.drop(columns=label_field)
y = df_until_cutoff[label_field]


In [None]:


# Cluster the columns of X by Spearman rank correlation and save the dendrogram.
fig, ax = plt.subplots(figsize=(10, 6))

# Pairwise rank correlation between features, forced to be exactly symmetric with a
# unit diagonal so the derived distance matrix is accepted by squareform().
raw_corr = spearmanr(X).correlation
corr = (raw_corr + raw_corr.T) / 2
np.fill_diagonal(corr, 1)

# Strongly (anti-)correlated features are "close": distance = 1 - |correlation|.
distance_matrix = 1 - np.absolute(corr)
condensed_distances = squareform(distance_matrix)

# Hierarchical clustering with Ward's linkage on the condensed distances.
dist_linkage = hierarchy.ward(condensed_distances)

dendrogram_options = dict(
    labels=X.columns.to_list(),
    ax=ax,
    leaf_rotation=90,
    color_threshold=0.4,
    leaf_font_size=8,
)
dendro = hierarchy.dendrogram(dist_linkage, **dendrogram_options)

fig.suptitle(
    'Hierarchical Agglomerative Clustering using Spearman Correlation (Ward Linkage)',
    fontsize=12,
    fontweight='bold',
)
fig.tight_layout()
fig.savefig(
    './Output Files/Images/Feature Selection/dendrogram.png',
    format='png',
    bbox_inches='tight',
)


In [None]:
from collections import defaultdict

# Absolute Spearman correlation of every feature (by column position) with the label y.
# It does not depend on the cut distance, so it is computed once here instead of being
# recomputed for every feature at every cut.
label_corrs = [
    abs(X.iloc[:, idx].corr(y, method='spearman')) for idx in range(X.shape[1])
]

# Each entry is (cut distance, number of selected features, selected feature names).
# The first entry is the uncut, full feature set.
slices = [(0, X.shape[1], X.columns.tolist())]
for cut_dist in [x / 10.0 for x in range(1, 12, 1)]:
    # Array of cluster IDs, one per dendrogram leaf, for this cut distance.
    cluster_ids = hierarchy.fcluster(dist_linkage, cut_dist, criterion="distance")
    # cluster ID -> list of (feature index, |correlation with label|)
    cluster_id_to_feature_ids = defaultdict(list)
    # idx is the leaf index, i.e. the column position of the feature in X.
    for idx, cluster_id in enumerate(cluster_ids):
        cluster_id_to_feature_ids[cluster_id].append((idx, label_corrs[idx]))
    # Within each cluster, put the feature with the largest label correlation first.
    for members in cluster_id_to_feature_ids.values():
        members.sort(key=lambda x: x[1], reverse=True)
    # Keep one representative per cluster: the first feature index after sorting.
    selected_features = [members[0][0] for members in cluster_id_to_feature_ids.values()]
    # Convert feature indexes to names via the original dataframe's columns.
    selected_features_names = X.columns[selected_features].tolist()
    # Record a new slice only if this cut actually reduced the number of selected features.
    if slices[-1][1] > len(selected_features_names):
        slices.append((cut_dist, len(selected_features_names), selected_features_names))


pds = MultiPipe()


if Config.REGEN_RANKINGS:
    # Rank features by the cut at which they survive: features in the smallest slice get the
    # best (lowest) rank, and features that only appear in larger slices share the mean of
    # the rank positions they occupy.
    feature_ranks = {'Spearman': {}}
    prev_names = []
    # Walk the slices from fewest to most features WITHOUT mutating `slices`, so that
    # re-running this cell cannot flip the order and corrupt the ranks.
    for _, _, names in sorted(slices, key=lambda entry: entry[1]):
        already_ranked = set(prev_names)
        current_feats = [x for x in names if x not in already_ranked]
        rank = np.mean(list(range(len(prev_names) + 1, len(names) + 1)))
        print(f'{rank}:{current_feats}')
        for feat in current_feats:
            feature_ranks['Spearman'][feat] = rank
        prev_names = names

# Collect (cut distance, feature count) pairs for the plot. The loop variable is not
# called `slice`, which would shadow the builtin for the rest of the notebook.
distances = []
count_att = []
for cut_slice in slices:
    print(cut_slice)
    distances.append(cut_slice[0])
    count_att.append(cut_slice[1])

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(count_att, distances)
ax.set_title('Attributes Per Slice')
ax.set_xlabel('Remaining Attributes')
ax.set_ylabel('Cut Distance')
# 'major' is the documented spelling of this option.
ax.grid(visible=True, which='major', axis='both')
fig.tight_layout()

fig.savefig('./Output Files/Images/Feature Selection/dendrogram_slices.png',format='png',bbox_inches='tight')

In [None]:
if Config.REGEN_RANKINGS:
    # Rank the original columns of X by the summed feature_importances_ of the fitted
    # tree-based regressors, storing the ranks under 'RF_FI' and 'XGB_FI'.
    tra = filtered_transformer(X.columns)

    # Longest names first: a transformed column is attributed to the most specific original
    # column whose name it starts with. Plain startswith() against every column would count
    # e.g. 'Lag_10' outputs towards both 'Lag_1' and 'Lag_10'.
    # NOTE(review): assumes transformed names begin with the originating column name - confirm
    # against filtered_transformer.
    cols_longest_first = sorted(X.columns, key=len, reverse=True)

    for key, reg in [('RF_FI', 'Random Forest Regressor'), ('XGB_FI', 'XGBoost Regressor')]:

        pipe = make_pipeline(tra, pds.Regressors[reg])
        pipe.fit(X, y)

        # Fetched once per model rather than once per original column.
        out_names = pipe[0].get_feature_names_out()
        model_importances = pipe[1].feature_importances_

        # Original column -> summed importance of the transformed columns it produced.
        # Columns with no matching output keep a total of 0.
        totals = dict.fromkeys(X.columns, 0)
        for i, new_col in enumerate(out_names):
            owner = next((col for col in cols_longest_first if new_col.startswith(col)), None)
            if owner is not None:
                totals[owner] += model_importances[i]

        # Highest importance first; the stable sort keeps X's column order among ties.
        importances = sorted(totals.items(), key=lambda x: x[1], reverse=True)
        feature_ranks[key] = {
            feat_name: rnk for rnk, (feat_name, _) in enumerate(importances, start=1)
        }

        # df_feature_ranks[lab]=[ranks[x] for x in df_feature_ranks['Feature']]

In [None]:
if Config.REGEN_RANKINGS:
    # Backward elimination per regressor: repeatedly drop the feature with the lowest
    # cross-validated permutation importance. The rank assigned to a dropped feature is
    # the number of features still present when it was dropped, so the last survivor
    # gets rank 1.
    for key, reg in pds.Regressors.items():
        feature_ranks[key] = {}
        selected_features_names = X.columns.tolist()

        print(key, end=': ')
        while selected_features_names:
            fold_importances = []
            for train_index, test_index in pds.CV.split(X):
                X_train, y_train = X.iloc[train_index], y.iloc[train_index]
                X_test, y_test = X.iloc[test_index], y.iloc[test_index]
                # A fresh transformer and pipeline restricted to the surviving features.
                tra = filtered_transformer(selected_features_names)
                pipe = make_pipeline(tra, reg)
                pipe.fit(X_train[selected_features_names], y_train)
                fold_result = permutation_importance(
                    pipe,
                    X_test[selected_features_names],
                    y_test,
                    scoring='neg_root_mean_squared_error',
                )
                fold_importances.append(fold_result['importances_mean'])
            # Average the per-feature importance over the CV folds.
            mean_importance = np.mean(fold_importances, axis=0)
            # Position of the weakest feature, then its name.
            weakest_position = mean_importance.argmin()
            least_important = selected_features_names[weakest_position]
            feature_ranks[key][least_important] = len(selected_features_names)
            selected_features_names.remove(least_important)
            print('.', end='')
        print('')


In [None]:
# Either rebuild the rank table from feature_ranks and store it in Gold.PermutationFeatureRanks,
# or load the previously stored table.
if not Config.REGEN_RANKINGS:
    with engine.connect() as conn:
        df_feature_ranks = pd.read_sql_table('PermutationFeatureRanks', conn, schema='Gold')
else:
    # One column per ranking method, one row per feature.
    df_feature_ranks = pd.DataFrame(feature_ranks)
    df_feature_ranks['Mean Rank'] = df_feature_ranks.mean(axis=1)
    # Best mean rank first; the feature names move from the index into a 'Feature' column.
    df_feature_ranks = (
        df_feature_ranks
        .sort_values('Mean Rank', ascending=True)
        .reset_index(names='Feature')
    )
    with engine.connect() as conn:
        df_feature_ranks.to_sql(
            'PermutationFeatureRanks',
            conn,
            schema='Gold',
            if_exists='replace',
            dtype=sqlcol(df_feature_ranks),
            index=False,
        )
df_feature_ranks

In [None]:
# Feature names ordered best-rank-first under every ranking column of df_feature_ranks.
feats = {
    reg_name: df_feature_ranks.sort_values(reg_name)['Feature'].to_list()
    for reg_name, _ in pds.Regressors.items()
}
for feats_key, rank_column in (
    ('Spearman', 'Spearman'),
    ('Mean', 'Mean Rank'),
    ('RF_FI', 'RF_FI'),
    ('XGB_FI', 'XGB_FI'),
):
    feats[feats_key] = df_feature_ranks.sort_values(rank_column)['Feature'].to_list()


# Score the top 70, 60, ..., 10 features of each ranking.
for feat_count in range(70, 0, -10):
    k = f'Top {feat_count}'
    pp_name = 'pp' + k

    # Rankings shared by all regressors: Spearman first, then Mean.
    for feats_key, qc_name in (
        ('Spearman', 'Spearman Feat Ranking'),
        ('Mean', 'Mean Feat Ranking'),
    ):
        top_feats = feats[feats_key][:feat_count]
        tra = filtered_transformer(top_feats)
        pds.AddPreProc(tra, pp_name)
        pds.AddQCSet(pp_name, qc_name)
        _ = pds.CalculateScores(qc_name, pp_name, k, X[top_feats], y, verbose=False)

    # Register the tree-importance rankings as aliases of their regressors.
    # NOTE(review): this happens after the shared rankings of the first pass were scored;
    # whether AddQCSet's default regressor set picks the aliases up on later passes depends
    # on MultiPipe - confirm.
    pds.Regressors['RF_FI'] = pds.Regressors['Random Forest Regressor']
    pds.Regressors['XGB_FI'] = pds.Regressors['XGBoost Regressor']

    # Regressor-specific rankings: each regressor is scored on its own top features.
    for key, reg in pds.Regressors.items():
        top_feats = feats[key][:feat_count]
        tra = filtered_transformer(top_feats)
        pds.AddPreProc(tra, pp_name)
        pds.AddQCSet(pp_name, 'Regressor Feat Ranking', regs={key: reg}.items())
        _ = pds.CalculateScores(
            'Regressor Feat Ranking', pp_name, k, X[top_feats], y,
            reg_filter=[key], verbose=False,
        )

    # Individual - Tree Feat Importances
    # for label,key in [('RF_FI','Random Forest Regressor'),('XGB_FI','XGBoost Regressor')]:
    #     # pds.Regressors[label]=(label,pds.Regressors[key])
    #     tra=filtered_transformer(feats[label][:feat_count])
    #     pds.AddPreProc(tra,'pp'+ k)
    #     pds.AddQCSet('pp'+ k,'Tree Feat Importance',regs={label:pds.Regressors[key]}.items())
    #     _ = pds.CalculateScores('Tree Feat Importance','pp'+ k,k,X[feats[label][:feat_count]],y,reg_filter=[label],verbose=False)

In [None]:
def _new_metric_row():
    """Create a figure with one row of axes, one per currently active metric of pds."""
    n_metrics = len(pds.active_metrics)
    return plt.subplots(1, n_metrics, figsize=(0.5 + 5 * n_metrics, 4))


def _dash_selected(lines, labels):
    """Switch every line whose label is in `labels` to a dashed style."""
    for line in lines:
        if line.get_label() in labels:
            line.set_linestyle('--')


metric_keys = ['R^2 Score', 'RMS Error', 'Mean Absolute Error']

# Shared rankings: restrict the scores to the four base regressors.
_ = pds.GetScores(
    reg_keys=['Linear Regression', 'Random Forest Regressor', 'XGBoost Regressor', 'Linear SVR'],
    metric_keys=metric_keys,
    verbose=False,
)
fig1, axs1 = _new_metric_row()
pds.GraphScores('Spearman Feat Ranking', axs=axs1)

fig2, axs2 = _new_metric_row()
pds.GraphScores('Mean Feat Ranking', axs2)

# Regressor-specific rankings: no regressor filter.
_ = pds.GetScores(metric_keys=metric_keys, verbose=False)
fig3, axs3 = _new_metric_row()
pds.GraphScores('Regressor Feat Ranking', axs3)


# _ = pds.GetScores(reg_keys=['RF_FI','XGB_FI'] ,metric_keys=['R^2 Score','RMS Error','Mean Absolute Error'],verbose=False)
# fig4,axs4=plt.subplots(1,len(pds.active_metrics),figsize=(0.5+5*len(pds.active_metrics),4))
# pds.GraphScores('Tree Feat Importance',axs4)
# fig4.tight_layout()


# Fixed y-ranges for the first three axes of every figure, so the figures are comparable.
y_limits = [(0.015, 0.035), (0.02, 0.045), (0.85, 1.0)]
for metric_axes in (axs1, axs2, axs3):
    for position, (y_low, y_high) in enumerate(y_limits):
        metric_axes[position].set_ylim(y_low, y_high)
    for metric_ax in metric_axes:
        metric_ax.set_xlabel('Selected Features', fontsize=11)

# Dash the selected lines on the regressor-specific figure, in the plot and in its legend.
dashed_labels = ['_child8', '_child10', 'RF_FI', 'XGB_FI']
for metric_ax in axs3:
    _dash_selected(metric_ax.get_lines(), dashed_labels)
    legend = metric_ax.get_legend()
    _dash_selected(legend.get_lines(), dashed_labels)

# Title, lay out and save each figure.
for figure, title, out_path in (
    (fig1, 'HAC Feature Ranking Metrics',
     './Output Files/Images/Feature Selection/HAC_ranking_metrics.png'),
    (fig2, 'Mean Feature Ranking Metrics',
     './Output Files/Images/Feature Selection/mean_ranking_metrics.png'),
    (fig3, 'Model Specific Feature Ranking Metrics',
     './Output Files/Images/Feature Selection/indi_ranking_metrics.png'),
):
    figure.suptitle(title, fontsize=12, fontweight='bold')
    figure.tight_layout()
    figure.savefig(out_path, format='png', bbox_inches='tight')


In [None]:
# Display the shape of the Spearman correlation matrix computed over the columns of X.
spearmanr(X).correlation.shape

In [None]:
# Absolute Spearman correlation between every pair of columns of X.
corr = np.absolute(spearmanr(X).correlation)
n_feats = corr.shape[0]
# Mean absolute correlation over distinct feature pairs: drop the diagonal (self-correlation)
# and divide by the number of off-diagonal entries, n*(n-1). Dividing by (n-1)**2 would
# overstate the mean by a factor of n/(n-1).
mean_offdiag_corr = (np.sum(corr) - np.trace(corr)) / (n_feats * (n_feats - 1))
# Cell output: mean over all entries, diagonal included.
np.mean(corr)

In [None]:
# For every ranking method (every column after 'Feature'), drop features worst-rank-first and
# record the mean absolute Spearman correlation between the features that remain.
# Result: method -> {number of remaining features -> mean off-diagonal |correlation|}.
correlation_reduction = {}
for method in df_feature_ranks.columns[1:]:
    # Worst-ranked feature first, so feat_list[i:] removes the i worst features.
    feat_list = df_feature_ranks.sort_values(method, ascending=False)['Feature'].to_list()
    correlation_reduction[method] = {}
    # Stop while at least three features remain.
    for i in range(len(feat_list) - 2):
        remaining_feats = feat_list[i:]
        corr = np.absolute(spearmanr(X[remaining_feats]).correlation)
        n_remaining = corr.shape[0]
        # Exclude the diagonal and divide by the number of off-diagonal entries, n*(n-1).
        # The previous (n-1)**2 denominator inflated the value and could push it above 1.
        off_diagonal_sum = np.sum(corr) - np.trace(corr)
        correlation_reduction[method][len(remaining_feats)] = (
            off_diagonal_sum / (n_remaining * (n_remaining - 1))
        )
df_correlation_reduction = pd.DataFrame.from_dict(correlation_reduction)



In [None]:
# One line per ranking method: the recorded correlation value against the number of
# remaining features (the index of df_correlation_reduction).
fig, ax = plt.subplots(figsize=(12, 6))
for method_name, reduction_curve in df_correlation_reduction.items():
    ax.plot(reduction_curve, label=method_name)
ax.legend()