In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os 
import sklearn
import shap 
import time
import math
import seaborn as sns
import pathlib
import statistics

In [2]:
# Global matplotlib styling: bold text for fonts, axes and figure labels/titles,
# one shared base font size, and no offset notation on axis tick labels.
font_size_plot = 22
plt.rcParams.update({
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'figure.labelweight': 'bold',
    'figure.titleweight': 'bold',
    'font.size': font_size_plot,
    'axes.formatter.useoffset': False,
})

# Absolute path of the current working directory; later cells derive the
# location of the result folders from it.
path = os.path.abspath('')
os.chdir(path)

In [3]:
# Names used to build the result folder / file names and the columns that are
# aggregated in the cells below.
method_name_list = [
    'Handcode',
    'Ascending',
    'Descending',
]
meta_model_list = [
    'DT',
    'KNN',
    'LIN',
]
feature_importance_analysis_list = [
    'FPI',
    'PDP',
    'SHAP',
]
# Metric columns read from the PREDICTION*.csv files.
metrics_list = [
    'MAE',
    'MSE',
    'RMSE',
    'R2',
    'MAPE',
]

Standard deviation (std) of the prediction metrics and feature-importance values, per seed

In [4]:
# Standard deviation of the prediction metrics and of the feature-importance
# values, computed per seed over the rows of every tuned-model result file,
# then written to the 'std data' folder.

# Root folder of the tuned-model results: a sibling of the notebook's folder.
# Built with pathlib instead of splitting/joining on '\\', which only produced
# a valid path for Windows-style separators.
tuned_model_root = pathlib.Path(path).parent / 'tuned model'

# Per feature-importance analysis: the column that, together with SEED,
# identifies one group of rows, and the columns excluded from the aggregation.
feature_importance_layout = {
    'FPI': ('SHUFFLE', ['TIME', 'SEED', 'ORDER', 'SHUFFLE']),
    'PDP': ('INDEX', ['SEED', 'ORDER', 'INDEX', 'PDPVALUES']),
    'SHAP': ('INDEX', ['SEED', 'ORDER', 'INDEX']),
}

# Partial results are collected in lists and concatenated once at the end;
# calling pd.concat inside the loops re-copies the growing frame every time.
normal_prediction_frames = []
feature_importance_rows = {analysis: [] for analysis in feature_importance_layout}

for method_name in method_name_list:
    for meta_model in meta_model_list:
        for feature_importance_analysis in feature_importance_analysis_list:
            folder_name = tuned_model_root / method_name / f'{feature_importance_analysis}CSVMETA{meta_model}'
            file_suffix = f'{feature_importance_analysis}{meta_model}.csv'
            # The ADDITIONAL*.csv file is no longer read here: its contents were
            # never used and a bare `except:` hid every error raised by the read.
            prediction_dataframe = pd.read_csv(folder_name / f'PREDICTION{file_suffix}')
            feature_importance_dataframe = pd.read_csv(folder_name / f'RESULTS{file_suffix}')

            # normal prediction: one row per seed holding the std of each metric
            seed_normal_prediction_std_dataframe = pd.DataFrame(columns=metrics_list + ['SEED'])
            for unique_seed in prediction_dataframe['SEED'].unique():
                seed_rows = prediction_dataframe[prediction_dataframe['SEED'] == unique_seed]
                seed_std_list = [seed_rows[metric].std() for metric in metrics_list]
                seed_normal_prediction_std_dataframe.loc[unique_seed] = seed_std_list + [unique_seed]
            seed_normal_prediction_std_dataframe['METHOD'] = method_name
            seed_normal_prediction_std_dataframe['META'] = meta_model
            seed_normal_prediction_std_dataframe['FEATUREIMPORTANCE'] = feature_importance_analysis
            normal_prediction_frames.append(seed_normal_prediction_std_dataframe)

            # feature importance: one row per (seed, group) pair holding the std
            # of every column that is not listed as excluded for this analysis
            layout = feature_importance_layout.get(feature_importance_analysis)
            if layout is None:
                # analyses without a known layout only contribute predictions
                continue
            group_column, excluded_columns = layout
            interested_column = feature_importance_dataframe.columns.drop(excluded_columns).to_list()
            for unique_seed in feature_importance_dataframe['SEED'].unique():
                for unique_group in feature_importance_dataframe[group_column].unique():
                    group_mask = (
                        (feature_importance_dataframe['SEED'] == unique_seed)
                        & (feature_importance_dataframe[group_column] == unique_group)
                    )
                    temp_df = feature_importance_dataframe.loc[group_mask, interested_column].std()
                    temp_df['METHOD'] = method_name
                    temp_df['META'] = meta_model
                    temp_df['FEATUREIMPORTANCE'] = feature_importance_analysis
                    temp_df[group_column] = unique_group
                    temp_df['SEED'] = unique_seed
                    feature_importance_rows[feature_importance_analysis].append(pd.DataFrame(temp_df).transpose())

# The lists are reversed before concatenation so the most recently computed
# rows come first, which is the row order the saved CSV files have always had.
all_plot_normal_prediction_dataframe = (
    pd.concat(normal_prediction_frames[::-1], axis='rows') if normal_prediction_frames else pd.DataFrame()
)
all_plot_fpi_prediction_dataframe = (
    pd.concat(feature_importance_rows['FPI'][::-1], axis='rows') if feature_importance_rows['FPI'] else pd.DataFrame()
)
all_plot_pdp_prediction_dataframe = (
    pd.concat(feature_importance_rows['PDP'][::-1], axis='rows') if feature_importance_rows['PDP'] else pd.DataFrame()
)
all_plot_shap_prediction_dataframe = (
    pd.concat(feature_importance_rows['SHAP'][::-1], axis='rows') if feature_importance_rows['SHAP'] else pd.DataFrame()
)

# make new save folder
std_output_dir = pathlib.Path('std data')
std_output_dir.mkdir(parents=True, exist_ok=True)
# only need 1 normal prediction because they are all the same for each feature importance analysis
all_plot_normal_prediction_dataframe = all_plot_normal_prediction_dataframe[
    all_plot_normal_prediction_dataframe['FEATUREIMPORTANCE'] == 'FPI'
].drop(['FEATUREIMPORTANCE'], axis='columns')
all_plot_normal_prediction_dataframe.to_csv(std_output_dir / 'stdnormalpredictiondataframe.csv', index=False)
all_plot_fpi_prediction_dataframe.to_csv(std_output_dir / 'stdfpipredictiondataframe.csv', index=False)
all_plot_pdp_prediction_dataframe.to_csv(std_output_dir / 'stdpdppredictiondataframe.csv', index=False)
all_plot_shap_prediction_dataframe.to_csv(std_output_dir / 'stdshappredictiondataframe.csv', index=False)
# copy the additional information needed for the plots next to the results
pd.read_csv(tuned_model_root / 'Ascending' / 'PDPCSVMETADT' / 'ADDITIONALPDPDT.csv').to_csv(
    std_output_dir / 'additionalpdp.csv', index=False
)
pd.read_csv(tuned_model_root / 'Ascending' / 'SHAPCSVMETADT' / 'ADDITIONALSHAPDT.csv').to_csv(
    std_output_dir / 'additionalshap.csv', index=False
)

Mean of the prediction metrics and feature-importance values, per seed

In [5]:
# Mean of the prediction metrics and of the feature-importance values,
# computed per seed over the rows of every tuned-model result file, then
# written to the 'mean data' folder.

# Root folder of the tuned-model results: a sibling of the notebook's folder.
# Built with pathlib instead of splitting/joining on '\\', which only produced
# a valid path for Windows-style separators.
tuned_model_root = pathlib.Path(path).parent / 'tuned model'

# Per feature-importance analysis: the column that, together with SEED,
# identifies one group of rows, and the columns excluded from the aggregation.
feature_importance_layout = {
    'FPI': ('SHUFFLE', ['TIME', 'SEED', 'ORDER', 'SHUFFLE']),
    'PDP': ('INDEX', ['SEED', 'ORDER', 'INDEX', 'PDPVALUES']),
    'SHAP': ('INDEX', ['SEED', 'ORDER', 'INDEX']),
}

# Partial results are collected in lists and concatenated once at the end;
# calling pd.concat inside the loops re-copies the growing frame every time.
normal_prediction_frames = []
feature_importance_rows = {analysis: [] for analysis in feature_importance_layout}

for method_name in method_name_list:
    for meta_model in meta_model_list:
        for feature_importance_analysis in feature_importance_analysis_list:
            folder_name = tuned_model_root / method_name / f'{feature_importance_analysis}CSVMETA{meta_model}'
            file_suffix = f'{feature_importance_analysis}{meta_model}.csv'
            # The ADDITIONAL*.csv file is no longer read here: its contents were
            # never used and a bare `except:` hid every error raised by the read.
            prediction_dataframe = pd.read_csv(folder_name / f'PREDICTION{file_suffix}')
            feature_importance_dataframe = pd.read_csv(folder_name / f'RESULTS{file_suffix}')

            # normal prediction: one row per seed holding the mean of each metric
            seed_normal_prediction_mean_dataframe = pd.DataFrame(columns=metrics_list + ['SEED'])
            for unique_seed in prediction_dataframe['SEED'].unique():
                seed_rows = prediction_dataframe[prediction_dataframe['SEED'] == unique_seed]
                seed_mean_list = [seed_rows[metric].mean() for metric in metrics_list]
                seed_normal_prediction_mean_dataframe.loc[unique_seed] = seed_mean_list + [unique_seed]
            seed_normal_prediction_mean_dataframe['METHOD'] = method_name
            seed_normal_prediction_mean_dataframe['META'] = meta_model
            seed_normal_prediction_mean_dataframe['FEATUREIMPORTANCE'] = feature_importance_analysis
            normal_prediction_frames.append(seed_normal_prediction_mean_dataframe)

            # feature importance: one row per (seed, group) pair holding the mean
            # of every column that is not listed as excluded for this analysis
            layout = feature_importance_layout.get(feature_importance_analysis)
            if layout is None:
                # analyses without a known layout only contribute predictions
                continue
            group_column, excluded_columns = layout
            interested_column = feature_importance_dataframe.columns.drop(excluded_columns).to_list()
            for unique_seed in feature_importance_dataframe['SEED'].unique():
                for unique_group in feature_importance_dataframe[group_column].unique():
                    group_mask = (
                        (feature_importance_dataframe['SEED'] == unique_seed)
                        & (feature_importance_dataframe[group_column] == unique_group)
                    )
                    temp_df = feature_importance_dataframe.loc[group_mask, interested_column].mean()
                    temp_df['METHOD'] = method_name
                    temp_df['META'] = meta_model
                    temp_df['FEATUREIMPORTANCE'] = feature_importance_analysis
                    temp_df[group_column] = unique_group
                    temp_df['SEED'] = unique_seed
                    feature_importance_rows[feature_importance_analysis].append(pd.DataFrame(temp_df).transpose())

# The lists are reversed before concatenation so the most recently computed
# rows come first, which is the row order the saved CSV files have always had.
all_plot_normal_prediction_dataframe = (
    pd.concat(normal_prediction_frames[::-1], axis='rows') if normal_prediction_frames else pd.DataFrame()
)
all_plot_fpi_prediction_dataframe = (
    pd.concat(feature_importance_rows['FPI'][::-1], axis='rows') if feature_importance_rows['FPI'] else pd.DataFrame()
)
all_plot_pdp_prediction_dataframe = (
    pd.concat(feature_importance_rows['PDP'][::-1], axis='rows') if feature_importance_rows['PDP'] else pd.DataFrame()
)
all_plot_shap_prediction_dataframe = (
    pd.concat(feature_importance_rows['SHAP'][::-1], axis='rows') if feature_importance_rows['SHAP'] else pd.DataFrame()
)

# make new save folder
mean_output_dir = pathlib.Path('mean data')
mean_output_dir.mkdir(parents=True, exist_ok=True)
# only need 1 normal prediction because they are all the same for each feature importance analysis
all_plot_normal_prediction_dataframe = all_plot_normal_prediction_dataframe[
    all_plot_normal_prediction_dataframe['FEATUREIMPORTANCE'] == 'FPI'
].drop(['FEATUREIMPORTANCE'], axis='columns')
all_plot_normal_prediction_dataframe.to_csv(mean_output_dir / 'meannormalpredictiondataframe.csv', index=False)
all_plot_fpi_prediction_dataframe.to_csv(mean_output_dir / 'meanfpipredictiondataframe.csv', index=False)
all_plot_pdp_prediction_dataframe.to_csv(mean_output_dir / 'meanpdppredictiondataframe.csv', index=False)
all_plot_shap_prediction_dataframe.to_csv(mean_output_dir / 'meanshappredictiondataframe.csv', index=False)
# copy the additional information needed for the plots next to the results
pd.read_csv(tuned_model_root / 'Ascending' / 'PDPCSVMETADT' / 'ADDITIONALPDPDT.csv').to_csv(
    mean_output_dir / 'additionalpdp.csv', index=False
)
pd.read_csv(tuned_model_root / 'Ascending' / 'SHAPCSVMETADT' / 'ADDITIONALSHAPDT.csv').to_csv(
    mean_output_dir / 'additionalshap.csv', index=False
)