In [1]:
import pickle
import glob
from utils import *
import pandas as pd

  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)


In [2]:
fdf = get_problem_features(glob.glob('features/TSFresh.csv'))
fdf = fdf.pipe(drop_nan_and_non_unique_columns)
all_columns = list(fdf.columns)

get_problem_features took=0:01:19.968911 shape=(100000, 789)
drop_nan_and_non_unique_columns took=0:00:07.605689 shape=(100000, 323)


In [3]:
files = glob.glob('feature_importance/*.p')

In [4]:
df_dict = {x: [] for x in ['ml_algorithm', 'forecasting_algorithm', 'run', 'feature_importance_method'] + all_columns}

for file in files:
    algo = re_find('m_(.+?)_f_', file)
    run = re_find('r_(.+?)_m_', file)
    fe = re_find('_f_(.+?)\.p', file)
    
    meta = pickle.load(open(file, 'rb'))
    
    if 'shap' in fe:
        fe = 'shap'
        
    for forecasting_algo, perf_list in meta.items():
        df_dict['ml_algorithm'].append(algo)
        df_dict['forecasting_algorithm'].append(forecasting_algo)
        df_dict['run'].append(run)
        df_dict['feature_importance_method'].append(fe)
        
        perf_dict = {b:a for a, b in perf_list}
        
        for feature in all_columns:
            if feature in perf_dict:
                df_dict[feature].append(perf_dict[feature])
            else:
                df_dict[feature].append(None)

In [5]:
dffe = pd.DataFrame(df_dict)
dffe.to_csv('feature_importance/merged_feature_importance.csv')

In [6]:
dffe

Unnamed: 0,ml_algorithm,forecasting_algorithm,run,feature_importance_method,value__has_duplicate_max,value__has_duplicate_min,value__has_duplicate,value__sum_values,value__abs_energy,value__mean_abs_change,...,value__lempel_ziv_complexity__bins_3,value__lempel_ziv_complexity__bins_5,value__lempel_ziv_complexity__bins_10,value__lempel_ziv_complexity__bins_100,value__permutation_entropy__dimension_3__tau_1,value__permutation_entropy__dimension_4__tau_1,value__permutation_entropy__dimension_5__tau_1,value__permutation_entropy__dimension_6__tau_1,value__permutation_entropy__dimension_7__tau_1,value__mean_n_absolute_max__number_of_maxima_7
0,M4RandomForestSingleOutputRegressor,106,24,shap,0.000287,,,,,,...,,,,,0.004303,,,,,
1,M4RandomForestRegressor,256,18,shap,0.000317,,,,,,...,,,,,0.003058,,,,,
2,M4DummyMeanRegression,243,5,shap,0.000000,,,,,,...,,,,,0.000000,,,,,
3,M4XGBRegressor,Theta,12,shap,0.000323,,,,,,...,,,,,0.004436,,,,,
4,M4XGBRegressor,078,15,shap,0.000292,,,,,,...,,,,,0.004712,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29275,M4RandomForestRegressor,sNaive,3,shap,0.000301,,,,,,...,,,,,0.002781,,,,,
29276,M4DummyMeanRegression,078,6,shap,0.000000,,,,,,...,,,,,0.000000,,,,,
29277,M4XGBRegressor,219,12,shap,0.000586,,,,,,...,,,,,0.004665,,,,,
29278,M4DummyMeanRegression,126,5,shap,0.000000,,,,,,...,,,,,0.000000,,,,,


In [7]:
check_if_complete_df = dffe.groupby(['ml_algorithm', 'forecasting_algorithm', 'feature_importance_method']).count().reset_index()

In [8]:
check_if_complete_df.query("run!=30")

Unnamed: 0,ml_algorithm,forecasting_algorithm,feature_importance_method,run,value__has_duplicate_max,value__has_duplicate_min,value__has_duplicate,value__sum_values,value__abs_energy,value__mean_abs_change,...,value__lempel_ziv_complexity__bins_3,value__lempel_ziv_complexity__bins_5,value__lempel_ziv_complexity__bins_10,value__lempel_ziv_complexity__bins_100,value__permutation_entropy__dimension_3__tau_1,value__permutation_entropy__dimension_4__tau_1,value__permutation_entropy__dimension_5__tau_1,value__permutation_entropy__dimension_6__tau_1,value__permutation_entropy__dimension_7__tau_1,value__mean_n_absolute_max__number_of_maxima_7


In [9]:
check_if_complete_df['run'].mean()

30.0

In [10]:
check_if_complete_df.sort_values(['run'])

Unnamed: 0,ml_algorithm,forecasting_algorithm,feature_importance_method,run,value__has_duplicate_max,value__has_duplicate_min,value__has_duplicate,value__sum_values,value__abs_energy,value__mean_abs_change,...,value__lempel_ziv_complexity__bins_3,value__lempel_ziv_complexity__bins_5,value__lempel_ziv_complexity__bins_10,value__lempel_ziv_complexity__bins_100,value__permutation_entropy__dimension_3__tau_1,value__permutation_entropy__dimension_4__tau_1,value__permutation_entropy__dimension_5__tau_1,value__permutation_entropy__dimension_6__tau_1,value__permutation_entropy__dimension_7__tau_1,value__mean_n_absolute_max__number_of_maxima_7
0,M4DummyMeanRegression,005,permutation,30,30,0,0,0,0,0,...,0,0,0,0,30,0,0,0,0,0
643,M4XGBRegressor,104,xgboost-gain,30,30,0,0,0,0,0,...,0,0,0,0,30,0,0,0,0,0
644,M4XGBRegressor,104,xgboost-total_cover,30,30,0,0,0,0,0,...,0,0,0,0,30,0,0,0,0,0
645,M4XGBRegressor,104,xgboost-total_gain,30,30,0,0,0,0,0,...,0,0,0,0,30,0,0,0,0,0
646,M4XGBRegressor,104,xgboost-weight,30,30,0,0,0,0,0,...,0,0,0,0,30,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330,M4RandomForestRegressor,252,permutation,30,30,0,0,0,0,0,...,0,0,0,0,30,0,0,0,0,0
331,M4RandomForestRegressor,252,shap,30,30,0,0,0,0,0,...,0,0,0,0,30,0,0,0,0,0
332,M4RandomForestRegressor,253,permutation,30,30,0,0,0,0,0,...,0,0,0,0,30,0,0,0,0,0
334,M4RandomForestRegressor,255,permutation,30,30,0,0,0,0,0,...,0,0,0,0,30,0,0,0,0,0
