In [1]:
import pandas as pd
import scipy.stats as stats
import researchpy as rp
import statsmodels.api as sm
from statsmodels.formula.api import ols
import os
import ast
import numpy as np
import matplotlib.pyplot as plt


# Analysis <font color=blue>after </font> combining RF, DT and SVM into 1C 10 min

In [2]:
def get_class (model):
    if isinstance(model, str):
        model = '{' +'}'.join('{'.join(model.split('{')[1:]).split('}')[0:1]) + '}'
        m = ast.literal_eval(model)
        if 'classifier:__choice__' in m:
            classifier = m['classifier:__choice__']
        else:
            classifier = ''
        return classifier
    else:
        return ''


def parse_tpot(directory):
    result = pd.DataFrame(columns=['dataset', 'accuracy', 'model', 'precision', 'recall', 'f1score', 'time_budget', 'methods'])
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                method = file.split('.')[0].split('_')[2]
                time_budget = file.split('.')[0].split('_')[3]
                run = []
                sub_result = pd.read_csv(os.path.join(subdir, file))
                sub_result.rename(columns={'Unnamed: 0': 'dataset'}, inplace = True)
                run.append(sub_result[['dataset', 'accuracy_1', 'model_1', 'precision_1', 'recall_1', 'f1score_1']])
                run.append(sub_result[['dataset', 'accuracy_2', 'model_2', 'precision_2', 'recall_2', 'f1score_2']])
                run.append(sub_result[['dataset', 'accuracy_3', 'model_3', 'precision_3', 'recall_3', 'f1score_3']])
                for i in range(3):
                    run[i].rename(columns={'accuracy_'+str(i+1): 'accuracy',
                                           'model_'+str(i+1): 'model',
                                           'precision_'+str(i+1): 'precision',
                                           'recall_'+str(i+1): 'recall',
                                           'f1score_'+str(i+1): 'f1score'}, inplace=True)
                    run[i]['methods'] = str(method)
                    run[i]['time_budget'] = int(time_budget)
                    result = pd.concat([result, run[i]], axis=0, sort=True, ignore_index=True)
    result.model = result.model.apply(get_class)
    result = result[~np.isnan(result.f1score)]
    return result
#parse_tpot(r"C:\Users\HassanEldeeb\Documents\GitHub\AutoMLBenchmarking\logs_search_space/")

In [3]:
df = parse_tpot(r"C:\Users\HassanEldeeb\Documents\GitHub\AutoMLBenchmarking\logs_search_space/")
df = df[['dataset', 'time_budget', 'methods', 'f1score']]
df.methods = df.methods.replace("default", "fc")
df.methods = df.methods.replace("3C", "3c")
df.methods = df.methods.replace("SVC", "1c")
df.methods = df.methods.replace("DT", "1c")
df.methods = df.methods.replace("RF", "1c")
fsIs_b = ['vowel', 'openml_phpJNxH0q', 'dataset_31_credit-g', 'dataset_40_sonar']
fsIs_m = ['solar-flare_1', 'wine-quality-red', 'dataset_39_ecoli', 'synthetic_control']
fsIl_b = ['AirlinesCodrnaAdult', 'MagicTelescope', 'electricity-normalized', 'phpmPOD5A']
fsIl_m = ['pokerhand-normalized', 'eye_movements', 'avila-tr']
flIs_b = ['audiology', 'arrhythmia', 'AP_Breast_Lung', 'AP_Omentum_Ovary']
flIs_m = ['Amazon', 'umistfacescropped', 'phpGUrE90']
flIl_b = ['gina_agnostic', 'hiva_agnostic', 'phpZrCzJR', 'phprAeXmK']
flIl_m = ['KDDCup99', 'connect-4', 'dataset_60_waveform-5000', 'dataset_186_satimage']
df_binary = df[df.dataset.isin(fsIs_b) | df.dataset.isin(fsIl_b) | df.dataset.isin(flIs_b) | df.dataset.isin(flIl_b)]
df_multi = df[df.dataset.isin(fsIs_m) | df.dataset.isin(fsIl_m) | df.dataset.isin(flIs_m) | df.dataset.isin(flIl_m)]
#df.drop(df[(df.methods=='fc') & ((df.time_budget==30) | (df.time_budget==10))].index, inplace=True)
#df.drop(df[(df.methods=='3c') & ((df.time_budget==60) | (df.time_budget==10))].index, inplace=True)
df=df_binary
df10 = df[df.time_budget==10]
df30 = df[df.time_budget==30]
df60 = df[df.time_budget==60]

df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(343, 4)

In [4]:
rp.summary_cont(df10['f1score'])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,f1score,105.0,0.726433,0.290676,0.028367,0.67018,0.782686


In [5]:
rp.summary_cont(df30['f1score'])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,f1score,112.0,0.719853,0.299978,0.028345,0.663685,0.776021


In [6]:
rp.summary_cont(df60['f1score'])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,f1score,126.0,0.71997,0.293397,0.026138,0.66824,0.7717


In [7]:
stats.f_oneway(df10['f1score'][df10['methods'] == '1c'], 
             df10['f1score'][df10['methods'] == '3c'],
             df10['f1score'][df10['methods'] == 'fc'])

F_onewayResult(statistic=4.897053243844356, pvalue=0.009316319845083165)

In [8]:
stats.f_oneway(df30['f1score'][df30['methods'] == '1c'], 
             df30['f1score'][df30['methods'] == '3c'],
             df30['f1score'][df30['methods'] == 'fc'])

F_onewayResult(statistic=6.79851696978441, pvalue=0.0016503883328218578)

In [9]:
stats.f_oneway(df60['f1score'][df60['methods'] == '1c'], 
             df60['f1score'][df60['methods'] == '3c'],
             df60['f1score'][df60['methods'] == 'fc'])

F_onewayResult(statistic=8.796702102734306, pvalue=0.00026873246292863557)

In [10]:
    # Fits the model with the interaction term
    # This will also automatically include the main effects for each factor
    model10 = ols('f1score ~ C(methods)', df10).fit()

    # Seeing if the overall model is significant
    print(f"Overall model F({model10.df_model: .0f},{model10.df_resid: .0f}) = {model10.fvalue: .3f}, p = {model10.f_pvalue: .4f}")
    model10.summary()

Overall model F( 2, 102) =  4.897, p =  0.0093


0,1,2,3
Dep. Variable:,f1score,R-squared:,0.088
Model:,OLS,Adj. R-squared:,0.07
Method:,Least Squares,F-statistic:,4.897
Date:,"Fri, 01 Nov 2019",Prob (F-statistic):,0.00932
Time:,22:27:21,Log-Likelihood:,-13.94
No. Observations:,105,AIC:,33.88
Df Residuals:,102,BIC:,41.84
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6483,0.037,17.457,0.000,0.575,0.722
C(methods)[T.3c],0.1843,0.069,2.661,0.009,0.047,0.322
C(methods)[T.fc],0.1588,0.067,2.361,0.020,0.025,0.292

0,1,2,3
Omnibus:,18.727,Durbin-Watson:,2.208
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.366
Skew:,-1.151,Prob(JB):,8.43e-06
Kurtosis:,3.211,Cond. No.,3.18


In [11]:
    # Fits the model with the interaction term
    # This will also automatically include the main effects for each factor
    model30 = ols('f1score ~ C(methods)', df30).fit()

    # Seeing if the overall model is significant
    print(f"Overall model F({model30.df_model: .0f},{model30.df_resid: .0f}) = {model30.fvalue: .3f}, p = {model30.f_pvalue: .4f}")
    model30.summary()

Overall model F( 2, 109) =  6.799, p =  0.0017


0,1,2,3
Dep. Variable:,f1score,R-squared:,0.111
Model:,OLS,Adj. R-squared:,0.095
Method:,Least Squares,F-statistic:,6.799
Date:,"Fri, 01 Nov 2019",Prob (F-statistic):,0.00165
Time:,22:27:21,Log-Likelihood:,-16.983
No. Observations:,112,AIC:,39.97
Df Residuals:,109,BIC:,48.12
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6306,0.036,17.395,0.000,0.559,0.702
C(methods)[T.3c],0.1950,0.072,2.706,0.008,0.052,0.338
C(methods)[T.fc],0.2036,0.064,3.170,0.002,0.076,0.331

0,1,2,3
Omnibus:,18.144,Durbin-Watson:,2.195
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.542
Skew:,-1.098,Prob(JB):,1.27e-05
Kurtosis:,3.085,Cond. No.,3.21


In [12]:
    # Fits the model with the interaction term
    # This will also automatically include the main effects for each factor
    model60 = ols('f1score ~ C(methods)', df60).fit()

    # Seeing if the overall model is significant
    print(f"Overall model F({model60.df_model: .0f},{model60.df_resid: .0f}) = {model60.fvalue: .3f}, p = {model60.f_pvalue: .4f}")
    model10.summary()

Overall model F( 2, 123) =  8.797, p =  0.0003


0,1,2,3
Dep. Variable:,f1score,R-squared:,0.088
Model:,OLS,Adj. R-squared:,0.07
Method:,Least Squares,F-statistic:,4.897
Date:,"Fri, 01 Nov 2019",Prob (F-statistic):,0.00932
Time:,22:27:21,Log-Likelihood:,-13.94
No. Observations:,105,AIC:,33.88
Df Residuals:,102,BIC:,41.84
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6483,0.037,17.457,0.000,0.575,0.722
C(methods)[T.3c],0.1843,0.069,2.661,0.009,0.047,0.322
C(methods)[T.fc],0.1588,0.067,2.361,0.020,0.025,0.292

0,1,2,3
Omnibus:,18.727,Durbin-Watson:,2.208
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.366
Skew:,-1.151,Prob(JB):,8.43e-06
Kurtosis:,3.211,Cond. No.,3.18


In [13]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

mc = MultiComparison(df10['f1score'], df10['methods'])
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
    1c     3c   0.1843 0.0244  0.0195  0.349   True
    1c     fc   0.1588 0.0521 -0.0012 0.3188  False
    3c     fc  -0.0255    0.9 -0.2181 0.1672  False
---------------------------------------------------


In [14]:
mc = MultiComparison(df30['f1score'], df30['methods'])
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
    1c     3c    0.195 0.0214  0.0238 0.3663   True
    1c     fc   0.2036 0.0056   0.051 0.3562   True
    3c     fc   0.0085    0.9 -0.1858 0.2029  False
---------------------------------------------------


In [15]:
mc = MultiComparison(df60['f1score'], df60['methods'])
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
    1c     3c   0.2074 0.0046  0.0548 0.3601   True
    1c     fc   0.2093  0.002  0.0663 0.3522   True
    3c     fc   0.0018    0.9 -0.1759 0.1796  False
---------------------------------------------------


# Analysis <font color=blue>without </font> combining RF, DT and SVM into 1C 10 min

In [16]:
df = pd.read_excel(r"C:\Users\HassanEldeeb\Documents\GitHub\AutoMLBenchmarking\logs_search_space/skout.xlsx")
df = df[['time_budget', 'methods', 'f1score']]
df.methods = df.methods.replace("['adaboost', 'bernoulli_nb', 'decision_tree', 'extra_trees', 'gaussian_nb', 'gradient_boosting', 'k_nearest_neighbors', 'lda', 'liblinear_svc', 'libsvm_svc', 'multinomial_nb', 'passive_aggressive', 'qda', 'random_forest', 'sgd']", "fc")
df.methods = df.methods.replace("['decision_tree', 'libsvm_svc', 'random_forest']", "3c")
df.methods = df.methods.replace("['libsvm_svc']", "svc")
df.methods = df.methods.replace("['decision_tree']", "dt")
df.methods = df.methods.replace("['random_forest']", "rf")
df10 = df[df.time_budget==10]
df30 = df[df.time_budget==30]
df60 = df[df.time_budget==60]
df.shape

(900, 3)

In [17]:
rp.summary_cont(df10['f1score'])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,f1score,300.0,0.748181,0.241727,0.013956,0.720717,0.775646


In [18]:
rp.summary_cont(df30['f1score'])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,f1score,300.0,0.77951,0.202243,0.011677,0.756531,0.802488


In [19]:
rp.summary_cont(df60['f1score'])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,f1score,300.0,0.772674,0.198329,0.011451,0.75014,0.795208


In [20]:
stats.f_oneway(df10['f1score'][df10['methods'] == '1c'], 
             df10['f1score'][df10['methods'] == '3c'],
             df10['f1score'][df10['methods'] == 'fc'])

  ssbn += _square_of_sums(a - offset) / len(a)


F_onewayResult(statistic=nan, pvalue=nan)

In [21]:
stats.f_oneway(df30['f1score'][df30['methods'] == '1c'], 
             df30['f1score'][df30['methods'] == '3c'],
             df30['f1score'][df30['methods'] == 'fc'])

F_onewayResult(statistic=nan, pvalue=nan)

In [22]:
stats.f_oneway(df60['f1score'][df60['methods'] == '1c'], 
             df60['f1score'][df60['methods'] == '3c'],
             df60['f1score'][df60['methods'] == 'fc'])

F_onewayResult(statistic=nan, pvalue=nan)

In [23]:
    # Fits the model with the interaction term
    # This will also automatically include the main effects for each factor
    model10 = ols('f1score ~ C(methods)', df10).fit()

    # Seeing if the overall model is significant
    print(f"Overall model F({model10.df_model: .0f},{model10.df_resid: .0f}) = {model10.fvalue: .3f}, p = {model10.f_pvalue: .4f}")
    model10.summary()

Overall model F( 4, 295) =  5.433, p =  0.0003


0,1,2,3
Dep. Variable:,f1score,R-squared:,0.069
Model:,OLS,Adj. R-squared:,0.056
Method:,Least Squares,F-statistic:,5.433
Date:,"Fri, 01 Nov 2019",Prob (F-statistic):,0.000311
Time:,22:27:22,Log-Likelihood:,11.466
No. Observations:,300,AIC:,-12.93
Df Residuals:,295,BIC:,5.587
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8037,0.030,26.505,0.000,0.744,0.863
C(methods)[T.dt],-0.0791,0.043,-1.844,0.066,-0.163,0.005
C(methods)[T.fc],0.0012,0.043,0.028,0.978,-0.083,0.086
C(methods)[T.rf],-0.0318,0.043,-0.743,0.458,-0.116,0.053
C(methods)[T.svc],-0.1676,0.043,-3.909,0.000,-0.252,-0.083

0,1,2,3
Omnibus:,62.558,Durbin-Watson:,0.434
Prob(Omnibus):,0.0,Jarque-Bera (JB):,98.299
Skew:,-1.246,Prob(JB):,4.5100000000000005e-22
Kurtosis:,4.287,Cond. No.,5.83


In [24]:
    # Fits the model with the interaction term
    # This will also automatically include the main effects for each factor
    model30 = ols('f1score ~ C(methods)', df30).fit()

    # Seeing if the overall model is significant
    print(f"Overall model F({model30.df_model: .0f},{model30.df_resid: .0f}) = {model30.fvalue: .3f}, p = {model30.f_pvalue: .4f}")
    model30.summary()

Overall model F( 4, 295) =  3.543, p =  0.0077


0,1,2,3
Dep. Variable:,f1score,R-squared:,0.046
Model:,OLS,Adj. R-squared:,0.033
Method:,Least Squares,F-statistic:,3.543
Date:,"Fri, 01 Nov 2019",Prob (F-statistic):,0.00766
Time:,22:27:22,Log-Likelihood:,61.342
No. Observations:,300,AIC:,-112.7
Df Residuals:,295,BIC:,-94.17
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8084,0.026,31.486,0.000,0.758,0.859
C(methods)[T.dt],-0.0452,0.036,-1.244,0.214,-0.117,0.026
C(methods)[T.fc],0.0017,0.036,0.047,0.963,-0.070,0.073
C(methods)[T.rf],0.0058,0.036,0.160,0.873,-0.066,0.077
C(methods)[T.svc],-0.1070,0.036,-2.947,0.003,-0.178,-0.036

0,1,2,3
Omnibus:,57.172,Durbin-Watson:,0.495
Prob(Omnibus):,0.0,Jarque-Bera (JB):,87.881
Skew:,-1.146,Prob(JB):,8.26e-20
Kurtosis:,4.332,Cond. No.,5.83


In [25]:
    # Fits the model with the interaction term
    # This will also automatically include the main effects for each factor
    model60 = ols('f1score ~ C(methods)', df60).fit()

    # Seeing if the overall model is significant
    print(f"Overall model F({model60.df_model: .0f},{model60.df_resid: .0f}) = {model60.fvalue: .3f}, p = {model60.f_pvalue: .4f}")
    model10.summary()

Overall model F( 4, 295) =  2.383, p =  0.0515


0,1,2,3
Dep. Variable:,f1score,R-squared:,0.069
Model:,OLS,Adj. R-squared:,0.056
Method:,Least Squares,F-statistic:,5.433
Date:,"Fri, 01 Nov 2019",Prob (F-statistic):,0.000311
Time:,22:27:22,Log-Likelihood:,11.466
No. Observations:,300,AIC:,-12.93
Df Residuals:,295,BIC:,5.587
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8037,0.030,26.505,0.000,0.744,0.863
C(methods)[T.dt],-0.0791,0.043,-1.844,0.066,-0.163,0.005
C(methods)[T.fc],0.0012,0.043,0.028,0.978,-0.083,0.086
C(methods)[T.rf],-0.0318,0.043,-0.743,0.458,-0.116,0.053
C(methods)[T.svc],-0.1676,0.043,-3.909,0.000,-0.252,-0.083

0,1,2,3
Omnibus:,62.558,Durbin-Watson:,0.434
Prob(Omnibus):,0.0,Jarque-Bera (JB):,98.299
Skew:,-1.246,Prob(JB):,4.5100000000000005e-22
Kurtosis:,4.287,Cond. No.,5.83


In [26]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

mc = MultiComparison(df10['f1score'], df10['methods'])
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
    3c     dt  -0.0791 0.3507 -0.1968  0.0386  False
    3c     fc   0.0012    0.9 -0.1165  0.1189  False
    3c     rf  -0.0318    0.9 -0.1495  0.0858  False
    3c    svc  -0.1676 0.0011 -0.2853 -0.0499   True
    dt     fc   0.0803  0.335 -0.0374   0.198  False
    dt     rf   0.0472 0.7803 -0.0705  0.1649  False
    dt    svc  -0.0885  0.238 -0.2062  0.0291  False
    fc     rf   -0.033    0.9 -0.1507  0.0847  False
    fc    svc  -0.1688  0.001 -0.2865 -0.0511   True
    rf    svc  -0.1358 0.0146 -0.2535 -0.0181   True
----------------------------------------------------


In [27]:
mc = MultiComparison(df30['f1score'], df30['methods'])
mc_results = mc.tukeyhsd(model30.f_pvalue)
print(mc_results)

Multiple Comparison of Means - Tukey HSD, FWER=0.01
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
    3c     dt  -0.0452 0.6994 -0.1674  0.077  False
    3c     fc   0.0017    0.9 -0.1205 0.1239  False
    3c     rf   0.0058    0.9 -0.1164  0.128  False
    3c    svc   -0.107 0.0284 -0.2292 0.0152  False
    dt     fc   0.0469  0.673 -0.0753 0.1691  False
    dt     rf    0.051  0.609 -0.0712 0.1732  False
    dt    svc  -0.0618 0.4356  -0.184 0.0604  False
    fc     rf   0.0041    0.9 -0.1181 0.1263  False
    fc    svc  -0.1087 0.0247 -0.2309 0.0135  False
    rf    svc  -0.1128 0.0176  -0.235 0.0094  False
---------------------------------------------------


In [28]:
mc = MultiComparison(df60['f1score'], df60['methods'])
mc_results = mc.tukeyhsd(model60.f_pvalue)
print(mc_results)

Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
    3c     dt  -0.0309    0.9 -0.1289  0.0672  False
    3c     fc   0.0296    0.9 -0.0685  0.1277  False
    3c     rf   -0.002    0.9 -0.1001  0.0961  False
    3c    svc  -0.0742  0.237 -0.1723  0.0239  False
    dt     fc   0.0605 0.4463 -0.0376  0.1585  False
    dt     rf   0.0289    0.9 -0.0692  0.1269  False
    dt    svc  -0.0433 0.7204 -0.1414  0.0548  False
    fc     rf  -0.0316    0.9 -0.1297  0.0665  False
    fc    svc  -0.1038 0.0332 -0.2019 -0.0057   True
    rf    svc  -0.0722 0.2629 -0.1703  0.0259  False
----------------------------------------------------
