# New metrics

In [1]:
import scipy.stats
import numpy as np
import pandas as pd
import math
import os

In [2]:
# Root folder holding one sub-directory of metrics per dataset
metrics_path = os.path.join('C:/', 'Users', 'Colin', 'Desktop', 'metrics_full_final')

# Keep only the sub-directories of the metrics folder; each one is treated as a dataset
dataset_entries = [entry for entry in os.scandir(metrics_path) if entry.is_dir()]
datasets_directories = [entry.path for entry in dataset_entries]

# The dataset name is the directory's own name (last path component), listed alphabetically
datasets_names = sorted(entry.name for entry in dataset_entries)

#### Source for the Student's t-test: https://machinelearningmastery.com/how-to-code-the-students-t-test-from-scratch-in-python/

In [20]:
regressor_names = ['DecisionTree', 'Khiops', 'LinearRegression', 'RandomForest', 'XGBoost']

# Significance level of the two-sided paired t-test
alpha = 0.05


def _paired_t_test_outcome(base_rmse, new_rmse, alpha):
    """Classify a paired comparison of per-split RMSEs with a two-sided paired Student's t-test.

    Parameters
    ----------
    base_rmse, new_rmse : array-like of float
        RMSE of each split for the baseline and for the new approach.
        Element i of both inputs is treated as one pair.
    alpha : float
        Significance level.

    Returns
    -------
    1      the means differ significantly and the new RMSE mean is lower (victory)
    -1     the means differ significantly and the baseline RMSE mean is lower (defeat)
    0      the null hypothesis of equal means is not rejected (equality)
    'n/a'  the test cannot be computed (fewer than two pairs, or NaN values)

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of observations.
    """
    base_rmse = np.asarray(base_rmse, dtype=float)
    new_rmse = np.asarray(new_rmse, dtype=float)

    # A paired test needs exactly one observation of each kind per split
    if base_rmse.shape != new_rmse.shape:
        raise ValueError('Paired t-test needs samples of the same size, got {} and {}'.format(
            base_rmse.size, new_rmse.size))

    # Number of paired samples
    n = base_rmse.size
    if n < 2:
        # The standard deviation of the differences is undefined with a single pair
        return 'n/a'

    differences = base_rmse - new_rmse
    mean_difference = differences.mean()

    # Sample standard deviation of the paired differences.
    # Computed from deviations to the mean: the sum-of-squares shortcut can turn
    # slightly negative through rounding, which makes math.sqrt raise.
    sd = differences.std(ddof=1)

    if sd == 0:
        if mean_difference == 0:
            # Both runs produced identical RMSEs on every split: a tie, not a victory
            return 0
        # Every split differs by exactly the same non-zero amount: |t| is infinite
        p = 0.0
    else:
        # Standard error of the mean difference
        sed = sd / math.sqrt(n)

        # t statistic and degrees of freedom
        t_stat = mean_difference / sed
        dof = n - 1

        # Two-sided p-value; sf(x) is 1 - cdf(x) without the cancellation error
        p = 2.0 * scipy.stats.t.sf(abs(t_stat), dof)

    if math.isnan(p):
        # NaN RMSE values: the test is undefined, do not report it as a result
        return 'n/a'

    if p > alpha:
        # Accept the null hypothesis that the means are equal
        return 0

    # Reject the null hypothesis. Lower RMSE is better, so a negative mean
    # difference (base - new) means the baseline wins.
    return -1 if mean_difference < 0 else 1


# NOTE: names are cut to 12 characters, so different datasets may share the same label
metrics_dict = {'dataset_name': [e[:12] for e in datasets_names]}

for regressor_name in regressor_names:
    metrics_dict[regressor_name] = []

    for dataset_name in datasets_names:
        base_df_path = os.path.join(metrics_path, dataset_name, '2_bins_equal_freq_below_threshold', 'Standard', regressor_name + '_regressor', 'metrics_normal.csv')
        new_df_path = os.path.join(metrics_path, dataset_name, '32_bins_equal_freq_below_threshold', 'RandomForest_classifier', regressor_name + '_regressor', 'metrics_extracted_features.csv')

        # If a result file is missing, record 'n/a' instead of victory/equality/defeat
        if not (os.path.isfile(base_df_path) and os.path.isfile(new_df_path)):
            metrics_dict[regressor_name].append('n/a')
            continue

        # RMSE of each split, for the baseline and for the new approach
        base_rmse_population = np.array(pd.read_csv(base_df_path)['test_root_mean_squared_error'])
        new_rmse_population = np.array(pd.read_csv(new_df_path)['test_root_mean_squared_error'])

        metrics_dict[regressor_name].append(
            _paired_t_test_outcome(base_rmse_population, new_rmse_population, alpha))

print(pd.DataFrame(metrics_dict))

    dataset_name  DecisionTree Khiops  LinearRegression  RandomForest  XGBoost
0   3D_Road_Netw             1      1                 1             1        1
1   Air_Quality_             1      1                 1             0        0
2   Airfoil_Self             1      1                 1             0        0
3   Appliances_e             1      1                 1             1        1
4   Beijing_PM2.             1      1                 1             1        1
5   Bias_correct             1      1                 1             1        1
6   Bike_Sharing             1      1                 1            -1       -1
7   BlogFeedback             0      1                 1            -1       -1
8   Buzz_in_soci             1      1                 1            -1       -1
9   Combined_Cyc             1      1                 1             0        0
10  Communities_             1      0                 0             0        0
11  Communities_             1      1               

In [24]:
# Label printed for each outcome code stored in metrics_dict
outcome_labels = (('defeats', -1), ('equalities', 0), ('victories', 1))

# One header line per regressor, followed by how often each outcome code occurs
for regressor_name in regressor_names:
    outcomes = metrics_dict[regressor_name]
    print(regressor_name)
    print(*[label + ' = ' + str(outcomes.count(code)) for label, code in outcome_labels])

DecisionTree
defeats = 3 equalities = 2 victories = 30
Khiops
defeats = 2 equalities = 2 victories = 30
LinearRegression
defeats = 0 equalities = 4 victories = 31
RandomForest
defeats = 9 equalities = 17 victories = 9
XGBoost
defeats = 11 equalities = 15 victories = 9


In [22]:
# Write the outcome table next to the metrics folders, without the row index
result_path = os.path.join(metrics_path, 'result.csv')
result_table = pd.DataFrame(metrics_dict)
result_table.to_csv(result_path, index=False)