In [1]:
import numpy
import pandas as pd
from scipy.stats import kendalltau
from codebase.meta_model import preprocessing, label_feature_split, analyse_generic_models_regression, split_dataset
from codebase.generic_processing import dataset_name_generator

# Parameters passed to dataset_name_generator to identify the dataset
# collection to load, plus the column used as the regression label.
NOISE = 0.2
N_SAMPLES = 1000
NUMBER_DATASETS = 10
target_feature = 'MAE'

# Seed NumPy's global RNG before any project helper runs.
# NOTE(review): presumably the helpers draw from numpy.random - confirm.
numpy.random.seed(42)

dataset_name = dataset_name_generator(
    noise=NOISE,
    n_samples=N_SAMPLES,
    number_datasets=NUMBER_DATASETS,
)
# CSV file holding the collection, and the common prefix to which the loop
# further down appends an integer index.
directory = f'toy_datasets/ran_datasets/{dataset_name}.csv'
dataset_names = f'{dataset_name}_seed_'

dataset_df = pd.read_csv(directory)
# 'MSE', 'r2', 'mean' and 'actual_sum' are handed over as bad_columns.
# NOTE(review): presumably preprocessing drops them - verify in
# codebase.meta_model.
processed_features_df = preprocessing(
    dataset_df,
    target_feature,
    bad_columns=['MSE', 'r2', 'mean', 'actual_sum'],
    using_xgboost=False,
)

# Per-dataset results, appended in loop order.
kendalltau_results = []
coefficient_results = []
# For each dataset index: split into train/validation, separate features from
# the target, run the regression analysis, then compare the ordering implied by
# the predicted results with the ordering implied by the actual results.
# NOTE(review): the range is inclusive of NUMBER_DATASETS, i.e. it performs
# NUMBER_DATASETS + 1 iterations - confirm that the extra index is intended.
for i in range(NUMBER_DATASETS + 1):
    train_df, validation_df = split_dataset(processed_features_df, dataset_names + str(i))
    X_train, y_train = label_feature_split(train_df, target_feature)
    X_validation, y_validation = label_feature_split(validation_df, target_feature)
    analyse_df, correlation_coefficient = analyse_generic_models_regression(
        X_train, y_train, X_validation, y_validation)
    coefficient_results.append(correlation_coefficient)

    # Rank = position of the row after sorting by the respective column.
    # The column assignment is positional, so each row receives the position
    # it holds in the freshly sorted frame.
    analyse_df = analyse_df.sort_values(by=['predicted_result'])
    analyse_df['predicted_rankings'] = numpy.arange(len(analyse_df))
    analyse_df = analyse_df.sort_values(by=['actual_result'])
    analyse_df['actual_rankings'] = numpy.arange(len(analyse_df))

    # kendalltau returns (statistic, p-value); only the statistic is kept.
    tau, _p_value = kendalltau(analyse_df['predicted_rankings'], analyse_df['actual_rankings'])
    kendalltau_results.append(tau)

# Print the report one item per line: a heading followed by every correlation
# coefficient, a heading followed by every Kendall tau value, then a closing
# marker.
report_lines = [
    'The correlation coefficients',
    *coefficient_results,
    'The Kendall Tau ranking',
    *kendalltau_results,
    'section completed',
]
for report_line in report_lines:
    print(report_line)


The correlation coefficients
0.9368695289869061
0.9195554742919462
0.9744109164765465
0.8199899309610421
0.841324144349242
0.4801510811872431
0.8767118157101209
0.8581332833851055
0.9423822300036238
0.872792923109033
0.89777073903019
The Kendall Tau ranking
0.7333333333333333
0.7777777777777777
0.8666666666666666
0.4666666666666666
0.6888888888888888
0.4666666666666666
0.6444444444444444
0.5555555555555555
0.5555555555555555
0.4666666666666666
0.6444444444444444
section completed
