# init

In [1]:
from quantifiers.ACC import ACC
from quantifiers.dys_method import dys_method
from quantifiers.MS import MS_method

from utils.getTrainingScores import getTrainingScores
from utils.getTPRFPR import getTPRFPR
from utils.applyquantifiers import apply_quantifier
from utils.fitQuantifierSchumacherGithub import fitQuantifierSchumacherGithub

import pdb
import quapy as qp
import os
import numpy as np
import pandas as pd
import joblib
from scipy.io.arff import loadarff
from pprint import pprint
import matplotlib.pyplot as plt
import time
import math

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import CalibrationDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Directories holding the full datasets and their TRAIN / TEST CSV splits.
path = "./datasets/"
train_data_path = "./train_data/"
test_data_path = "./test_data/"

# Meta-feature table location and the directory of experiment_table_<name>.csv files.
meta_table_path = './metafeatures/meta-features-table.csv'
experiment_tables_path = './experiment_tables/'

# The index table lists the datasets to process: its 'dataset_name' column is
# removed from the frame and kept as a plain Python list.
meta_features_table_index = pd.read_csv('./metafeatures/meta-features-table-index.csv')
files = meta_features_table_index.pop('dataset_name').tolist()

In [2]:
def load_experiment_tables(tables_path=None):
    """Load one experiment table per algorithm, creating empty ones as needed.

    For every algorithm name, ``experiment_table_<name>.csv`` is looked up in
    the tables directory. An existing file is read with ``pd.read_csv``; a
    missing one is replaced by an empty DataFrame with the result columns.

    Args:
        tables_path: Directory containing the CSV files. When omitted, the
            module-level ``experiment_tables_path`` is used.

    Returns:
        dict mapping each algorithm name to its DataFrame, in the order the
        names are listed below.
    """
    if tables_path is None:
        tables_path = experiment_tables_path

    algorithms = ['CC', 'ACC', 'PACC', 'PCC', 'SMM', 'HDy', 'DyS', 'SORD',
                  'MS', 'MS2', 'MAX', 'X', 'T50']
    columns = ['dataset_name', 'alpha', 'sample_size', 'real_p', 'pred_p',
               'abs_error', 'run_time']

    exp_tables_dict = {}
    for key in algorithms:
        # Build the path once; os.path.join also tolerates a directory given
        # without a trailing separator.
        csv_file = os.path.join(tables_path, 'experiment_table_' + key + '.csv')
        if os.path.isfile(csv_file):
            exp_tables_dict[key] = pd.read_csv(csv_file)
        else:
            exp_tables_dict[key] = pd.DataFrame(columns=columns)
    return exp_tables_dict

# Load (or initialise) one experiment table per key.
experiment_tables_dict = load_experiment_tables()
# Table keys, in the dictionary's insertion order.
algList = [*experiment_tables_dict]
# One None placeholder per table key.
df_dict = dict.fromkeys(experiment_tables_dict)

# preprocess

In [3]:
def _read_features_and_labels(csv_path):
    """Read a CSV, drop rows with missing values, and split off the last column.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        Tuple ``(features, labels)``: the DataFrame without its last column,
        and that last column as a Series.
    """
    frame = pd.read_csv(csv_path).dropna()
    labels = frame.pop(frame.columns[-1])
    return frame, labels


# Placeholders so these names exist even when `files` is empty.
dataframe = None
X = None
y = None
X_train = None
y_train = None
X_test = None
y_test = None

# Per-dataset NumPy arrays, all in the same order as `files`.
X_list = []
y_list = []
X_train_list = []
y_train_list = []
X_test_list = []
y_test_list = []

for f in files:
    # Dataset name without the '.csv' extension, shared by the three files.
    dataset = f.split('.csv')[0]

    # ALL DATA
    df, y = _read_features_and_labels(path + dataset + '.csv')
    X = df
    y_list.append(y.to_numpy())
    X_list.append(X.to_numpy())

    # TRAIN DATA
    df_train, y_train = _read_features_and_labels(train_data_path + dataset + '-TRAIN.csv')
    X_train = df_train
    y_train_list.append(y_train.to_numpy())
    X_train_list.append(X_train.to_numpy())

    # TEST DATA
    df_test, y_test = _read_features_and_labels(test_data_path + dataset + '-TEST.csv')
    X_test = df_test
    y_test_list.append(y_test.to_numpy())
    X_test_list.append(X_test.to_numpy())

# The original loop counter was never read; `i` is still left at 0 as before.
i = 0

# run classifiers

In [4]:
def run_clf(X, y):
    """Cross-validate a logistic-regression classifier on ``(X, y)``.

    The model is evaluated with ``cross_validate`` using ``cv=5`` and the
    accuracy, precision, recall, F1 and ROC-AUC scorers.

    Args:
        X: Feature matrix passed to ``cross_validate``.
        y: Target labels passed to ``cross_validate``.

    Returns:
        The result object returned by ``cross_validate``.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    model = LogisticRegression(random_state=42, n_jobs=-1)
    return cross_validate(model, X, y, scoring=metrics, cv=5)

In [5]:
def _mean_std_summary(results, keys):
    """Return ``[mean, std, mean, std, ...]`` of ``results[key]`` for each key.

    Every value is rounded to two decimals.
    """
    summary = []
    for key in keys:
        summary.append(round(results[key].mean(), 2))
        summary.append(round(results[key].std(), 2))
    return summary


classification_results = pd.DataFrame(columns=['dataset',
                                               'mean_fit_time',
                                               'std_fit_time',
                                               'mean_score_time',
                                               'std_score_time',
                                               'mean_accuracy',
                                               'std_accuracy',
                                               'mean_precision',
                                               'std_precision',
                                               'mean_recall',
                                               'std_recall',
                                               'mean_f1',
                                               'std_f1',
                                               'mean_roc_auc',
                                               'std_roc_auc'])

# Keys of the cross-validation results, in the same order as the
# mean_/std_ column pairs above.
cv_result_keys = ['fit_time',
                  'score_time',
                  'test_accuracy',
                  'test_precision',
                  'test_recall',
                  'test_f1',
                  'test_roc_auc']

# Create the output directory before the expensive loop so the final export
# cannot fail on a missing folder after all the work is done.
classification_results_dir = './classification_results/'
os.makedirs(classification_results_dir, exist_ok=True)

for i, (X_data, y_data) in enumerate(zip(X_list, y_list)):
    dataset = files[i].split('.csv')[0]
    results = run_clf(X_data, y_data)

    row = [dataset] + _mean_std_summary(results, cv_result_keys)
    classification_results.loc[len(classification_results)] = row
    print('Finished ' + str(i) + ': ' + dataset)

classification_results.to_csv(classification_results_dir + 'classification_results_rounded.csv', index = False)

Finished 0: 1043_ada_agnostic
Finished 1: 1048_jEdit_4
Finished 2: 1049_pc4
Finished 3: 1050_pc3
Finished 4: 1054_mc2
Finished 5: 1056_mc1
Finished 6: 1061_ar4
Finished 7: 1065_kc3
Finished 8: 1066_kc1-binary
Finished 9: 1068_pc1
Finished 10: 1069_pc2
Finished 11: 1071_mw1
Finished 12: 1073_jEdit_4
Finished 13: 1075_datatrieve
Finished 14: 1115_teachingAssistant
Finished 15: 1121_badges2
Finished 16: 1167_pc1_req
Finished 17: 11_balance-scale
Finished 18: 12_mfeat-factors
Finished 19: 1456_appendicitis
Finished 20: 1460_banana
Finished 21: 1462_banknote-authentication
Finished 22: 1464_blood-transfusion-service-center
Finished 23: 1465_breast-tissue
Finished 24: 1466_cardiotocography
Finished 25: 1467_climate-model-simulation-crashes
Finished 26: 1473_fertility
Finished 27: 1475_first-order-theorem-proving
Finished 28: 1479_hill-valley
Finished 29: 1484_lsvt
Finished 30: 1485_madelon
Finished 31: 1487_ozone-level-8hr
Finished 32: 1488_parkinsons
Finished 33: 1489_phoneme
Finished 34: 1

In [6]:
# Remove the timing, precision and recall summaries (both their mean_ and
# std_ columns) before exporting the table as HTML.
classification_results = classification_results.drop(
    columns=[
        stat + '_' + metric
        for metric in ('fit_time', 'score_time', 'precision', 'recall')
        for stat in ('mean', 'std')
    ]
)
classification_results.to_html(
    './classification_results/classification_results_rounded.html',
    index=False,
)