# init

In [1]:
from quantifiers.ACC import ACC
from quantifiers.dys_method import dys_method
from quantifiers.MS import MS_method

from utils.getTrainingScores import getTrainingScores
from utils.getTPRFPR import getTPRFPR
from utils.applyquantifiers import apply_quantifier
from utils.fitQuantifierSchumacherGithub import fitQuantifierSchumacherGithub

import pdb
import quapy as qp
import os
import numpy as np
import pandas as pd
import joblib
from scipy.io.arff import loadarff
from pprint import pprint
import matplotlib.pyplot as plt
import time
import math

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import CalibrationDisplay
from sklearn.linear_model import LogisticRegression

# Locations of the raw datasets, the pre-split train/test CSV files, the
# meta-features table and the per-quantifier experiment tables.
path = './datasets/'
train_data_path = './train_data/'
test_data_path = './test_data/'
meta_table_path = './metafeatures/meta-features-table.csv'
experiment_tables_path = './experiment_tables/'

# Names of the quantification methods evaluated for every dataset.
counters = [
    'CC', 'ACC', 'PACC', 'PCC', 'SMM', 'HDy', 'DyS',
    'SORD', 'MS', 'MS2', 'MAX', 'X', 'T50',
]

# Dataset file names come from the 'dataset_name' column of the index table;
# pop() also removes that column from the index frame.
# files = os.listdir(path)
meta_features_table_index = pd.read_csv('./metafeatures/meta-features-table-index.csv')
files = meta_features_table_index.pop('dataset_name').tolist()

# Placeholders, assigned by later cells.
experiment_tables_dict = None
processed_datasets_df = None

# counters = ["CC","ACC","SMM","HDy","DyS","SORD","MS","MS2","MAX","X","T50","PCC","PACC","GAC","GPAC","FM"]

# preprocess

In [2]:
# i = 0
# dataframe = None
# X = None
# y = None
# X_list = []
# y_list = []

# for f in files:
#   df = pd.read_csv(path + f)
#   df = df.dropna()
  
#   y = df.pop(df.columns[-1])
#   X = df

#   y_list.append(y.to_numpy())
#   X_list.append(X.to_numpy())

#   i += 1
# i = 0

In [3]:
# Load the train/test split of every dataset that has not been processed yet.
#
# The four *_list variables and `files` are kept in lockstep: after this cell
# has run, index k refers to the same dataset in all of them.
i = 0
dataframe = None
X_train = None
y_train = None
X_test = None
y_test = None
X_train_list = []
y_train_list = []
X_test_list = []
y_test_list = []


# Datasets recorded here were completed by a previous run and are skipped.
if os.path.isfile(experiment_tables_path + 'processed_datasets.csv'):
    processed_datasets_df = pd.read_csv(experiment_tables_path + 'processed_datasets.csv')
else:
    processed_datasets_df = pd.DataFrame(columns=['dataset'])

already_processed = set(processed_datasets_df['dataset'])
pending_files = []

for f in files:
    if f in already_processed:
        continue

    dataset_stem = str(f.split('.csv')[0])

    # TRAIN DATA: the last column is the label, everything else is a feature.
    df_train = pd.read_csv(train_data_path + dataset_stem + '-TRAIN.csv')
    df_train = df_train.dropna()

    y_train = df_train.pop(df_train.columns[-1])
    X_train = df_train

    # TEST DATA: same layout as the training file.
    df_test = pd.read_csv(test_data_path + dataset_stem + '-TEST.csv')
    df_test = df_test.dropna()

    y_test = df_test.pop(df_test.columns[-1])
    X_test = df_test

    # Append only once both files were read so the lists never get out of step.
    y_train_list.append(y_train.to_numpy())
    X_train_list.append(X_train.to_numpy())
    y_test_list.append(y_test.to_numpy())
    X_test_list.append(X_test.to_numpy())
    pending_files.append(f)

# Skipped datasets leave no entry in the data lists, so `files` must be
# narrowed to the pending names as well. Otherwise files[k] and
# X_train_list[k] would refer to different datasets after a resumed run.
files = pending_files
i = 0

In [4]:
# Meta-features table read from `meta_table_path`.
meta_features_table = pd.read_csv(meta_table_path)

if not os.path.isfile('./metafeatures/meta-table.csv'):
    # No saved meta-table yet: start an empty one whose columns are the
    # meta-feature columns followed by one 'arr_<quantifier>' column per
    # entry of `counters`.
    meta_table_columns = meta_features_table.columns.tolist()
    meta_table_columns.extend('arr_' + counter for counter in counters)
    meta_table = pd.DataFrame(columns=meta_table_columns)
else:
    # Continue from the meta-table written by a previous run.
    meta_table = pd.read_csv('./metafeatures/meta-table.csv')

In [5]:
def load_experiment_tables():
    """Return one experiment table per quantifier listed in ``counters``.

    For each quantifier the file
    ``<experiment_tables_path>experiment_table_<quantifier>.csv`` is read when
    it exists; otherwise an empty table with the result columns is created.

    Returns:
        dict mapping quantifier name to a ``pandas.DataFrame``.
    """
    result_columns = ['dataset_name', 'alpha', 'sample_size', 'real_p', 'pred_p', 'abs_error', 'run_time']

    tables = {}
    for quantifier in counters:
        csv_file = experiment_tables_path + 'experiment_table_' + quantifier + '.csv'
        if os.path.isfile(csv_file):
            tables[quantifier] = pd.read_csv(csv_file)
        else:
            tables[quantifier] = pd.DataFrame(columns=result_columns)

    return tables

def save_experiment_tables(experiment_tables_dict, experiment_tables_path):
    """Write every experiment table to its own CSV file.

    Each table is saved as
    ``<experiment_tables_path>experiment_table_<key>.csv`` without the index
    column. The directory part of ``experiment_tables_path`` is created when
    it does not exist yet, so a first run does not fail on the missing folder.

    Args:
        experiment_tables_dict: mapping of quantifier name to DataFrame.
        experiment_tables_path: path prefix (normally a directory ending in a
            separator) that is concatenated in front of each file name.
    """
    # The prefix is joined by plain string concatenation, so the target
    # directory is whatever precedes its last path separator.
    target_dir = os.path.dirname(experiment_tables_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    for key, experiment_table in experiment_tables_dict.items():
        experiment_table.to_csv(experiment_tables_path + 'experiment_table_' + key + '.csv', index=False)

In [6]:
# meta_table_columns = meta_features_table.columns.tolist()
# for counter in counters:
#     meta_table_columns.append('arr_' + counter)
# meta_table_columns

# meta-features

In [7]:
def run_experiment(X_train, y_train, X_test, y_test, dataset_name):
    """Evaluate every quantifier in ``counters`` on resampled test sets.

    A classifier is fitted on the training data, then artificial test samples
    are drawn (with replacement) from the positive (label 1) and negative
    (label 0) test rows for each sample size and each target positive
    proportion ``alpha``. Every quantifier is applied to every sample and one
    result row per application is appended to the module-level
    ``experiment_tables_dict[quantifier]`` in the order
    ``dataset_name, alpha, sample_size, real_p, pred_p, abs_error, run_time``.

    Rows are buffered and committed only after all samples were evaluated, so
    an exception part-way through leaves ``experiment_tables_dict`` untouched.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: test features (anything ``pandas.DataFrame`` accepts).
        y_test: test labels, aligned with ``X_test``.
        dataset_name: name used to look up
            ``./estimator_parameters/<dataset_name>.joblib`` and written to the
            ``dataset_name`` column of the result rows.

    Returns:
        An empty DataFrame with columns ``quantifier``, ``abs-error`` and
        ``execution-time``; nothing is written to it.
    """
    # Reuse the estimator saved for this dataset when it can be loaded;
    # otherwise fall back to a default logistic regression. Only ordinary
    # errors trigger the fallback, so Ctrl-C still interrupts the run.
    try:
        clf = joblib.load('./estimator_parameters/' + dataset_name + '.joblib')
        clf.n_jobs = -1
    except Exception:
        clf = LogisticRegression(random_state=42, n_jobs=-1)

    calib_clf = CalibratedClassifierCV(clf, cv=3, n_jobs=-1)
    calib_clf.fit(X_train, y_train)

    # Training scores and the TPR/FPR table derived from them are computed
    # before the final fit of `clf` on the full training set.
    scores = getTrainingScores(X_train, y_train, 10, clf)[0]
    tprfpr = getTPRFPR(scores)
    clf.fit(X_train, y_train)

    niterations = 10  # replicates per (sample_size, alpha) pair
    batch_sizes = [100]  # test sample sizes
    alpha_values = [round(x, 2) for x in np.linspace(0, 1, 20)]  # positive class proportions

    pos_scores = scores[scores["class"] == 1]["scores"]
    neg_scores = scores[scores["class"] == 0]["scores"]

    # Rebuild a single test frame whose last column holds the labels.
    X_test = pd.DataFrame(X_test)
    y_test = pd.DataFrame(y_test, columns=[str(len(X_test.columns))])
    df_test = pd.concat([X_test, y_test], axis=1)
    label_column = df_test.columns[-1]

    # Positive examples carry label 1 (the original note says this was 0 before).
    df_test_pos = df_test.loc[df_test[label_column] == 1]
    df_test_neg = df_test.loc[df_test[label_column] == 0]

    table = pd.DataFrame(columns=['quantifier', 'abs-error', 'execution-time'])
    pending_rows = {quantifier: [] for quantifier in counters}

    for sample_size in batch_sizes:  # varying test set sizes
        for alpha in alpha_values:  # varying positive class proportion
            # Class counts depend only on the size and the proportion.
            pos_size = int(round(sample_size * alpha, 2))
            neg_size = sample_size - pos_size

            for _ in range(niterations):
                # NOTE: let Prof. Andre know that this only worked with
                # replace=True (presumably some test sets have fewer rows of
                # a class than requested -- confirm).
                sample_test_pos = df_test_pos.sample(pos_size, replace=True)
                sample_test_neg = df_test_neg.sample(neg_size, replace=True)
                sample_test = pd.concat([sample_test_pos, sample_test_neg])

                test_label = sample_test[label_column]
                test_sample = sample_test.drop([label_column], axis=1)  # features only
                te_scores = clf.predict_proba(test_sample)[:, 1]  # test sample scores

                # Actual positive prevalence of the generated sample.
                n_pos_sample_test = list(test_label).count(1)
                calcultd_pos_prop = round(n_pos_sample_test / len(sample_test), 2)

                for quantifier in counters:
                    # The QuaPy / external quantifier inputs are not used here.
                    te_quapy = None
                    external_qnt = None

                    # perf_counter is the monotonic, high-resolution clock
                    # intended for measuring elapsed time.
                    start = time.perf_counter()
                    pred_pos_prop = apply_quantifier(qntMethod=quantifier,
                                                     clf=calib_clf,
                                                     scores=scores,
                                                     p_score=pos_scores,
                                                     n_score=neg_scores,
                                                     train_labels=None,
                                                     test_score=te_scores,
                                                     TprFpr=tprfpr,
                                                     thr=0.5,
                                                     measure='hellinger',
                                                     test_data=test_sample,
                                                     test_quapy=te_quapy,
                                                     external_qnt=external_qnt)
                    t = time.perf_counter() - start

                    pred_pos_prop = np.round(pred_pos_prop, 2)  # predicted class proportion
                    abs_error = round(abs(calcultd_pos_prop - pred_pos_prop), 2)  # absolute error

                    pending_rows[quantifier].append([dataset_name,
                                                     alpha,
                                                     sample_size,
                                                     calcultd_pos_prop,
                                                     pred_pos_prop,
                                                     abs_error,
                                                     t])

    # Commit all rows in one step per quantifier: a single concat instead of
    # one frame-copying append per row, and no partial results on failure.
    for quantifier, rows in pending_rows.items():
        if not rows:
            continue
        existing = experiment_tables_dict[quantifier]
        new_rows = pd.DataFrame(rows, columns=existing.columns)
        if existing.empty:
            experiment_tables_dict[quantifier] = new_rows
        else:
            experiment_tables_dict[quantifier] = pd.concat([existing, new_rows], ignore_index=True)

    return table

# fold

In [8]:
# skip_count = 0
# table = None

# result = {}
# for counter in counters:
#     result[counter] = []

# file = open('log.txt', 'w')
# file.close()

# result_index = 0
# for i in range(len(meta_table), len(X_train_list)):
#   try:
#     table = run_experiment(X_train_list[i], y_train_list[i], X_test_list[i], y_test_list[i], str(files[i].split('.csv')[0]))

#     table = table.groupby('quantifier')['abs-error', 'execution-time'].aggregate('mean')

#     alpha = 0
#     for key in counters:
#       try:
#         sum = 0
#         for k in counters:
#           if k != key:
#             sum += ((1 - table['abs-error'][key]) / (1 - table['abs-error'][k]))
#             # sum += ((1 - table['abs-error'][key]) / (1 - table['abs-error'][k])) / (1 + alpha * math.log((table['execution-time'][key] / table['execution-time'][k])))
#         arr = sum / (len(counters) - 1)
#         result[key].append(arr)
#       except:
#         result[key].append(-1)    

#     row = meta_features_table.iloc[i].tolist()
#     for key in result:
#       row.append(result[key][result_index])
#     meta_table.loc[len(meta_table.index)] = row
    
#     meta_table.to_csv('./metafeatures/meta-table.csv', index = False)

#     result_index += 1
#     print('Finished ' + str(i))
#   except Exception as e:
#     print('Skipping ' + str(i) + '...\t\t\t' + str(e))
#     # for key in meta_table_dict:
#     #   meta_table_dict[key].drop(i, inplace = True)
#     skip_count += 1
    
#     file = open('log.txt', 'a')
#     file.write('Skipping ' + str(i) + '...\t\t\t' + str(e) + '\n')
#     file.write('Dataset: ' + str(files[i]) + '\n')
#     file.write('table[abs-error][key]:\n' + str(table['abs-error'][key]) + '\n')
#     file.write('table[abs-error][k]:\n' + str(table['abs-error'][k]) + '\n')
#     file.write('table[execution-time][key]:\n' + str(table['execution-time'][key]) + '\n')
#     file.write('table[execution-time][k]:\n' + str(table['execution-time'][k]) + '\n')
#     file.write(str(table['execution-time'][key] / table['execution-time'][k]) + '\n')
#     file.write('\n')
#     file.close()


#   # # TEST
#   # if i == 6:
#   #   break

# # for key in meta_table_dict:
# #   meta_table_dict[key]['arr'] = result[key]

# # for key in result:
# #     meta_features_table[('arr_' + key)] = result[key]

# print('\n\nSkipped ' + str(skip_count) + ' dataset(s)')

# generate experiment tables

In [9]:
# Run the experiment for every loaded dataset, persisting results after each
# one so that an interrupted run can be resumed.
experiment_tables_dict = load_experiment_tables()

skip_count = 0
table = None

result = {}
for counter in counters:
    result[counter] = []

# Start every run with an empty log file.
with open('log.txt', 'w') as file:
    pass

result_index = 0
i = 0
for i in range(len(X_train_list)):

    try:
        run_experiment(X_train_list[i], y_train_list[i], X_test_list[i], y_test_list[i], str(files[i].split('.csv')[0]))
        save_experiment_tables(experiment_tables_dict, experiment_tables_path)
        # Record the dataset as done only after its results were written.
        processed_datasets_df.loc[len(processed_datasets_df)] = files[i]
        processed_datasets_df.to_csv(experiment_tables_path + 'processed_datasets.csv', index=False)
        # The context manager closes the log even if a write fails.
        with open('log.txt', 'a') as file:
            file.write('Finished ' + str(i) + '\n')
            file.write('\n')
    except Exception as e:
        # A failing dataset is logged and skipped; the remaining ones still run.
        skip_count += 1
        with open('log.txt', 'a') as file:
            file.write('Skipping ' + str(i) + '...\t\t\t' + str(e) + '\n')
            file.write('Dataset: ' + str(files[i]) + '\n')
            file.write('\n')

    # # TEST
    # if i == 2:
    #   break

print('\n\nSkipped ' + str(skip_count) + ' dataset(s)')

  Fold #1
Training_len 3193
SCORES_Length: 320
  Fold #2
Training_len 3193
SCORES_Length: 640
  Fold #3
Training_len 3193
SCORES_Length: 960
  Fold #4
Training_len 3193
SCORES_Length: 1279
  Fold #5
Training_len 3193
SCORES_Length: 1598
  Fold #6
Training_len 3193
SCORES_Length: 1917
  Fold #7
Training_len 3193
SCORES_Length: 2236
  Fold #8
Training_len 3193
SCORES_Length: 2555
  Fold #9
Training_len 3193
SCORES_Length: 2874
  Fold #10
Training_len 3193
SCORES_Length: 3193
  Fold #1
Training_len 258
SCORES_Length: 26
  Fold #2
Training_len 258
SCORES_Length: 52
  Fold #3
Training_len 258
SCORES_Length: 78
  Fold #4
Training_len 258
SCORES_Length: 104
  Fold #5
Training_len 258
SCORES_Length: 130
  Fold #6
Training_len 258
SCORES_Length: 156
  Fold #7
Training_len 258
SCORES_Length: 182
  Fold #8
Training_len 258
SCORES_Length: 208
  Fold #9
Training_len 258
SCORES_Length: 233
  Fold #10
Training_len 258
SCORES_Length: 258
  Fold #1
Training_len 1020
SCORES_Length: 102
  Fold #2
Trainin