# Start

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp
import random
import pdb
import csv
# import pickle as pk

# Folder holding one raw experiment table per quantifier
# (files are named 'experiment_table_<quantifier>.csv', see load_experiment_tables).
experiment_tables_path = './experiment_tables/'
# Placeholders only: both names are assigned their real values in the "Load data" cell.
experiment_tables_dict = None
processed_datasets_df = None

# Directory listing of the datasets folder, taken once when this cell runs.
datasets_path = './datasets/'
files = os.listdir(datasets_path)

# Load data

In [2]:
def load_remove_dataset_list():
    """Return the datasets that must be excluded from the analysis.

    Reads ``./dataset_remove/remover.csv`` (the first line is consumed as the
    header) and collects the first value of every remaining row.

    Returns:
        list: first-column value of each row, in file order.
    """
    remover_rows = pd.read_csv('./dataset_remove/' + 'remover.csv').values.tolist()
    return [row[0] for row in remover_rows]

def remove_dataset_from_experiment_tables_dict(experiment_tables_dict, remove_dataset_list):
    """Drop the rows of every excluded dataset from each experiment table.

    Args:
        experiment_tables_dict: maps a quantifier name to a DataFrame that has
            a 'dataset_name' column. The dict is updated in place.
        remove_dataset_list: dataset names to drop (compared against
            'dataset_name' as-is).

    Returns:
        The same dict object; every table that was filtered is replaced by a
        copy with a fresh 0..n-1 index.
    """
    for unwanted in remove_dataset_list:
        for quantifier in experiment_tables_dict:
            table = experiment_tables_dict[quantifier]
            kept = table[table['dataset_name'] != unwanted]
            kept.reset_index(drop=True, inplace=True)
            experiment_tables_dict[quantifier] = kept
    return experiment_tables_dict

def remove_dataset_from_processed_datasets_df(processed_datasets_df, remove_dataset_list):
    """Return the processed-datasets table without the excluded datasets.

    The 'dataset' column stores file names, so each entry of
    ``remove_dataset_list`` is matched as ``<name>.csv``.

    Args:
        processed_datasets_df: DataFrame with a 'dataset' column.
        remove_dataset_list: dataset names (without the '.csv' suffix) to drop.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. The input frame is never
        modified, including when ``remove_dataset_list`` is empty.
    """
    unwanted_files = [dataset + '.csv' for dataset in remove_dataset_list]
    # One vectorised pass instead of re-filtering the frame once per dataset.
    keep_mask = ~processed_datasets_df['dataset'].isin(unwanted_files)
    return processed_datasets_df[keep_mask].reset_index(drop=True)

def remove_dataset_from_meta_features_table(meta_features_table, remove_dataset_list):
    """Drop the meta-feature rows that belong to the excluded datasets.

    The row of each dataset is looked up in
    ``./metafeatures/meta-features-table-index.csv``: the index label of the
    first row whose 'dataset_name' equals ``<name>.csv`` is the label dropped
    from ``meta_features_table``.

    Args:
        meta_features_table: DataFrame of meta-features.
        remove_dataset_list: dataset names (without the '.csv' suffix).

    Returns:
        The filtered table with a fresh 0..n-1 index.

    Raises:
        IndexError: if a dataset is not present in the index file.
    """
    index_table = pd.read_csv('./metafeatures/meta-features-table-index.csv')

    for unwanted in remove_dataset_list:
        is_unwanted = index_table['dataset_name'] == unwanted + '.csv'
        first_match = index_table.index[is_unwanted].tolist()[0]
        meta_features_table = meta_features_table.drop([first_match])

    meta_features_table.reset_index(drop=True, inplace=True)
    return meta_features_table


# Datasets to exclude from every table below (read from ./dataset_remove/remover.csv).
remove_dataset_list = load_remove_dataset_list()


# Here we load the Experiment Table of each quantifier.
# The Experiment Table contains every execution of the algorithm
# together with its results.
def load_experiment_tables():
    """Load the raw experiment table of every quantifier.

    For each quantifier the file
    ``experiment_tables_path + 'experiment_table_<quantifier>.csv'`` is read
    when it exists; otherwise an empty DataFrame with the expected columns is
    used instead.

    Returns:
        dict: quantifier name -> DataFrame, in the fixed quantifier order below.
    """
    quantifier_names = ['CC', 'ACC', 'PACC', 'PCC', 'SMM', 'HDy', 'DyS', 'SORD', 'MS', 'MAX', 'X']

    exp_tables_dict = {}
    for quantifier in quantifier_names:
        csv_file = experiment_tables_path + 'experiment_table_' + quantifier + '.csv'
        if not os.path.isfile(csv_file):
            exp_tables_dict[quantifier] = pd.DataFrame(columns=['dataset_name', 'alpha', 'sample_size', 'real_p', 'pred_p', 'abs_error', 'run_time'])
            continue
        exp_tables_dict[quantifier] = pd.read_csv(csv_file)

    return exp_tables_dict


# Load every quantifier's raw experiment table and drop the excluded datasets.
experiment_tables_dict = load_experiment_tables()
experiment_tables_dict = remove_dataset_from_experiment_tables_dict(experiment_tables_dict, remove_dataset_list)

# df_dict: quantifier name -> Series with the mean abs_error per dataset_name.
df_dict = {key: None for key in list(experiment_tables_dict.keys())}


for key in experiment_tables_dict:
    df_dict[key] = experiment_tables_dict[key].groupby('dataset_name')['abs_error'].aggregate('mean')


# Meta-features of every dataset (the X of the recommender), without the excluded rows.
meta_features_table = pd.read_csv('./metafeatures/meta-features-table.csv')
meta_features_table = remove_dataset_from_meta_features_table(meta_features_table, remove_dataset_list)

# algList[i] is the quantifier whose meta-table (X, y) is stored in tableList[i].
algList = []
tableList = []
for counter in df_dict.keys():
    algList.append(counter)

    # NOTE(review): y follows the groupby order of 'dataset_name' while X follows the
    # row order of meta-features-table.csv; this assumes both orders agree -- confirm.
    y = df_dict[counter].values

    X = meta_features_table.values
    # Replace NaN / infinite entries in place (np.nan_to_num defaults).
    np.nan_to_num(X, copy=False)
    
    # Clamp every value above the float32 maximum down to that maximum.
    row, column = np.where(X > np.finfo(np.float32).max)
    for i in range(len(row)):
        X[row[i]][column[i]] = np.finfo(np.float32).max

    tableList.append((X, y))

processed_datasets_df = pd.read_csv('./experiment_tables/processed_datasets.csv')
processed_datasets_df = remove_dataset_from_processed_datasets_df(processed_datasets_df, remove_dataset_list)

# File names (with the '.csv' suffix) of the datasets that remain.
datasets = processed_datasets_df['dataset'].tolist()

# Write one summarized table (mean abs_error per dataset) per quantifier.
# NOTE(review): MAE values and `datasets` are paired by position -- same ordering
# assumption as above; confirm.
for key in df_dict.keys():
    df = pd.DataFrame({'MAE': df_dict[key].to_list(), 'Dataset': datasets})
    df.to_csv('./experiment_tables/summarized/experiment_table_' + str(key) + '.csv', index = False)
# processed_datasets_df = pd.read_csv('./experiment_tables/processed_datasets.csv')

# Train & test recommender with Leave-one-out

This code evaluates the recommender (regressor) with Leave-one-out

* If this section is commented, just un-comment and run (it's going to take a while)

In [3]:
# Regressor ~ Recommender
# NOTE(review): no random_state is passed, so the fitted forests (and the saved
# recommendation table) can differ between runs.
rf_reg = RandomForestRegressor()

# Number of rows of the first meta-table, i.e. the number of datasets evaluated.
instance_len = len(tableList[0][0])
# rf_results: quantifier name -> list of [true mean abs_error, predicted abs_error],
# one entry per left-out dataset.
rf_results = {}

# tableList contains the meta-table (X, y) from 
# each quantifier
#
# this loop will train and test the RF regressor (recommender)
# using the meta-table (X, y) with Leave-One-Out
j = 0
for (X, y) in tableList:
  rf_results_list = []
  algName = algList[j]
  j += 1

  # LEAVE-ONE-OUT
  for i in range(0, len(X)):
    # Train on every row except row i ...
    X_train = np.delete(X, i, 0)
    y_train = np.delete(y, i, 0)

    # ... and predict the error for row i (reshaped to a single-sample 2-D array).
    X_test = X[i]
    X_test = X_test.reshape(1, -1)
    y_test = y[i]

    rf_reg.fit(X_train, y_train)
  
    rf_abs_error = rf_reg.predict(X_test)


    rf_results_list.append([y_test, rf_abs_error[0]])


  rf_results[algName] = rf_results_list


# A results table (named 'recommendation table') is constructed
# for the regressor (recommender)

# # # RANDOM FORESTS
# # #
data = []
# Column layout: for each quantifier its true and predicted abs-error, then the
# ideal / recommended summary columns, then one 'rank-<n>' column per quantifier.
cols = []
for key in rf_results:
  cols.append('abs-error-'+key)
  cols.append('abs-error-'+key+'-predicted')
cols.append('abs-error-ideal')
cols.append('quantifier-ideal')
cols.append('quantifier-ideal-num')
cols.append('abs-error-recommended')
cols.append('quantifier-recommended')
cols.append('quantifier-recommended-num')
i = 1
for key in rf_results:
    cols.append('rank-' + str(i))
    i += 1


i = 0
# One output row per dataset (row i of every meta-table).
for i in range(0, instance_len):
  # 2 is the starting value for the running minimum.
  # NOTE(review): presumably abs errors never exceed 1, so any real value beats it -- confirm.
  abs_error_ideal = 2
  quantifier_ideal = 'NULL'
  quantifier_ideal_num = -1
  abs_error_recommended = 2
  quantifier_recommended = 'NULL'
  quantifier_recommended_num = -1
  row = []
  algNum = 0
  # rank: position of the quantifier in algList -> its predicted abs-error.
  rank = {}

  for a in algList:
    row.append(rf_results[a][i][0])
    row.append(rf_results[a][i][1])

    rank[algNum] = rf_results[a][i][1]

    # Ideal = lowest TRUE error; ties keep the earliest quantifier in algList.
    if rf_results[a][i][0] < abs_error_ideal:
      abs_error_ideal = rf_results[a][i][0]
      quantifier_ideal = a
      quantifier_ideal_num = algNum

    # Recommended = lowest PREDICTED error.
    if rf_results[a][i][1] < abs_error_recommended:
      abs_error_recommended = rf_results[a][i][1]
      quantifier_recommended = a
      quantifier_recommended_num = algNum

    algNum += 1
  # List of (algList position, predicted error) pairs, lowest predicted error first.
  rank = sorted(rank.items(), key=lambda item: item[1])

  row.append(abs_error_ideal)
  row.append(quantifier_ideal)
  row.append(quantifier_ideal_num)
  row.append(abs_error_recommended)
  row.append(quantifier_recommended)
  row.append(quantifier_recommended_num)
  # 'rank-1' holds the algList position of the best-ranked quantifier, and so on.
  for key in rank:
    row.append(int(key[0]))

  data.append(row)

rf_table = pd.DataFrame(data, columns = cols)

# # This line saves the results of the leave-one-out evaluation of the recommender:
rf_table.to_csv("./recommendation/recommendation_table_rf.csv", index = False)

# Load Recommendation Table

In [4]:
# We load the Recommendation Table generated in the previous step.
# Reading it from disk lets the following cells run without repeating the
# leave-one-out evaluation, provided the CSV already exists.
rf_table = pd.read_csv("./recommendation/recommendation_table_rf.csv")

# Friedman Test + Post-hoc with <s>Holm Procedure</s> <s>Nemenyi</s> Conover and p-adjust = Holm

This code constructs the optimal quantifier set for each dataset using a non-parametric Friedman test followed by a post-hoc test with <s>Holm Procedure</s> <s>Nemenyi</s> Conover and p-adjust = Holm.

In [5]:
# optimal_dict -> for each dataset (key) there is a set of optimal (adequate) quantifiers (for that problem)
# 
# test_dict -> for each dataset (key) there is a dataframe containing the MAE of each quantifier in that problem
# (you can run this code and print 'test_dict' for clarification)
test_dict = {}
optimal_dict = {}
# Keys are dataset names without the '.csv' suffix.
for d in datasets:
    test_dict[d.split('.csv')[0]] = None
    optimal_dict[d.split('.csv')[0]] = None

for dataset in test_dict.keys():
    quantifier_error_dict = {key: [] for key in algList}

    for alg in algList:
        # Mean abs_error of this quantifier on this dataset, one value per alpha.
        temp_df = experiment_tables_dict[alg].loc[experiment_tables_dict[alg]['dataset_name'] == dataset]
        temp_df = temp_df.groupby('alpha').mean(numeric_only = True)
        temp_df = temp_df.reset_index()
        quantifier_error_dict[alg] = temp_df['abs_error'].tolist()
        
    # One column per quantifier, one row per alpha.
    test_dict[dataset] = pd.DataFrame()
    for key in quantifier_error_dict.keys():
        test_dict[dataset][key] = quantifier_error_dict[key]

# Here we construct the optimal quantifier set for each dataset
#
# The best quantifier is selected (lower error) and then every quantifier
# that is similar is also included (through a Friedman Test + Conover post-hoc with Holm p-adjust)
test_index = 0
for dataset in test_dict.keys():
    # groups[k] = per-alpha errors of the k-th quantifier column.
    groups = []
    for counter in test_dict[dataset].columns.tolist():
        groups.append(test_dict[dataset][counter].tolist())
    
    res = friedmanchisquare(*groups)
    if res.pvalue < 0.05:
        # HOLM
        # The quantifier with the lowest mean error over the alphas is the reference.
        best = test_dict[dataset].mean().sort_values().index[0]
        best_index = test_dict[dataset].columns.tolist().index(best)
        opt_set = set()
        opt_set.add(best)

        # res = sp.posthoc_ttest(groups, p_adjust='holm')
        # res = sp.posthoc_nemenyi(groups)
        # Transposed so that rows are alphas and columns are quantifiers.
        res = sp.posthoc_conover_friedman(np.array(groups).T, p_adjust='holm')
    
        index = 0
        # for element in res[best_index+1]:
        # NOTE(review): res is presumably a table of pairwise p-values labelled by column
        # position, so res[best_index] holds best-vs-each-quantifier -- confirm in scikit-posthocs.
        for element in res[best_index]: # CONOVER_FRIEDMAN
            # Not significantly different from the best -> also counts as optimal.
            if element >= 0.05:
                opt_set.add(test_dict[dataset].columns.tolist()[index])
            index += 1
        
        optimal_dict[dataset] = opt_set
    else:
        # The Friedman test found no significant difference: every quantifier is optimal.
        optimal_dict[dataset] = set(algList)

In [6]:
# Now we save optimal_dict as a single pandas dataframe
# This is useful to plot our "ASetOpt Histogram" -> SEE: plots.ipynb
# Layout: a 'dataset' column followed by one 0/1 column per quantifier in algList.
columns = ['dataset']
columns.extend(algList)
optimal_df = pd.DataFrame(columns = columns)

for key in optimal_dict.keys():
    # Start from an all-zero row and flag the quantifiers in the optimal set.
    df_row = [key]
    df_row.extend([0] * len(algList))
    quantifiers_set = optimal_dict[key]

    for quantifier in quantifiers_set:
        # +1 skips the leading 'dataset' cell.
        df_row[algList.index(quantifier) + 1] = 1

    optimal_df.loc[len(optimal_df)] = df_row
optimal_df.to_csv('./plot_data/optimal_quantifiers.csv', index = False)

# Topline

This code creates the Topline Experiment Table.

The Topline is the ideal case, which would happen if we choose the best algorithm for every dataset.

In [7]:
def generate_topline(rf_table, datasets):
    """Write the Topline experiment table.

    For the i-th row of ``rf_table`` the quantifier named in 'quantifier-ideal'
    is looked up and its true error ('abs-error-<quantifier>') is stored next to
    the i-th dataset name. The result is saved to
    ``./experiment_tables/summarized/experiment_table_TOPLINE.csv``.

    Args:
        rf_table: recommendation table; rows are addressed by the labels 0, 1, ...
        datasets: dataset file names, in the same order as the rows of ``rf_table``.
    """
    # Dataset names without the '.csv' suffix.
    dataset_names = [d.split('.csv')[0] for d in datasets]

    topline_experiment_table = pd.DataFrame(columns=['MAE', 'Dataset'])
    for position, best_quantifier in enumerate(rf_table['quantifier-ideal'].to_list()):
        # Error of the best quantifier for this dataset.
        best_error = rf_table.loc[position]['abs-error-' + best_quantifier]
        next_row = len(topline_experiment_table)
        topline_experiment_table.loc[next_row] = [best_error, dataset_names[position]]

    topline_experiment_table.to_csv('./experiment_tables/summarized/experiment_table_TOPLINE.csv', index=False)

# Construction of the Top-K sets + weights

This function creates two dictionaries that are important for the next steps:

(1) rf_top_dict:

For each dataset (dictionary key) there is a set of the top-K quantifiers for that dataset (dictionary value).
This is important to calculate the Hit Rate, by comparing the top-K set with the optimal set.


(2) rf_ensemble_top_dict:

For each dataset (dictionary key) there is a list containing the Top-K (i) quantifiers and (ii) their respective weights
This is important for the Top-K Weighted Ensemble Approach, since we need to know the Top-K quantifiers + their respective
weights. For the Top-K Ensemble Approach (not using weights), we only use the Top-K quantifiers (and then we calculate the
mean or median of their estimations).

You can run this code and print "rf_top_dict" and "rf_ensemble_top_dict" for better clarification

In [8]:
# The "top" variable controls the K in the top-K
def generate_top_k_dictionary(top, algList, datasets):
    """Build the recommender's Top-K sets and their ensemble weights.

    Reads the module-level ``rf_table``; the i-th dataset is matched with the
    row labelled ``i``.

    Args:
        top: K, the number of best-ranked quantifiers to keep per dataset.
        algList: quantifier names; the 'rank-<n>' columns index into this list.
        datasets: dataset file names (the '.csv' suffix is stripped for the keys).

    Returns:
        tuple: ``(rf_top_dict, rf_ensemble_top_dict)`` where
        ``rf_top_dict[dataset]`` is the set of the K selected quantifiers and
        ``rf_ensemble_top_dict[dataset]`` is ``[quantifiers, weights]``, both in
        rank order. Each weight is the inverse predicted error divided by the
        sum of the inverse predicted errors.
    """
    dataset_names = [d.split('.csv')[0] for d in datasets]

    rf_top_dict = {name: None for name in dataset_names}
    rf_ensemble_top_dict = {name: [] for name in dataset_names}

    for row_position, dataset in enumerate(rf_top_dict):
        top_quantifiers = []
        predicted_errors = []
        for rank_position in range(1, top + 1):
            ranked = algList[rf_table.loc[row_position]['rank-' + str(rank_position)]]
            top_quantifiers.append(ranked)
            predicted_errors.append(rf_table.loc[row_position]['abs-error-' + ranked + '-predicted'])
        rf_top_dict[dataset] = set(top_quantifiers)

        # Inverse-error weighting: the lower the predicted error, the larger the weight.
        inverse_errors = []
        inverse_total = 0
        for predicted in predicted_errors:
            inverse = 1/predicted
            inverse_total += inverse
            inverse_errors.append(inverse)
        weights = [inverse / inverse_total for inverse in inverse_errors]

        rf_ensemble_top_dict[dataset].append(top_quantifiers)
        rf_ensemble_top_dict[dataset].append(weights)

    return rf_top_dict, rf_ensemble_top_dict

# Construction of the Top-K sets + weights (BASELINE)

This function is important for constructing the BASELINE Ensemble Method, so we can compare with our Top-K Ensemble Approach.

This code generates (1) rf_top_dict and (2) rf_ensemble_top_dict by picking K RANDOM quantifiers (instead of using the recommender)

--> Picks K random quantifiers and uses them as an ensemble.

In [9]:
# The "top" variable controls the K in the top-K
def generate_top_k_dictionary_baseline(top, algList, datasets):
    """Build the BASELINE Top-K sets: K quantifiers drawn at random per dataset.

    Same output layout as ``generate_top_k_dictionary``, but the quantifiers are
    picked with ``random.choice`` (without replacement) instead of using the
    recommender's ranking. The weights still come from the predicted errors
    stored in the module-level ``rf_table``.

    Args:
        top: K, the number of quantifiers drawn per dataset.
        algList: candidate quantifier names (not modified).
        datasets: dataset file names (the '.csv' suffix is stripped for the keys).

    Returns:
        tuple: ``(rf_top_dict, rf_ensemble_top_dict)``.
    """
    dataset_names = [d.split('.csv')[0] for d in datasets]

    rf_top_dict = {name: None for name in dataset_names}
    rf_ensemble_top_dict = {name: [] for name in dataset_names}

    for row_position, dataset in enumerate(rf_top_dict):
        drawn_quantifiers = []
        predicted_errors = []
        # Work on a copy so that removing the drawn names leaves algList intact.
        candidates = algList.copy()
        for _ in range(top):
            drawn = random.choice(candidates)
            candidates.remove(drawn)

            drawn_quantifiers.append(drawn)
            predicted_errors.append(rf_table.loc[row_position]['abs-error-' + drawn + '-predicted'])
        rf_top_dict[dataset] = set(drawn_quantifiers)

        # Inverse-error weighting, normalised so that the weights sum to one.
        inverse_errors = []
        inverse_total = 0
        for predicted in predicted_errors:
            inverse = 1/predicted
            inverse_total += inverse
            inverse_errors.append(inverse)
        weights = [inverse / inverse_total for inverse in inverse_errors]

        rf_ensemble_top_dict[dataset].append(drawn_quantifiers)
        rf_ensemble_top_dict[dataset].append(weights)

    return rf_top_dict, rf_ensemble_top_dict

# Top-K Ensemble HitRate

Functions that generate the HitRate for all the quantifiers plus the Top-K ensemble.

Then, save it to "recommender_hit_table"

In [10]:
# Start from an empty hit-rate table on disk: the functions below re-read this
# file, append their own rows and write it back.
hit_rate_table = pd.DataFrame(columns=['Method', 'HitRate', 'Std'])
hit_rate_table.to_csv('./recommender_hit_table/recommender_hit_rate_table.csv', index=False)

def calculate_quantifiers_hit_rate(algList, datasets):
    """Append the hit rate of every single quantifier to the hit-rate table.

    A quantifier scores a hit on a dataset when it belongs to that dataset's
    optimal set (module-level ``optimal_dict``). For each quantifier the mean
    and standard deviation of its 0/1 hits are appended as one row to
    ``./recommender_hit_table/recommender_hit_rate_table.csv``.

    Args:
        algList: quantifier names to evaluate.
        datasets: dataset file names (the '.csv' suffix is stripped).
    """
    table_path = './recommender_hit_table/recommender_hit_rate_table.csv'
    if os.path.isfile(table_path):
        hit_rate_table = pd.read_csv(table_path)
    else:
        hit_rate_table = pd.DataFrame(columns=['Method', 'HitRate', 'Std'])

    dataset_names = [d.split('.csv')[0] for d in datasets]

    for quantifier in algList:
        # Always "recommending" this one quantifier gives a single-element set.
        recommended = {quantifier}
        hits = [
            1 if recommended.intersection(optimal_dict[name]) else 0
            for name in dataset_names
        ]
        hit_rate_table.loc[len(hit_rate_table)] = [quantifier, np.mean(hits), np.std(hits)]

    # Save the HitRate to the Recommender Hit Table
    hit_rate_table.to_csv(table_path, index=False)



def calculate_ensemble_hit_rate(rf_top_dict, datasets, name):
    """Append the hit rate of a Top-K recommendation to the hit-rate table.

    A dataset is a hit when its Top-K set shares at least one quantifier with
    its optimal set (module-level ``optimal_dict``). The mean and standard
    deviation of the 0/1 hits are appended as one row labelled ``name`` to
    ``./recommender_hit_table/recommender_hit_rate_table.csv``.

    Args:
        rf_top_dict: dataset name -> set of recommended quantifiers.
        datasets: dataset file names (the '.csv' suffix is stripped).
        name: value written to the 'Method' column.
    """
    table_path = './recommender_hit_table/recommender_hit_rate_table.csv'
    if os.path.isfile(table_path):
        hit_rate_table = pd.read_csv(table_path)
    else:
        hit_rate_table = pd.DataFrame(columns=['Method', 'HitRate', 'Std'])

    dataset_names = [d.split('.csv')[0] for d in datasets]

    # 1 when the recommended set and the optimal set overlap, 0 otherwise.
    hits = [
        1 if rf_top_dict[dataset].intersection(optimal_dict[dataset]) else 0
        for dataset in dataset_names
    ]
    hit_rate = np.mean(hits)
    hit_std = np.std(hits)

    # Save the HitRate to the Recommender Hit Table
    hit_rate_table.loc[len(hit_rate_table)] = [name, hit_rate, hit_std]
    hit_rate_table.to_csv(table_path, index=False)



def calculate_ensemble_hit_rate_BASELINE(rf_top_dict, dt_list):
    """Return the hit rate of a (random) Top-K selection without saving it.

    A dataset is a hit when its selected set shares at least one quantifier with
    its optimal set (module-level ``optimal_dict``).

    Args:
        rf_top_dict: dataset name -> set of selected quantifiers.
        dt_list: dataset names (already without the '.csv' suffix).

    Returns:
        tuple: ``(mean, standard deviation)`` of the 0/1 hits.
    """
    hits = [
        1 if rf_top_dict[dataset].intersection(optimal_dict[dataset]) else 0
        for dataset in dt_list
    ]
    return np.mean(hits), np.std(hits)

# Top-K Ensemble

This function constructs the top-K ensemble method

Each recommender predicts a MAE for their specific quantifier

Quantifiers are then ordered by their predicted MAE (ascending order)

The Top-k are selected and are used to give their own estimates of the class prevalence {p_1, ..., p_k}

The "final" prevalence is the mean OR median (we use both approaches) of the k estimations

Then, the absolute error of our approach (mean or median) is calculated -> ABSOLUTE(real_prevalence - estimated_prevalence)

In [11]:
def construct_top_k_ensemble(rf_top_dict, top, name):
    """Evaluate the (unweighted) Top-K ensemble and save its experiment tables.

    For every dataset, the per-alpha prevalence estimates of the selected
    quantifiers are combined by their mean and by their median; the reported
    MAE is the mean absolute difference to the real prevalence over all alphas.

    Reads the module-level ``datasets`` and ``experiment_tables_dict``. The
    number of alpha levels is taken from the data rather than assumed to be 20.

    Args:
        rf_top_dict: dataset name -> set of selected quantifiers. Entries are
            paired with ``datasets`` by position, so the dict must list the
            datasets in that same order.
        top: K. With ``top == 1`` only ``<name>.csv`` is written (mean and
            median coincide); otherwise ``<name>_MEAN.csv`` and
            ``<name>_MEDIAN.csv`` are written.
        name: base file name inside ``./experiment_tables/summarized/``.

    Returns:
        tuple: ``(mean_table, median_table)``, DataFrames with the columns
        'MAE' and 'Dataset'.

    Raises:
        ValueError: if a dataset has an empty quantifier set, or if its
            quantifiers do not share the same number of alpha levels.
    """
    dt_list = [d.split('.csv')[0] for d in datasets]

    mean_rows = []
    median_rows = []
    for index, quantifier_set in enumerate(rf_top_dict.values()):
        dataset_name = dt_list[index]

        real_p = None
        pred_p_per_quantifier = []
        for x in quantifier_set:
            # Per-alpha means of this quantifier on this dataset (computed once).
            table = experiment_tables_dict[x]
            per_alpha = table.loc[table['dataset_name'] == dataset_name].groupby('alpha').mean(numeric_only=True)
            real_p = per_alpha['real_p'].to_numpy(dtype=float)
            pred_p_per_quantifier.append(per_alpha['pred_p'].to_numpy(dtype=float))

        if real_p is None:
            raise ValueError('no quantifier selected for dataset ' + repr(dataset_name))

        # Rows: quantifiers, columns: alpha levels.
        stacked_pred_p = np.vstack(pred_p_per_quantifier)

        # The "final" prevalence is the mean or median of the K estimates per alpha.
        abs_error_mean = np.mean(np.abs(real_p - stacked_pred_p.mean(axis=0)))
        abs_error_median = np.mean(np.abs(real_p - np.median(stacked_pred_p, axis=0)))

        mean_rows.append([abs_error_mean, dataset_name])
        median_rows.append([abs_error_median, dataset_name])

    top_k_experiment_table_mean = pd.DataFrame(mean_rows, columns=['MAE', 'Dataset'])
    top_k_experiment_table_median = pd.DataFrame(median_rows, columns=['MAE', 'Dataset'])

    if top == 1:
        top_k_experiment_table_mean.to_csv('./experiment_tables/summarized/'+name+'.csv', index=False)
    else:
        top_k_experiment_table_mean.to_csv('./experiment_tables/summarized/'+name+'_MEAN.csv', index=False)
        top_k_experiment_table_median.to_csv('./experiment_tables/summarized/'+name+'_MEDIAN.csv', index=False)

    return top_k_experiment_table_mean, top_k_experiment_table_median

# Top-K Ensemble Baseline

In [12]:
def construct_top_k_ensemble_BASELINE(top, repeat, algList, datasets, name):
    """Evaluate the BASELINE ensemble: K random quantifiers, averaged over repeats.

    Each repetition draws K random quantifiers per dataset
    (``generate_top_k_dictionary_baseline``) and computes the mean- and
    median-combined ensemble MAE exactly like ``construct_top_k_ensemble``.
    The per-dataset MAEs are then averaged over all repetitions and saved.

    Reads the module-level ``experiment_tables_dict``. The number of alpha
    levels is taken from the data rather than assumed to be 20.

    Args:
        top: K, the number of random quantifiers per dataset.
        repeat: number of random repetitions to average over.
        algList: candidate quantifier names.
        datasets: dataset file names (the '.csv' suffix is stripped).
        name: base file name inside ``./experiment_tables/summarized/``
            (``<name>.csv`` when ``top == 1``, otherwise ``<name>_MEAN.csv``
            and ``<name>_MEDIAN.csv``).

    Returns:
        tuple: ``(mean_table, median_table)``, DataFrames with the columns
        'MAE' and 'Dataset'.

    Raises:
        ValueError: if a dataset has an empty quantifier set, or if its
            quantifiers do not share the same number of alpha levels.
    """
    dt_list = [d.split('.csv')[0] for d in datasets]
    top_k_experiment_mean_dict = {dt: [] for dt in dt_list}
    top_k_experiment_median_dict = {dt: [] for dt in dt_list}

    # NOTE(review): the baseline hit rates are collected here but are neither
    # returned nor saved anywhere in this function -- confirm whether they
    # should be written to the hit-rate table.
    top_k_hit_rate_list = []
    top_k_hit_rate_std_list = []

    for repeat_index in range(0, repeat):
        rf_top_dict, _ = generate_top_k_dictionary_baseline(top, algList, datasets)

        rf_top_hit_rate, rf_top_std = calculate_ensemble_hit_rate_BASELINE(rf_top_dict, dt_list)
        top_k_hit_rate_list.append(rf_top_hit_rate)
        top_k_hit_rate_std_list.append(rf_top_std)

        for index, quantifier_set in enumerate(rf_top_dict.values()):
            dataset_name = dt_list[index]

            real_p = None
            pred_p_per_quantifier = []
            for x in quantifier_set:
                # Per-alpha means of this quantifier on this dataset (computed once).
                table = experiment_tables_dict[x]
                per_alpha = table.loc[table['dataset_name'] == dataset_name].groupby('alpha').mean(numeric_only=True)
                real_p = per_alpha['real_p'].to_numpy(dtype=float)
                pred_p_per_quantifier.append(per_alpha['pred_p'].to_numpy(dtype=float))

            if real_p is None:
                raise ValueError('no quantifier selected for dataset ' + repr(dataset_name))

            # Rows: quantifiers, columns: alpha levels.
            stacked_pred_p = np.vstack(pred_p_per_quantifier)

            # The "final" prevalence is the mean or median of the K estimates per alpha.
            abs_error_mean = np.mean(np.abs(real_p - stacked_pred_p.mean(axis=0)))
            abs_error_median = np.mean(np.abs(real_p - np.median(stacked_pred_p, axis=0)))

            top_k_experiment_mean_dict[dataset_name].append(abs_error_mean)
            top_k_experiment_median_dict[dataset_name].append(abs_error_median)

    # Average every dataset's MAE over the repetitions.
    mean_rows = [[np.mean(top_k_experiment_mean_dict[dt]), dt] for dt in dt_list]
    median_rows = [[np.mean(top_k_experiment_median_dict[dt]), dt] for dt in dt_list]
    top_k_experiment_table_mean = pd.DataFrame(mean_rows, columns=['MAE', 'Dataset'])
    top_k_experiment_table_median = pd.DataFrame(median_rows, columns=['MAE', 'Dataset'])

    top_k_hit_rate = np.mean(top_k_hit_rate_list)
    top_k_hit_rate_std = np.mean(top_k_hit_rate_std_list)

    if top == 1:
        top_k_experiment_table_mean.to_csv('./experiment_tables/summarized/'+name+'.csv', index=False)
    else:
        top_k_experiment_table_mean.to_csv('./experiment_tables/summarized/'+name+'_MEAN.csv', index=False)
        top_k_experiment_table_median.to_csv('./experiment_tables/summarized/'+name+'_MEDIAN.csv', index=False)

    return top_k_experiment_table_mean, top_k_experiment_table_median

# Top-K Weighted Ensemble

This function implements the weighted top-K ensemble method

Each recommender predicts a MAE for their specific quantifier

Quantifiers are then ordered by their predicted MAE (ascending order)

A weight is attributed to each quantifier based on its MAE. The lower the MAE, the bigger the weight. The first quantifier (Top-1) has the biggest weight, the second (Top-2) has the second biggest weight, and so on.

ALL of the weights sum up to one -> weight_quantifier_1 + weight_quantifier_2 + ... + weight_quantifier_k = 1

The Top-k quantifiers are selected and each one gives its own estimate of the class prevalence {p_1, ..., p_k}

The "final" prevalence is the sum of the k estimations multiplied by their respective weight:

prevalence_estimation = weight_quantifier_1 * prevalence_quantifier_1 + ... + weight_quantifier_k * prevalence_quantifier_k

Then, the absolute error of our approach is calculated -> ABSOLUTE(real_prevalence - estimated_prevalence)

In [13]:
def construct_top_k_weighted_ensemble(rf_ensemble_top_dict, top, name):
    """Evaluate the weighted Top-K ensemble and save its experiment table.

    For every dataset, the final per-alpha prevalence is the weighted sum of the
    selected quantifiers' per-alpha estimates; the reported MAE is the mean
    absolute difference to the real prevalence over all alphas.

    Reads the module-level ``datasets`` and ``experiment_tables_dict``. The
    number of alpha levels is taken from the data rather than assumed to be 20.

    Args:
        rf_ensemble_top_dict: dataset name -> ``[quantifiers, weights]`` (two
            parallel lists). Entries are paired with ``datasets`` by position,
            so the dict must list the datasets in that same order.
        top: K. The table is written to ``<name>_WEIGHTED.csv`` only when
            ``top != 1``.
        name: base file name inside ``./experiment_tables/summarized/``.

    Returns:
        DataFrame with the columns 'MAE' and 'Dataset'.

    Raises:
        ValueError: if a dataset has an empty quantifier list, or if its
            quantifiers do not share the same number of alpha levels.
    """
    dt_list = [d.split('.csv')[0] for d in datasets]

    rows = []
    for index, entry in enumerate(rf_ensemble_top_dict.values()):
        dataset_name = dt_list[index]
        quantifier_list = entry[0]
        weight_list = entry[1]

        real_p = None
        prevalence = None
        for position, x in enumerate(quantifier_list):
            # Per-alpha means of this quantifier on this dataset (computed once).
            table = experiment_tables_dict[x]
            per_alpha = table.loc[table['dataset_name'] == dataset_name].groupby('alpha').mean(numeric_only=True)
            real_p = per_alpha['real_p'].to_numpy(dtype=float)

            # The "final" prevalence is the sum of the K estimates scaled by their weights.
            weighted_pred_p = per_alpha['pred_p'].to_numpy(dtype=float) * weight_list[position]
            prevalence = weighted_pred_p if prevalence is None else prevalence + weighted_pred_p

        if real_p is None:
            raise ValueError('no quantifier selected for dataset ' + repr(dataset_name))

        ensemble_abs_error = np.mean(np.abs(real_p - prevalence))
        rows.append([ensemble_abs_error, dataset_name])

    top_k_weighted_experiment_table = pd.DataFrame(rows, columns=['MAE', 'Dataset'])

    if top != 1:
        top_k_weighted_experiment_table.to_csv('./experiment_tables/summarized/'+name+'_WEIGHTED.csv', index=False)

    return top_k_weighted_experiment_table

# Generate Results

Here, we run all the functions above to generate the results for the following:
* Topline
* Baseline Ensemble Top 3
* Baseline Ensemble Top 3 + Weighted
* Baseline Ensemble Top 5
* Baseline Ensemble Top 5 + Weighted
* Recommender Top 1
* Recommender Top 3
* Recommender Top 3 + Weighted
* Recommender Top 5
* Recommender Top 5 + Weighted

In [14]:
############################################################### TOPLINE
generate_topline(rf_table, datasets)
# PS: TOPLINE HIT RATE IS OBVIOUSLY 100%


# Each baseline below is averaged over 30 random repetitions.
# NOTE(review): `random` is not seeded in the cells visible here, so the baseline
# tables can change from run to run.
############################################################### BASELINE TOP 1
top = 1
construct_top_k_ensemble_BASELINE(top, 30, algList, datasets, 'experiment_table_BASELINE_TOP1')


############################################################### BASELINE TOP 3
top = 3
construct_top_k_ensemble_BASELINE(top, 30, algList, datasets, 'experiment_table_BASELINE_TOP3')


############################################################### BASELINE TOP 5
top = 5
construct_top_k_ensemble_BASELINE(top, 30, algList, datasets, 'experiment_table_BASELINE_TOP5')


############################################################### TOP 1
top = 1

# We set rf_top_dict AND rf_ensemble_top_weighted_dict with
# the function that uses our recommendation (OUR APPROACH)
rf_top_dict, rf_ensemble_top_weighted_dict = generate_top_k_dictionary(top, algList, datasets)

# Then we construct the ensembles with them
# (with top == 1 the weighted call computes its table but does not write a file)
construct_top_k_ensemble(rf_top_dict, top, 'experiment_table_TOP1')
construct_top_k_weighted_ensemble(rf_ensemble_top_weighted_dict, top, 'experiment_table_TOP1')
calculate_ensemble_hit_rate(rf_top_dict, datasets, 'Top1')


############################################################### TOP 3
top = 3

# We set rf_top_dict AND rf_ensemble_top_weighted_dict with
# the function that uses our recommendation (OUR APPROACH)
rf_top_dict, rf_ensemble_top_weighted_dict = generate_top_k_dictionary(top, algList, datasets)

# Then we construct the ensembles with them
construct_top_k_ensemble(rf_top_dict, top, 'experiment_table_TOP3')
construct_top_k_weighted_ensemble(rf_ensemble_top_weighted_dict, top, 'experiment_table_TOP3')
calculate_ensemble_hit_rate(rf_top_dict, datasets, 'Top3')


############################################################### TOP 5
top = 5

# We set rf_top_dict AND rf_ensemble_top_weighted_dict with the function that uses our recommendation (OUR APPROACH)
rf_top_dict, rf_ensemble_top_weighted_dict = generate_top_k_dictionary(top, algList, datasets)

# Then we construct the ensembles with them
construct_top_k_ensemble(rf_top_dict, top, 'experiment_table_TOP5')
construct_top_k_weighted_ensemble(rf_ensemble_top_weighted_dict, top, 'experiment_table_TOP5')
calculate_ensemble_hit_rate(rf_top_dict, datasets, 'Top5')


################################################################ BASE QUANTIFIERS HIT RATE
# Appends one hit-rate row per base quantifier after the Top1 / Top3 / Top5 rows.
calculate_quantifiers_hit_rate(algList, datasets)

# Unify all the Experiment Tables

In the previous steps, we created Experiment Tables for each quantifier. The experiment tables for our Top-K approach were created here, while the experiment tables for the other quantifiers were created in the APP function and were summarized here (refer to 'summarized' folder).

Now we are going to unify all these experiment tables into a single Experiment Table (even more summarized) consisting of a DataFrame containing (1) the method (algorithm), (2) the Dataset name and (3) MAE.

This is just to facilitate our analysis and to condense all the results to a single file.

In [15]:
# The 'even more summarized' Experiment Table contains only the Method, Dataset and MAE
even_more_summarized_experiment_table = pd.DataFrame(columns=['Method', 'Dataset', 'MAE'])

# Dataset names without the '.csv' suffix.
dt_list = []
for d in datasets:
    dt_list.append(d.split('.csv')[0])

# NOTE(review): every entry of the 'summarized' folder except 'BKP' is read, so files
# left over from earlier runs are merged as well -- confirm the folder is clean.
for filename in os.listdir('./experiment_tables/summarized/'):
    if(filename == 'BKP'):
        continue

    # We load each experiment table
    df = pd.read_csv('./experiment_tables/summarized/'+filename)
    
    # 'experiment_table_TOP3_MEAN.csv' -> 'TOP3+MEAN'
    method_name = filename.split('experiment_table_')[1].split('.csv')[0]
    method_name = method_name.replace('_', '+')
    
    # And then we include each row in the 'even more summarized' one
    # The dataset name is taken by position from dt_list, not from the file's own
    # 'Dataset' column, so every file must list the datasets in that order.
    index = 0
    for row in df.iterrows():
        even_more_summarized_experiment_table.loc[len(even_more_summarized_experiment_table)] = [method_name, dt_list[index], row[1]['MAE']]
        index += 1

# Save the resulting 'even more summarized' experiment table
even_more_summarized_experiment_table.to_csv('./experiment_tables/even_more_summarized/experiment_table.csv', index=False)

# Save info to plot data folder
even_more_summarized_experiment_table.to_csv('./plot_data/plot_tables/experiment_table.csv', index=False)
hit_rate_table = pd.read_csv('./recommender_hit_table/recommender_hit_rate_table.csv')
hit_rate_table.to_csv('./plot_data/plot_tables/recommender_hit_rate_table.csv', index=False)