___
## <font color="#CA3532"> Cuantificación </font>

##### <font color="#CA7868"> TFM datos reales </font>
##### <font color="#CA7868"> HDX HDX_OVR </font>
##### <font color="#CA7868"> Fuente de datos: https://zenodo.org/record/6546188#.Yp28_DlByHs </font>
###### <font color="#CA7868"> Este dataset contiene un total de 28 clases y está compuesto por 20.000 ejemplos de entrenamiento, cada uno descrito por 300 características, además de 5.000 muestras de test, cada una con 1.000 ejemplos </font>
##### <font color="#CA7868"> Abrán Yiu-sen Yuen Durán</font>

___

___
#### <font color="#CA3532"> Paquetes Necesarios </font>
___

In [1]:
import numpy as np 

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics.pairwise import euclidean_distances
from quantificationlib.multiclass.df import HDy,HDX,DFX

from quantificationlib.decomposition.multiclass import OneVsRestQuantifier  # --> NECESARIO PARA CUANTIFICAR


from quantificationlib.estimators.cross_validation import CV_estimator


def absolute_error(prevs, prevs_hat):
    """Mean absolute error between true and estimated prevalences.

    Args:
        prevs: array of true prevalences; the mean is taken over the last axis.
        prevs_hat: array of estimated prevalences with the same shape as ``prevs``.

    Returns:
        The mean of ``|prevs_hat - prevs|`` along the last axis.

    Raises:
        AssertionError: if ``prevs`` and ``prevs_hat`` differ in shape.
    """
    # The message must be an f-string, otherwise the placeholders are printed
    # literally and the offending shapes are never reported.
    assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}'
    return abs(prevs_hat - prevs).mean(axis=-1)


def relative_absolute_error(p, p_hat, eps=None):
    """Relative absolute error (RAE) between true and estimated prevalences.

    When ``eps`` is given, both inputs are additively smoothed as
    ``(x + eps) / (eps * n_classes + 1)``, with ``n_classes`` taken from the
    size of the last axis, before the relative error is computed.

    Args:
        p: array of true prevalences.
        p_hat: array of estimated prevalences.
        eps: smoothing factor. ``None`` (the default) disables smoothing; in
            that case a zero entry in ``p`` leads to a division by zero.

    Returns:
        The mean of ``|p - p_hat| / p`` along the last axis.
    """
    def __smooth(prevs, epsilon):
        n_classes = prevs.shape[-1]
        return (prevs + epsilon) / (epsilon * n_classes + 1)

    # The default eps=None used to fail with a TypeError inside __smooth;
    # it is now interpreted as "do not smooth".
    if eps is not None:
        p = __smooth(p, eps)
        p_hat = __smooth(p_hat, eps)
    return (abs(p - p_hat) / p).mean(axis=-1)


def load_training_set(dfile):
    """Read a comma-separated training file into features and labels.

    The first line of the file is skipped as a header. The first column is
    cast to ``int`` and returned as the label vector; the remaining columns
    are returned as the feature matrix.

    Args:
        dfile: path (or file-like object) accepted by ``np.genfromtxt``.

    Returns:
        A ``(X, y)`` tuple with the feature matrix and the integer labels.
    """
    table = np.genfromtxt(dfile, delimiter=',', skip_header=1)
    labels = table[:, 0].astype(int)
    features = table[:, 1:]
    return features, labels


def load_testing_bag(dfile):
    """Read one comma-separated test bag, skipping its header line.

    Args:
        dfile: path (or file-like object) accepted by ``np.genfromtxt``.

    Returns:
        The parsed numeric array, one row per line of the file.
    """
    return np.genfromtxt(dfile, delimiter=',', skip_header=1)


def load_prevalences(dfile):
    """Read the true prevalences of the test bags from a comma-separated file.

    The header line is skipped and the first column is dropped (presumably
    the bag identifier; verify against the data files), so each returned row
    holds only the per-class prevalences.

    Args:
        dfile: path (or file-like object) accepted by ``np.genfromtxt``.

    Returns:
        A 2-D array with every column of the file except the first.
    """
    table = np.genfromtxt(dfile, delimiter=',', skip_header=1)
    return table[:, 1:]


import pandas as pd
from pathlib import Path  

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter presumably also hides solver convergence
# and pandas deprecation warnings — confirm that is intended.
warnings.filterwarnings("ignore")

___
#### <font color="#CA3532"> Propensiones por Método HDX , OVR-HDX</font>
___

In [2]:
def _prevalence_frame(true_prev, pred_prev, n_bag, method):
    """Build the (REAL, PREDICHAS, BAG, METODO) frame for one bag and one method."""
    df_real = pd.DataFrame(true_prev, columns=['REAL'])
    df_prev = pd.DataFrame(pred_prev, columns=['PREDICHAS'])
    df_prev['BAG'] = n_bag
    df_prev['METODO'] = method
    return df_real.join(df_prev)


def _export_prevalences(frames, filepath):
    """Concatenate the per-bag frames and write them to ``filepath`` as CSV."""
    if frames:
        # DataFrame.append was removed in pandas 2.0 and re-copied the whole
        # export on every bag. A single concat is used instead; the reversed
        # (newest bag first) order reproduces the row layout that the previous
        # append-based loop wrote.
        df_export = pd.concat(frames[::-1])
    else:
        # n_bags == 0: still produce a well-formed, empty export.
        df_export = pd.DataFrame(columns=['REAL', 'PREDICHAS', 'BAG', 'METODO'])
    filepath.parent.mkdir(parents=True, exist_ok=True)
    df_export.to_csv(filepath)


def main(path, dataset, estimator_name, n_bags=100, bag_inicial=0, master_seed=2032,
         export_dir='/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/'):
    """Estimate class prevalences with HDX and OVR-HDX and export them to CSV.

    The training set is read from ``path + dataset + '/public/training_data.txt'``.
    An HDX quantifier (4 equal-width bins) and a one-vs-rest quantifier built on
    HDX are fitted on it, and both are applied to ``n_bags`` test bags. For
    every bag the true and the predicted prevalences are stored in long format
    (columns REAL, PREDICHAS, BAG, METODO) and written to
    ``HDX-<estimator_name>-n_bags-<n_bags>.csv`` and
    ``OVR_HDX-<estimator_name>-n_bags-<n_bags>.csv`` inside ``export_dir``.

    Args:
        path: root folder of the datasets, including the trailing separator
            (it is concatenated directly with ``dataset``).
        dataset: name of the dataset folder under ``path``.
        estimator_name: ``'LR'`` (logistic regression) or ``'CLLR'``
            (calibrated logistic regression).
        n_bags: number of consecutive test bags to process.
        bag_inicial: identifier of the first test bag. The same identifier is
            now used both for the bag file and for the row of true
            prevalences; previously the file was always read starting at 0.
        master_seed: random state of the stratified 10-fold split.
        export_dir: folder receiving the two CSV files (created if missing).

    Raises:
        ValueError: if ``estimator_name`` is not one of the supported names.
    """
    X_train, y_train = load_training_set(path + dataset + '/public/training_data.txt')

    # Available classifiers
    base_lr = LogisticRegression(C=0.01, max_iter=1000, class_weight='balanced')
    if estimator_name == 'LR':
        estimator = base_lr
    elif estimator_name == 'CLLR':
        estimator = CalibratedClassifierCV(base_lr)
    else:
        raise ValueError('Unknwon estimator')
    skf_train = StratifiedKFold(n_splits=10, shuffle=True, random_state=master_seed)
    estimator_train = CV_estimator(estimator=estimator, cv=skf_train)

    print('Fitting Training Estimator')
    estimator_train.fit(X_train, y_train)
    print('Training Estimator fitted', flush=True)
    # NOTE(review): probs_train is not consumed by anything below — presumably
    # a leftover from the classifier-based (HDy-style) template; confirm it
    # can be dropped.
    probs_train = estimator_train.predict_proba(X_train)
    print('Prediction_train computed')

    # Multivariate method: HDX
    hdx = HDX(n_bins=4, bin_strategy='equal_width')
    hdx.fit(X_train, y_train)
    print('hdx fitted')

    # One-vs-rest decomposition
    ovr_estimator = OneVsRestClassifier(estimator, n_jobs=-1)

    ovr_hdx = OneVsRestQuantifier(base_quantifier=HDX(n_bins=4, bin_strategy='equal_width'))
    ovr_hdx.fit(X_train, y_train)
    print('ovr_hdx fitted')

    print('Fitting Estimator Test')
    # Both supported configurations reuse the cross-validated training
    # estimator for the test bags.
    estimator_test = estimator_train
    print('Estimator test fitted')

    print('Fitting ovr_estimator Test')
    # ovr_estimator is always built above, so it is always fitted here.
    ovr_estimator.fit(X_train, y_train)
    print('ovr_estimator test fitted')

    # True prevalences of the test bags
    prev_true = load_prevalences(path + dataset + '/public/test_prevalences.txt')

    hdx_frames = []
    ovr_frames = []
    for n_bag in range(n_bags):
        # Use one identifier for both the bag file and its true prevalences so
        # that predictions and ground truth always refer to the same bag.
        bag_id = bag_inicial + n_bag
        X_test = load_testing_bag(path + dataset + '/public/test_samples/' + str(bag_id) + '.txt')

        # NOTE(review): probs_test is never used, and HDX presumably ignores
        # predictions_test because it works on X directly — confirm against
        # quantificationlib before removing these classifier calls.
        probs_test = estimator_test.predict_proba(X_test)
        probs_test_ovr = ovr_estimator.predict_proba(X_test)

        true_prev = prev_true[bag_id, :]

        hdx_frames.append(
            _prevalence_frame(true_prev, hdx.predict(X=X_test), n_bag, 'HDx'))
        ovr_frames.append(
            _prevalence_frame(true_prev,
                              ovr_hdx.predict(X=X_test, predictions_test=probs_test_ovr),
                              n_bag, 'OVR-HDx'))

    # File names use n_bags instead of the loop variable, which is undefined
    # when n_bags == 0.
    export_dir = Path(export_dir)
    _export_prevalences(hdx_frames, export_dir / f'HDX-{estimator_name}-n_bags-{n_bags}.csv')
    _export_prevalences(ovr_frames, export_dir / f'OVR_HDX-{estimator_name}-n_bags-{n_bags}.csv')
    
    

In [3]:
# Run the HDX / OVR-HDX export on dataset T1B with the calibrated logistic
# regression ('CLLR') configuration over the first 100 test bags.
# NOTE(review): the dataset path is absolute and user-specific.
main(path='/Users/abran.yuen/Desktop/tfm_dataset/', dataset='T1B',  estimator_name='CLLR', n_bags=100, bag_inicial=0, master_seed=2022)

Fitting Training Estimator
Training Estimator fitted
Prediction_train computed
hdx fitted
ovr_hdx fitted
Fitting Estimator Test
Estimator test fitted
Fitting ovr_estimator Test
ovr_estimator test fitted


In [4]:
# Run the HDX / OVR-HDX export on dataset T1B with the plain logistic
# regression ('LR') configuration over the first 100 test bags.
# NOTE(review): the dataset path is absolute and user-specific.
main(path='/Users/abran.yuen/Desktop/tfm_dataset/', dataset='T1B',  estimator_name='LR', n_bags=100, bag_inicial=0, master_seed=2022)

Fitting Training Estimator
Training Estimator fitted
Prediction_train computed
hdx fitted
ovr_hdx fitted
Fitting Estimator Test
Estimator test fitted
Fitting ovr_estimator Test
ovr_estimator test fitted


## <font color="#CA7868"> MAE y RAE </font>

In [5]:
import pandas as pd

# MAE (mean absolute error)
# NOTE(review): this redefinition replaces the absolute_error declared in the
# first cell and, unlike it, does not check that both shapes agree.
def absolute_error(prevs, prevs_hat):
    """Return the mean absolute difference between two prevalence arrays.

    Args:
        prevs: array of true prevalences.
        prevs_hat: array of estimated prevalences.

    Returns:
        The mean of ``|prevs_hat - prevs|`` along the last axis.
    """
    gap = prevs_hat - prevs
    return abs(gap).mean(axis=-1)

# RAE (relative absolute error) — not an RMSE
def relative_absolute_error(p, p_hat, eps=None):
    """Relative absolute error (RAE) between true and estimated prevalences.

    When ``eps`` is given, both inputs are additively smoothed as
    ``(x + eps) / (eps * n_classes + 1)``, with ``n_classes`` taken from the
    size of the last axis, before the relative error is computed.

    Args:
        p: array of true prevalences.
        p_hat: array of estimated prevalences.
        eps: smoothing factor. ``None`` (the default) disables smoothing; in
            that case a zero entry in ``p`` leads to a division by zero.

    Returns:
        The mean of ``|p - p_hat| / p`` along the last axis.
    """
    def __smooth(prevs, epsilon):
        n_classes = prevs.shape[-1]
        return (prevs + epsilon) / (epsilon * n_classes + 1)

    # The default eps=None used to fail with a TypeError inside __smooth;
    # it is now interpreted as "do not smooth".
    if eps is not None:
        p = __smooth(p, eps)
        p_hat = __smooth(p_hat, eps)
    return (abs(p - p_hat) / p).mean(axis=-1)

# Preprocessing step used before computing the errors
def prepro(df):
    """Extract the true and the predicted prevalences from an exported frame.

    Args:
        df: DataFrame holding the columns ``'REAL'`` and ``'PREDICHAS'``.

    Returns:
        A ``(real, predicted)`` tuple of NumPy arrays, in that order.
    """
    return tuple(df[column].to_numpy() for column in ('REAL', 'PREDICHAS'))

# Load the HDX and OVR-HDX exports written by main() for the 'LR' run.
# NOTE(review): rebinding HDX replaces the quantifier class imported from
# quantificationlib.multiclass.df, so main() cannot be re-run in the same
# kernel after this cell. The paths are also absolute and user-specific.
HDX=pd.read_csv('/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/HDX-LR-n_bags-100.csv')  
OVR_HDX=pd.read_csv('/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/OVR_HDX-LR-n_bags-100.csv')  

In [6]:
# Sanity check: both exports must have the same shape before they are compared.
shapes_match = HDX.shape == OVR_HDX.shape
if shapes_match:
    print('HDX  y OVR_HDX   tienen misma dimensión')
else:
    print('HDX y OVR_HDX necesita preprocesado')

HDX  y OVR_HDX   tienen misma dimensión


#### <font color="#CA7868"> MAE para LR </font>

In [15]:
# Split each export once into its true / predicted vectors, then report the MAE.
REALES_HDX, PREDICHAS_HDX = prepro(HDX)
REALES_OVR_HDX, PREDICHAS_OVR_HDX = prepro(OVR_HDX)

mae_hdx = absolute_error(REALES_HDX, PREDICHAS_HDX)
mae_ovr_hdx = absolute_error(REALES_OVR_HDX, PREDICHAS_OVR_HDX)

print('================= MAE =================')
print('HDX     ', mae_hdx)
print('OVR_HDX ', mae_ovr_hdx)
print(' ')

HDX      0.02039439247464871
OVR_HDX  0.029838098089264697
 


In [10]:
# relative_absolute_error computes the relative absolute error (RAE), not an
# RMSE, so the header is labelled accordingly.
print('================= RAE =================')
print('HDX     ',relative_absolute_error(REALES_HDX,PREDICHAS_HDX,eps=0.0005))
print('OVR_HDX ',relative_absolute_error(REALES_OVR_HDX,PREDICHAS_OVR_HDX,eps=0.0005))
print(' ')

HDX      2.1764206659084735
OVR_HDX  4.193248342973778
 


#### <font color="#CA7868"> MAE para CLLR</font>

In [11]:
# Load the HDX and OVR-HDX exports written by main() for the 'CLLR' run.
# NOTE(review): HDX here is a DataFrame that shadows the quantifier class
# imported from quantificationlib.multiclass.df; the paths are absolute and
# user-specific.
HDX=pd.read_csv('/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/HDX-CLLR-n_bags-100.csv')  
OVR_HDX=pd.read_csv('/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/OVR_HDX-CLLR-n_bags-100.csv')  

In [12]:
# Split each export once into its true / predicted vectors, then report the MAE.
# The labels previously read 'EDY' / 'OVR_EDY' although the frames hold the
# HDX and OVR-HDX exports.
# NOTE(review): the values recorded for this run are identical to the LR run;
# presumably HDX does not depend on the classifier — confirm.
REALES_HDX, PREDICHAS_HDX = prepro(HDX)
REALES_OVR_HDX, PREDICHAS_OVR_HDX = prepro(OVR_HDX)
print('================= MAE =================')
print('HDX     ',absolute_error(REALES_HDX,PREDICHAS_HDX))
print('OVR_HDX ',absolute_error(REALES_OVR_HDX,PREDICHAS_OVR_HDX))
print(' ')

EDY      0.02039439247464871
OVR_EDY  0.029838098089264697
 


In [13]:
# relative_absolute_error computes the relative absolute error (RAE), not an
# RMSE, so the header is labelled accordingly.
print('================= RAE =================')
print('HDX     ',relative_absolute_error(REALES_HDX,PREDICHAS_HDX,eps=0.0005))
print('OVR_HDX ',relative_absolute_error(REALES_OVR_HDX,PREDICHAS_OVR_HDX,eps=0.0005))
print(' ')

HDX      2.1764206659084735
OVR_HDX  4.193248342973778
 


In [14]:
#FIN