___
## <font color="#CA3532"> Cuantificación </font>

##### <font color="#CA7868"> TFM datos reales </font>
##### <font color="#CA7868"> HDY HDY_OVR </font>
##### <font color="#CA7868"> Fuente de datos: https://zenodo.org/record/6546188#.Yp28_DlByHs </font>
###### <font color="#CA7868"> Este conjunto de datos contiene un total de 28 clases y está compuesto por 20.000 ejemplos de entrenamiento, cada uno descrito por 300 características, además de 5.000 muestras de test, cada una con 1.000 ejemplos </font>
##### <font color="#CA7868"> Abrán Yiu-sen Yuen Durán</font>

___

___
#### <font color="#CA3532"> Paquetes Necesarios </font>
___

In [11]:
import numpy as np 

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics.pairwise import euclidean_distances
from quantificationlib.multiclass.df import HDy,HDX,DFX
from quantificationlib.decomposition.multiclass import OneVsRestQuantifier  # --> NECESARIO PARA CUANTIFICAR


from quantificationlib.estimators.cross_validation import CV_estimator


def absolute_error(prevs, prevs_hat):
    """Mean absolute error between true and estimated prevalences.

    Parameters
    ----------
    prevs : numpy.ndarray
        True prevalences.
    prevs_hat : numpy.ndarray
        Estimated prevalences; must have the same shape as ``prevs``.

    Returns
    -------
    The mean of ``|prevs_hat - prevs|`` taken over the last axis.

    Raises
    ------
    AssertionError
        If the two arrays do not have the same shape.
    """
    # f-string: without the prefix the message showed the literal placeholders
    # instead of the offending shapes.
    assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}'
    return abs(prevs_hat - prevs).mean(axis=-1)


def relative_absolute_error(p, p_hat, eps=None):
    """Mean relative absolute error between true and estimated prevalences.

    Parameters
    ----------
    p : numpy.ndarray
        True prevalences.
    p_hat : numpy.ndarray
        Estimated prevalences.
    eps : float or None, optional
        Additive smoothing constant applied to both ``p`` and ``p_hat`` before
        the ratio is computed. ``None`` (the default) disables smoothing, which
        is equivalent to ``eps=0``; in that case a zero in ``p`` leads to a
        division by zero.

    Returns
    -------
    The mean of ``|p - p_hat| / p`` taken over the last axis.
    """
    def _smooth(prevs, epsilon):
        # Size of the last axis is used as the number of classes.
        n_classes = prevs.shape[-1]
        return (prevs + epsilon) / (epsilon * n_classes + 1)

    # The default eps=None used to reach `prevs + None` and raise TypeError.
    if eps is not None:
        p = _smooth(p, eps)
        p_hat = _smooth(p_hat, eps)
    return (abs(p - p_hat) / p).mean(axis=-1)


def load_training_set(dfile):
    """Read a labelled, comma-separated training file.

    The first line of ``dfile`` is skipped as a header. Column 0 is returned
    as the integer label vector and the remaining columns as the feature
    matrix.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
    """
    table = np.genfromtxt(dfile, delimiter=',', skip_header=1)
    labels = table[:, 0].astype(int)
    features = table[:, 1:]
    return features, labels


def load_testing_bag(dfile):
    """Read an unlabelled, comma-separated test sample.

    The first line of ``dfile`` is skipped as a header; every remaining
    column is returned.
    """
    return np.genfromtxt(dfile, delimiter=',', skip_header=1)


def load_prevalences(dfile):
    """Read a comma-separated prevalence file.

    The first line of ``dfile`` is skipped as a header and column 0 is
    discarded (presumably a sample identifier -- verify against the data
    files); the remaining columns are returned.
    """
    rows = np.genfromtxt(dfile, delimiter=',', skip_header=1)
    return rows[:, 1:]


import pandas as pd
from pathlib import Path  

import warnings
warnings.filterwarnings("ignore")

___
#### <font color="#CA3532"> Prevalencias por Método HDY, OVR-HDY</font>
___

In [12]:
def _prevalence_frame(true_prev, pred_prev, n_bag, method):
    """Return one bag's export table with columns REAL, PREDICHAS, BAG and METODO."""
    df_prev = pd.DataFrame(pred_prev, columns=['PREDICHAS'])
    df_prev['BAG'] = n_bag
    df_prev['METODO'] = method
    return pd.DataFrame(true_prev, columns=['REAL']).join(df_prev)


def _export_prevalences(frames, filepath):
    """Concatenate the per-bag tables and write them to ``filepath`` as CSV."""
    filepath.parent.mkdir(parents=True, exist_ok=True)
    # Reversed so the last bag comes first: this is the row order produced by
    # the incremental DataFrame.append that this helper replaces.
    pd.concat(frames[::-1]).to_csv(filepath)


def main(path, dataset, estimator_name, n_bags=100, bag_inicial=0, master_seed=2032,
         export_dir='/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export'):
    """Fit HDy and One-vs-Rest HDy quantifiers and export their predictions.

    The training set is read from ``path + dataset + '/public/'``, the
    quantifiers are fitted on it, and for each test bag the true and the
    predicted prevalences are collected. Two CSV files are written to
    ``export_dir``: ``HDY-<estimator_name>-n_bags-<n_bags>.csv`` and
    ``OVR_HDY-<estimator_name>-n_bags-<n_bags>.csv``.

    Parameters
    ----------
    path : str
        Root folder of the datasets; concatenated as-is with ``dataset``, so
        it must end with a path separator.
    dataset : str
        Dataset folder name (e.g. ``'T1B'``).
    estimator_name : {'LR', 'CLLR'}
        Underlying classifier: logistic regression, or logistic regression
        wrapped in ``CalibratedClassifierCV``.
    n_bags : int, optional
        Number of consecutive test bags to process; must be at least 1.
    bag_inicial : int, optional
        Index of the first test bag. It selects both the sample file and the
        row of the true-prevalence table.
    master_seed : int, optional
        Random state of the stratified 10-fold split used by the training
        estimator.
    export_dir : str or pathlib.Path, optional
        Folder where the CSV files are written; created if missing.

    Raises
    ------
    ValueError
        If ``estimator_name`` is not supported or ``n_bags`` is below 1.
    """
    if n_bags < 1:
        raise ValueError('n_bags must be at least 1')

    X_train, y_train = load_training_set(path + dataset + '/public/training_data.txt')

    # Available classifiers
    if estimator_name == 'LR':
        estimator = LogisticRegression(C=0.01, max_iter=1000, class_weight='balanced')
    elif estimator_name == 'CLLR':
        estimator = CalibratedClassifierCV(LogisticRegression(C=0.01, max_iter=1000, class_weight='balanced'))
    else:
        raise ValueError('Unknwon estimator')
    skf_train = StratifiedKFold(n_splits=10, shuffle=True, random_state=master_seed)
    estimator_train = CV_estimator(estimator=estimator, cv=skf_train)
    # No dedicated test estimator for either classifier: the training one is reused below.
    estimator_test = None

    print('Fitting Training Estimator')
    estimator_train.fit(X_train, y_train)
    print('Training Estimator fitted', flush=True)
    probs_train = estimator_train.predict_proba(X_train)
    print('Prediction_train computed')

    # Multiclass HDy, fitted on the precomputed training predictions
    hdy = HDy(n_bins=4, bin_strategy='equal_width')
    hdy.fit(X_train, y_train, predictions_train=probs_train)
    print('HDY fitted')

    # One-vs-Rest HDy needs a One-vs-Rest estimator
    ovr_estimator = OneVsRestClassifier(estimator, n_jobs=-1)
    ovr_hdy = OneVsRestQuantifier(base_quantifier=HDy(n_bins=4, bin_strategy='equal_width'),
                                  estimator_train=ovr_estimator, estimator_test=ovr_estimator)
    ovr_hdy.fit(X_train, y_train)
    print('ovr_hdy fitted')

    # Estimator used to score the test bags
    print('Fitting Estimator Test')
    if estimator_test is None:
        estimator_test = estimator_train
    else:
        estimator_test.fit(X_train, y_train)
    print('Estimator test fitted')

    print('Fitting ovr_estimator Test')
    # NOTE(review): ovr_hdy.fit above received this same object; if the
    # quantifier already fits it, this call is redundant -- confirm against
    # quantificationlib before removing it.
    ovr_estimator.fit(X_train, y_train)
    print('ovr_estimator test fitted')

    # True prevalences, one row per test bag
    prev_true = load_prevalences(path + dataset + '/public/test_prevalences.txt')

    frames_hdy = []
    frames_ovr = []
    for n_bag in range(n_bags):
        # Same index for the sample file and the true-prevalence row, so that
        # they stay aligned when bag_inicial is not 0.
        bag_id = bag_inicial + n_bag
        X_test = load_testing_bag(path + dataset + '/public/test_samples/' + str(bag_id) + '.txt')
        true_prev = prev_true[bag_id, :]

        probs_test = estimator_test.predict_proba(X_test)
        probs_test_ovr = ovr_estimator.predict_proba(X_test)

        frames_hdy.append(_prevalence_frame(
            true_prev, hdy.predict(X=None, predictions_test=probs_test), n_bag, 'HDY'))
        frames_ovr.append(_prevalence_frame(
            true_prev, ovr_hdy.predict(X=None, predictions_test=probs_test_ovr), n_bag, 'OVR-HDY'))

    export_dir = Path(export_dir)
    _export_prevalences(
        frames_hdy, export_dir / ('HDY-' + str(estimator_name) + '-n_bags-' + str(n_bags) + '.csv'))
    _export_prevalences(
        frames_ovr, export_dir / ('OVR_HDY-' + str(estimator_name) + '-n_bags-' + str(n_bags) + '.csv'))
    


In [13]:
# Run the full pipeline on dataset T1B with the calibrated logistic regression
main(
    path='/Users/abran.yuen/Desktop/tfm_dataset/',
    dataset='T1B',
    estimator_name='CLLR',
    n_bags=100,
    bag_inicial=0,
    master_seed=2022,
)

Fitting Training Estimator
Training Estimator fitted
Prediction_train computed
HDY fitted
ovr_hdy fitted
Fitting Estimator Test
Estimator test fitted
Fitting ovr_estimator Test
ovr_estimator test fitted


In [14]:
# Run the full pipeline on dataset T1B with plain logistic regression
main(
    path='/Users/abran.yuen/Desktop/tfm_dataset/',
    dataset='T1B',
    estimator_name='LR',
    n_bags=100,
    bag_inicial=0,
    master_seed=2022,
)

Fitting Training Estimator
Training Estimator fitted
Prediction_train computed
HDY fitted
ovr_hdy fitted
Fitting Estimator Test
Estimator test fitted
Fitting ovr_estimator Test
ovr_estimator test fitted


## <font color="#CA7868"> MAE </font>

In [23]:
import pandas as pd

# MAE
def absolute_error(prevs, prevs_hat):
    """Mean absolute error: mean of ``|prevs_hat - prevs|`` over the last axis.

    No shape check is performed on the inputs.
    """
    deviation = abs(prevs_hat - prevs)
    return deviation.mean(axis=-1)

# RAE (relative absolute error)
def relative_absolute_error(p, p_hat, eps=None):
    """Mean relative absolute error between true and estimated prevalences.

    Parameters
    ----------
    p : numpy.ndarray
        True prevalences.
    p_hat : numpy.ndarray
        Estimated prevalences.
    eps : float or None, optional
        Additive smoothing constant applied to both ``p`` and ``p_hat`` before
        the ratio is computed. ``None`` (the default) disables smoothing, which
        is equivalent to ``eps=0``; in that case a zero in ``p`` leads to a
        division by zero.

    Returns
    -------
    The mean of ``|p - p_hat| / p`` taken over the last axis.
    """
    def _smooth(prevs, epsilon):
        # Size of the last axis is used as the number of classes.
        n_classes = prevs.shape[-1]
        return (prevs + epsilon) / (epsilon * n_classes + 1)

    # The default eps=None used to reach `prevs + None` and raise TypeError.
    if eps is not None:
        p = _smooth(p, eps)
        p_hat = _smooth(p_hat, eps)
    return (abs(p - p_hat) / p).mean(axis=-1)

# Preprocessing step used before computing the errors
def prepro(df):
    """Return the 'REAL' and 'PREDICHAS' columns of ``df`` as numpy arrays.

    Returns
    -------
    (real, predicted) : tuple of numpy.ndarray
    """
    return df['REAL'].to_numpy(), df['PREDICHAS'].to_numpy()

# Exported true/predicted prevalences of both methods for the LR run
HDY, OVR_HDY = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/HDY-LR-n_bags-100.csv',
        '/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/OVR_HDY-LR-n_bags-100.csv',
    )
)

In [24]:
# Report whether both exports have the same dimensions
if HDY.shape == OVR_HDY.shape:
    print('HDY  y OVR_HDY   tienen misma dimensión')
else:
    print('HDY y OVR_HDY necesita preprocesado')

HDY  y OVR_HDY   tienen misma dimensión


#### <font color="#CA7868"> MAE para LR </font>

In [25]:
# Split each export into true / predicted arrays (one prepro call per frame)
REALES_HDY, PREDICHAS_HDY = prepro(HDY)
REALES_OVR_HDY, PREDICHAS_OVR_HDY = prepro(OVR_HDY)

print('================= MAE =================')
print('HDY     ', absolute_error(REALES_HDY, PREDICHAS_HDY))
print('OVR_HDY ', absolute_error(REALES_OVR_HDY, PREDICHAS_OVR_HDY))
print(' ')

HDY      0.011954580872043074
OVR_HDY  0.038988841514951696
 


In [26]:
# The value reported here is the relative absolute error (RAE), not an RMSE,
# so the header is labelled accordingly.
print('================= RAE =================')
print('HDY     ',relative_absolute_error(REALES_HDY,PREDICHAS_HDY,eps=0.0005))
print('OVR_HDY ',relative_absolute_error(REALES_OVR_HDY,PREDICHAS_OVR_HDY,eps=0.0005))
print(' ')

HDY      1.0388820127215572
OVR_HDY  5.5590625970584275
 


#### <font color="#CA7868"> MAE para CLLR</font>

In [27]:
# Exported true/predicted prevalences of both methods for the CLLR run
HDY, OVR_HDY = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/HDY-CLLR-n_bags-100.csv',
        '/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/OVR_HDY-CLLR-n_bags-100.csv',
    )
)

In [28]:
# Split each export into true / predicted arrays (one prepro call per frame)
REALES_HDY, PREDICHAS_HDY = prepro(HDY)
REALES_OVR_HDY, PREDICHAS_OVR_HDY = prepro(OVR_HDY)

print('================= MAE =================')
print('HDY     ', absolute_error(REALES_HDY, PREDICHAS_HDY))
print('OVR_HDY ', absolute_error(REALES_OVR_HDY, PREDICHAS_OVR_HDY))
print(' ')

HDY      0.017777129827112884
OVR_HDY  0.035214709634294114
 


In [29]:
# The value reported here is the relative absolute error (RAE), not an RMSE,
# so the header is labelled accordingly.
print('================= RAE =================')
print('HDY     ',relative_absolute_error(REALES_HDY,PREDICHAS_HDY,eps=0.0005))
print('OVR_HDY ',relative_absolute_error(REALES_OVR_HDY,PREDICHAS_OVR_HDY,eps=0.0005))
print(' ')

HDY      1.7851032188053568
OVR_HDY  4.077302698877278
 


#FIN