___
## <font color="#CA3532"> Cuantificación </font>

##### <font color="#CA7868"> TFM datos reales </font>
##### <font color="#CA7868"> EDX EDX_OVR </font>
##### <font color="#CA7868"> Fuente de datos: https://zenodo.org/record/6546188#.Yp28_DlByHs </font>
###### <font color="#CA7868"> Este dataset contiene un total de 28 clases y está compuesto por 20.000 ejemplos de entrenamiento, cada uno descrito por 300 características, además de 5.000 muestras de test, cada una con 1.000 ejemplos. </font>
##### <font color="#CA7868"> Abrán Yiu-sen Yuen Durán</font>

___

___
#### <font color="#CA3532"> Paquetes Necesarios </font>
___

In [21]:
import numpy as np 

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics.pairwise import euclidean_distances

from quantificationlib.multiclass.energy import EDy

from quantificationlib.decomposition.multiclass import OneVsRestQuantifier  # --> NECESARIO PARA CUANTIFICAR

from quantificationlib.estimators.cross_validation import CV_estimator

def absolute_error(prevs, prevs_hat):
    """Mean absolute error between true and estimated prevalences.

    Parameters
    ----------
    prevs : numpy.ndarray
        True prevalences.
    prevs_hat : numpy.ndarray
        Estimated prevalences; must have exactly the same shape as ``prevs``.

    Returns
    -------
    The mean of ``|prevs_hat - prevs|`` taken over the last axis.

    Raises
    ------
    AssertionError
        If the two arrays do not have the same shape.
    """
    # The message must be an f-string, otherwise the placeholders are printed
    # literally and the offending shapes are never reported.
    assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}'
    return abs(prevs_hat - prevs).mean(axis=-1)


def relative_absolute_error(p, p_hat, eps=None):
    """Mean relative absolute error between true and estimated prevalences.

    Both vectors are additively smoothed with ``eps`` before the relative
    error ``|p - p_hat| / p`` is averaged over the last axis.

    Parameters
    ----------
    p : numpy.ndarray
        True prevalences.
    p_hat : numpy.ndarray
        Estimated prevalences.
    eps : float, optional
        Smoothing constant. ``None`` (the default) disables smoothing; note
        that without smoothing a zero entry in ``p`` leads to a division by
        zero.

    Returns
    -------
    The mean relative absolute error over the last axis.
    """
    # The declared default eps=None previously crashed with a TypeError inside
    # the smoothing step; interpret it as "no smoothing" (eps = 0 is the identity).
    if eps is None:
        eps = 0.0

    def __smooth(prevs, epsilon):
        # The size of the last axis is taken as the number of classes.
        n_classes = prevs.shape[-1]
        return (prevs + epsilon) / (epsilon * n_classes + 1)

    p = __smooth(p, eps)
    p_hat = __smooth(p_hat, eps)
    return (abs(p - p_hat) / p).mean(axis=-1)


def load_training_set(dfile):
    """Read a comma-separated training file and split it into features and labels.

    The first line of the file is skipped. The first column is cast to ``int``
    and returned as the label vector; all remaining columns form the feature
    matrix.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
    """
    table = np.genfromtxt(dfile, delimiter=',', skip_header=1)
    labels = table[:, 0].astype(int)
    features = table[:, 1:]
    return features, labels


def load_testing_bag(dfile):
    """Read one comma-separated test bag (first line skipped) as a float array."""
    return np.genfromtxt(dfile, delimiter=',', skip_header=1)


def load_prevalences(dfile):
    """Read a comma-separated prevalence file and return it without its first column.

    The first line of the file is skipped. The first column is dropped
    (presumably a bag identifier -- confirm against the data files); every
    remaining column is returned unchanged.
    """
    table = np.genfromtxt(dfile, delimiter=',', skip_header=1)
    return table[:, 1:]


import pandas as pd
from pathlib import Path  

import warnings
# NOTE(review): this installs a global "ignore" filter, so every warning raised
# after this cell runs is suppressed. Consider narrowing it to specific
# categories so that unexpected warnings remain visible.
warnings.filterwarnings("ignore")

___
#### <font color="#CA3532"> Propensiones por Método EDY , OVR-EDY</font>
___

In [22]:
def _bag_frame(true_prev, pred_prev, bag_id, method):
    """Build the export table for one bag with columns REAL, PREDICHAS, BAG and METODO."""
    df_real = pd.DataFrame(true_prev, columns=['REAL'])
    df_prev = pd.DataFrame(pred_prev, columns=['PREDICHAS'])
    df_prev['BAG'] = bag_id
    df_prev['METODO'] = method
    return df_real.join(df_prev)


def _stack_bags(frames):
    """Concatenate the per-bag tables, newest bag first (the row order of the original export)."""
    if not frames:
        # Nothing was processed: still return a well-formed (header-only) table.
        return pd.DataFrame(columns=['REAL', 'PREDICHAS', 'BAG', 'METODO'])
    return pd.concat(frames[::-1])


def main(path, dataset, estimator_name, n_bags=100, bag_inicial=0, master_seed=2032,
         export_dir='/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export'):
    """Fit EDy and OVR-EDy on one dataset and export true vs. predicted prevalences.

    Parameters
    ----------
    path : str
        Prefix of the dataset location; it is concatenated directly with
        ``dataset``, so it is expected to end with a path separator.
    dataset : str
        Dataset folder name; files are read from ``<path><dataset>/public/``.
    estimator_name : {'LR', 'CLLR'}
        Underlying classifier: logistic regression, or the same model wrapped
        in ``CalibratedClassifierCV``.
    n_bags : int
        Number of consecutive test bags to process.
    bag_inicial : int
        Index of the first test bag. It selects both the test sample file and
        the row of the true-prevalence table.
    master_seed : int
        Random state of the stratified 10-fold split.
    export_dir : str
        Directory where the two CSV files are written; created if missing.

    Raises
    ------
    ValueError
        If ``estimator_name`` is not one of the supported names.

    Side effects
    ------------
    Prints progress messages and writes ``EDY-<estimator>-n_bags-<n>.csv`` and
    ``OVR_EDY-<estimator>-n_bags-<n>.csv`` into ``export_dir``.
    """
    public_dir = path + dataset + '/public/'
    X_train, y_train = load_training_set(public_dir + 'training_data.txt')

    # Available classifiers
    if estimator_name == 'LR':
        estimator = LogisticRegression(C=0.01, max_iter=1000, class_weight='balanced')
    elif estimator_name == 'CLLR':
        estimator = CalibratedClassifierCV(
            LogisticRegression(C=0.01, max_iter=1000, class_weight='balanced'))
    else:
        raise ValueError('Unknown estimator')
    skf_train = StratifiedKFold(n_splits=10, shuffle=True, random_state=master_seed)
    estimator_train = CV_estimator(estimator=estimator, cv=skf_train)
    # No dedicated test estimator is configured for the classifiers above,
    # so the training estimator is reused for the test bags further down.
    estimator_test = None

    print('Fitting Training Estimator')
    estimator_train.fit(X_train, y_train)
    print('Training Estimator fitted', flush=True)
    probs_train = estimator_train.predict_proba(X_train)
    print('Prediction_train computed')

    # Multiclass method: EDy fed with the precomputed training probabilities
    edy = EDy()
    edy.fit(X_train, y_train, predictions_train=probs_train)
    print('edy fitted')

    # One-vs-rest method: the decomposition needs a one-vs-rest estimator
    ovr_estimator = OneVsRestClassifier(estimator, n_jobs=-1)
    ovr_edy = OneVsRestQuantifier(base_quantifier=EDy(),
                                  estimator_train=ovr_estimator, estimator_test=ovr_estimator)
    ovr_edy.fit(X_train, y_train)
    print('ovr_edy fitted')

    # Estimator used to score the test bags
    print('Fitting Estimator Test')
    if estimator_test is None:
        estimator_test = estimator_train
    else:
        estimator_test.fit(X_train, y_train)
    print('Estimator test fitted')

    # ovr_estimator is always constructed above, so it is fitted unconditionally
    # (the former "is None" fallback could never be taken).
    print('Fitting ovr_estimator Test')
    ovr_estimator.fit(X_train, y_train)
    print('ovr_estimator test fitted')

    # True prevalences, one row per test bag
    prev_true = load_prevalences(public_dir + 'test_prevalences.txt')

    # Collect one table per bag and concatenate once at the end: DataFrame.append
    # no longer exists in pandas >= 2.0 and made the loop quadratic.
    edy_frames = []
    ovr_frames = []
    for n_bag in range(n_bags):
        # Use the same bag id for the sample file and the prevalence row; the
        # sample file previously ignored bag_inicial, which mismatched the two
        # whenever bag_inicial != 0.
        bag_id = bag_inicial + n_bag
        X_test = load_testing_bag(public_dir + 'test_samples/' + str(bag_id) + '.txt')

        probs_test = estimator_test.predict_proba(X_test)
        probs_test_ovr = ovr_estimator.predict_proba(X_test)
        true_prev = prev_true[bag_id, :]

        prev_edy = edy.predict(X=None, predictions_test=probs_test)
        edy_frames.append(_bag_frame(true_prev, prev_edy, bag_id, 'EDy'))

        prev_ovr_edy = ovr_edy.predict(X=None, predictions_test=probs_test_ovr)
        ovr_frames.append(_bag_frame(true_prev, prev_ovr_edy, bag_id, 'OVR-EDY'))

    # Export. The file name uses n_bags rather than the leaked loop variable,
    # which was undefined when no bag was processed.
    export_path = Path(export_dir)
    export_path.mkdir(parents=True, exist_ok=True)
    suffix = '-' + str(estimator_name) + '-n_bags-' + str(n_bags) + '.csv'
    _stack_bags(edy_frames).to_csv(export_path / ('EDY' + suffix))
    _stack_bags(ovr_frames).to_csv(export_path / ('OVR_EDY' + suffix))

In [23]:
# Run the pipeline on dataset T1B with the calibrated logistic regression (CLLR)
# over the first 100 test bags. master_seed=2022 overrides the function default (2032).
main(path='/Users/abran.yuen/Desktop/tfm_dataset/', dataset='T1B',  estimator_name='CLLR', n_bags=100, bag_inicial=0, master_seed=2022)

Fitting Training Estimator
Training Estimator fitted
Prediction_train computed
edy fitted
ovr_edy fitted
Fitting Estimator Test
Estimator test fitted
Fitting ovr_estimator Test
ovr_estimator test fitted


In [24]:
# Same run with the plain logistic regression (LR), using the same seed and
# bags as the CLLR run.
main(path='/Users/abran.yuen/Desktop/tfm_dataset/', dataset='T1B',  estimator_name='LR', n_bags=100, bag_inicial=0, master_seed=2022)

Fitting Training Estimator
Training Estimator fitted
Prediction_train computed
edy fitted
ovr_edy fitted
Fitting Estimator Test
Estimator test fitted
Fitting ovr_estimator Test
ovr_estimator test fitted


## <font color="#CA7868"> MAE </font>

In [32]:
import pandas as pd

# MAE
def absolute_error(prevs, prevs_hat):
    """Mean absolute difference between true and predicted prevalences.

    The average is taken over the last axis. Shapes are not validated here,
    so the usual array broadcasting rules apply to the subtraction.
    """
    deviation = prevs_hat - prevs
    return abs(deviation).mean(axis=-1)

# RAE (relative absolute error)
def relative_absolute_error(p, p_hat, eps=None):
    """Mean relative absolute error between true and estimated prevalences.

    Both vectors are additively smoothed with ``eps`` before the relative
    error ``|p - p_hat| / p`` is averaged over the last axis.

    Parameters
    ----------
    p : numpy.ndarray
        True prevalences.
    p_hat : numpy.ndarray
        Estimated prevalences.
    eps : float, optional
        Smoothing constant. ``None`` (the default) disables smoothing; note
        that without smoothing a zero entry in ``p`` leads to a division by
        zero.

    Returns
    -------
    The mean relative absolute error over the last axis.
    """
    # The declared default eps=None previously crashed with a TypeError inside
    # the smoothing step; interpret it as "no smoothing" (eps = 0 is the identity).
    if eps is None:
        eps = 0.0

    def __smooth(prevs, epsilon):
        # The size of the last axis is taken as the number of classes.
        n_classes = prevs.shape[-1]
        return (prevs + epsilon) / (epsilon * n_classes + 1)

    p = __smooth(p, eps)
    p_hat = __smooth(p_hat, eps)
    return (abs(p - p_hat) / p).mean(axis=-1)

# Preprocesado para calcular errores
def prepro(df):
    """Return the ``REAL`` and ``PREDICHAS`` columns of ``df`` as a pair of NumPy arrays."""
    return tuple(df[column].to_numpy() for column in ('REAL', 'PREDICHAS'))

# Load the two tables exported by the LR run (EDy and OVR-EDy, 100 bags each).
# NOTE(review): absolute, user-specific paths; they must match the export
# directory used when the files were written.
EDY=pd.read_csv('/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/EDY-LR-n_bags-100.csv')  
OVR_EDY=pd.read_csv('/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/OVR_EDY-LR-n_bags-100.csv')  

In [33]:
# Sanity check: both exports must have the same dimensions to be comparable.
if EDY.shape == OVR_EDY.shape:
    print('EDY  y OVR_EDY   tienen misma dimensión')
else:
    print('EDY y OVR_EDY necesita preprocesado')

EDY  y OVR_EDY   tienen misma dimensión


#### <font color="#CA7868"> MAE para LR </font>

In [34]:
# Unpack each export once into its true / predicted prevalence vectors.
REALES_EDY, PREDICHAS_EDY = prepro(EDY)
REALES_OVR_EDY, PREDICHAS_OVR_EDY = prepro(OVR_EDY)

# Mean absolute error over every (bag, class) entry of each export.
mae_edy = absolute_error(REALES_EDY, PREDICHAS_EDY)
mae_ovr_edy = absolute_error(REALES_OVR_EDY, PREDICHAS_OVR_EDY)

print('================= MAE =================')
print('EDY     ', mae_edy)
print('OVR_EDY ', mae_ovr_edy)
print(' ')

EDY      0.010721981250491652
OVR_EDY  0.057056103575760166
 


In [35]:
# The metric computed here is the relative absolute error (RAE), not an RMSE,
# so the banner is labelled accordingly.
print('================= RAE =================')
print('EDY     ',relative_absolute_error(REALES_EDY,PREDICHAS_EDY,eps=0.0005))
print('OVR_EDY ',relative_absolute_error(REALES_OVR_EDY,PREDICHAS_OVR_EDY,eps=0.0005))
print(' ')

EDY      1.0057369181159062
OVR_EDY  1.7664976039742546
 


#### <font color="#CA7868"> MAE para CLLR</font>

In [36]:
# Load the two tables exported by the CLLR run. This rebinds EDY / OVR_EDY, so
# the cells below report CLLR results from here on.
# NOTE(review): absolute, user-specific paths.
EDY=pd.read_csv('/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/EDY-CLLR-n_bags-100.csv')  
OVR_EDY=pd.read_csv('/Users/abran.yuen/00_FINAL_TFM/00_Real_Data_Export/OVR_EDY-CLLR-n_bags-100.csv')  

In [37]:
# Unpack each export once into its true / predicted prevalence vectors.
REALES_EDY, PREDICHAS_EDY = prepro(EDY)
REALES_OVR_EDY, PREDICHAS_OVR_EDY = prepro(OVR_EDY)

# Mean absolute error over every (bag, class) entry of each export.
mae_edy = absolute_error(REALES_EDY, PREDICHAS_EDY)
mae_ovr_edy = absolute_error(REALES_OVR_EDY, PREDICHAS_OVR_EDY)

print('================= MAE =================')
print('EDY     ', mae_edy)
print('OVR_EDY ', mae_ovr_edy)
print(' ')

EDY      0.012773802401182057
OVR_EDY  0.02278372939662762
 


In [38]:
# The metric computed here is the relative absolute error (RAE), not an RMSE,
# so the banner is labelled accordingly.
print('================= RAE =================')
print('EDY     ',relative_absolute_error(REALES_EDY,PREDICHAS_EDY,eps=0.0005))
print('OVR_EDY ',relative_absolute_error(REALES_OVR_EDY,PREDICHAS_OVR_EDY,eps=0.0005))
print(' ')

EDY      1.2763288283981367
OVR_EDY  2.789585019008503
 


In [39]:
#FIN