In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,roc_auc_score

import matplotlib.pyplot as plt

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [50]:
def dame_train_test(c, mu_1, corr_1, corr_2, cant_ruido, proporcion):    
    
    n1 = 100
    ruido0,ruido1 = {},{}
    cov = np.identity(3)
    cov[1,0], cov[0,1]=corr_1, corr_1
    cov2 = cov
    if (corr_1 != corr_2):
        cov2 = np.identity(3)
        cov2[1,0], cov2[0,1]=corr_2, corr_2
    
    poblacion0 = pd.DataFrame(np.random.multivariate_normal([mu_1, mu_1, 1],cov,n1))
    poblacion0["y"] = 0
    
    poblacion1 = pd.DataFrame(np.random.multivariate_normal([c*mu_1,c*mu_1,1],cov2,int(proporcion * n1)))
    poblacion1["y"] = 1

    # meto ruido
    if (cant_ruido!=0):
        for r in range(cant_ruido):
            ruido0[r] = np.random.normal(loc=1, scale=1, size=poblacion0.shape[0])
            ruido1[r] = np.random.normal(loc=1, scale=1, size=poblacion1.shape[0])
        

        poblacion0 = pd.concat([poblacion0,pd.DataFrame(ruido0)], axis = 1)
        poblacion1 = pd.concat([poblacion1,pd.DataFrame(ruido1)], axis = 1)

        
    data = pd.concat([poblacion0,poblacion1], axis = 0)
    
    X = data.drop('y', axis=1)
    y = data.y
    X_train,X_test, y_train, y_test=train_test_split(X, y, test_size=0.5, random_state=0)

    return X_train,X_test, y_train, y_test

In [3]:
def dame_metricas(X_train, X_test, y_train, y_test):
    
    # Entrenamos los modelos
    lr = LogisticRegression(random_state = 0).fit(X_train, y_train)
    lda = LinearDiscriminantAnalysis().fit(X_train, y_train)
    qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
    gnb = GaussianNB().fit(X_train, y_train)
    tree = DecisionTreeClassifier(random_state = 0).fit(X_train, y_train)
    bagging = BaggingClassifier(base_estimator = DecisionTreeClassifier(), random_state = 0).fit(X_train, y_train)
    rf = RandomForestClassifier(random_state = 0).fit(X_train, y_train)
    boosting = GradientBoostingClassifier(random_state = 0).fit(X_train, y_train)
    knn = KNeighborsClassifier().fit(X_train, y_train)
    svm_lineal = SVC(random_state = 0, kernel = "linear").fit(X_train, y_train)
    svm_cuadratico = SVC(random_state = 0, kernel = "poly", degree = 2).fit(X_train, y_train)
    svm_radial = SVC(random_state = 0, kernel = "rbf").fit(X_train, y_train)

    
    
    # Generamos las metricas para cada modelo
    accuracy_lr = accuracy_score(y_test, lr.predict(X_test))
    # precision_lr = precision_score(y_test, lr.predict(X_test))
    # recall_lr = recall_score(y_test, lr.predict(X_test))
    f1_lr = f1_score(y_test, lr.predict(X_test))
    auc_lr=roc_auc_score(y_test,lr.predict(X_test))
    
    accuracy_lda = accuracy_score(y_test, lda.predict(X_test))
    # precision_lda = precision_score(y_test, lda.predict(X_test))
    # recall_lda = recall_score(y_test, lda.predict(X_test))
    f1_lda = f1_score(y_test, lda.predict(X_test))
    auc_lda=roc_auc_score(y_test,lda.predict(X_test))
    
    accuracy_qda = accuracy_score(y_test, qda.predict(X_test))
    # precision_qda = precision_score(y_test, qda.predict(X_test))
    # recall_qda = recall_score(y_test, qda.predict(X_test))
    f1_qda = f1_score(y_test, qda.predict(X_test))
    auc_qda=roc_auc_score(y_test,qda.predict(X_test))
    
    accuracy_gnb = accuracy_score(y_test, gnb.predict(X_test))
    # precision_gnb = precision_score(y_test, gnb.predict(X_test))
    # recall_gnb = recall_score(y_test, gnb.predict(X_test))
    f1_gnb = f1_score(y_test, gnb.predict(X_test))
    auc_gnb=roc_auc_score(y_test,gnb.predict(X_test))
    
    accuracy_tree = accuracy_score(y_test, tree.predict(X_test))
    # precision_tree = precision_score(y_test, tree.predict(X_test))
    # recall_tree = recall_score(y_test, tree.predict(X_test))
    f1_tree = f1_score(y_test, tree.predict(X_test))
    auc_tree=roc_auc_score(y_test,tree.predict(X_test))
    
    
    accuracy_bag = accuracy_score(y_test, bagging.predict(X_test))
    # precision_bag = precision_score(y_test, bagging.predict(X_test))
    # recall_bag = recall_score(y_test, bagging.predict(X_test))
    f1_bag = f1_score(y_test, bagging.predict(X_test))
    auc_bag=roc_auc_score(y_test,bagging.predict(X_test))
    
    accuracy_rf = accuracy_score(y_test, rf.predict(X_test))
    # precision_rf = precision_score(y_test, rf.predict(X_test))
    # recall_rf = recall_score(y_test, rf.predict(X_test))
    f1_rf = f1_score(y_test, rf.predict(X_test))  
    auc_rf=roc_auc_score(y_test,rf.predict(X_test))
    
    accuracy_boost = accuracy_score(y_test, boosting.predict(X_test))
    # precision_boost = precision_score(y_test, boosting.predict(X_test))
    # recall_boost = recall_score(y_test, boosting.predict(X_test))
    f1_boost = f1_score(y_test, boosting.predict(X_test)) 
    auc_boost=roc_auc_score(y_test,boosting.predict(X_test))
    
    accuracy_knn = accuracy_score(y_test, knn.predict(X_test))
    # precision_knn = precision_score(y_test, knn.predict(X_test))
    # recall_knn = recall_score(y_test, knn.predict(X_test))
    f1_knn = f1_score(y_test, knn.predict(X_test)) 
    auc_knn=roc_auc_score(y_test,knn.predict(X_test))
    
    accuracy_svm_l = accuracy_score(y_test, svm_lineal.predict(X_test))
    # precision_svm_l = precision_score(y_test, svm_lineal.predict(X_test))
    # recall_svm_l = recall_score(y_test, svm_lineal.predict(X_test))
    f1_svm_l = f1_score(y_test, svm_lineal.predict(X_test))    
    auc_svm_l=roc_auc_score(y_test,svm_lineal.predict(X_test))
    
    accuracy_svm_poly = accuracy_score(y_test, svm_cuadratico.predict(X_test))
    # precision_svm_poly = precision_score(y_test, svm_cuadratico.predict(X_test))
    # recall_svm_poly = recall_score(y_test, svm_cuadratico.predict(X_test))
    f1_svm_poly = f1_score(y_test, svm_cuadratico.predict(X_test)) 
    auc_svm_poly=roc_auc_score(y_test,svm_cuadratico.predict(X_test))
    
    accuracy_svm_radial = accuracy_score(y_test, svm_radial.predict(X_test))
    # precision_svm_radial = precision_score(y_test, svm_radial.predict(X_test))
    # recall_svm_radial = recall_score(y_test, svm_radial.predict(X_test))
    f1_svm_radial = f1_score(y_test, svm_radial.predict(X_test)) 
    auc_svm_radial=roc_auc_score(y_test,svm_radial.predict(X_test))

    
    return auc_lr, auc_lda, auc_qda, auc_gnb, auc_tree, auc_bag, auc_rf, auc_boost, auc_knn, auc_svm_l, auc_svm_poly,auc_svm_radial,accuracy_lr, accuracy_lda, accuracy_qda, accuracy_gnb, accuracy_tree, accuracy_bag, accuracy_rf,accuracy_boost, accuracy_knn, accuracy_svm_l, accuracy_svm_poly, accuracy_svm_radial,f1_lr, f1_lda, f1_qda, f1_gnb, f1_tree, f1_bag, f1_rf, f1_boost, f1_knn, f1_svm_l, f1_svm_poly, f1_svm_radial
    

In [39]:
#Vamos a suponer el centro de la poblacion1 en "5"
mu_1 = 5
n_rep = 50

In [5]:
def dame_tabla_esperanzas(c = 1.2, mu_1 = mu_1, corr_1 = 0, corr_2 = 0, n_rep=20, cant_ruido = 0, proporcion = 1):
    auc_lr, auc_lda, auc_qda, auc_gnb, auc_tree, auc_bag, auc_rf, auc_boost, auc_knn, auc_svm_l, auc_svm_poly, auc_svm_radial = {},{},{},{},{},{},{},{},{},{},{},{}
    accuracy_lr, accuracy_lda, accuracy_qda, accuracy_gnb, accuracy_tree, accuracy_bag, accuracy_rf, accuracy_boost, accuracy_knn, accuracy_svm_l, accuracy_svm_poly, accuracy_svm_radial = {},{},{},{},{},{},{},{},{},{},{},{}
    f1_lr, f1_lda, f1_qda, f1_gnb, f1_tree, f1_bag, f1_rf, f1_boost, f1_knn, f1_svm_l, f1_svm_poly, f1_svm_radial = {},{},{},{},{},{},{},{},{},{},{},{}
    
    for i in tqdm(range(n_rep)):    
        X_train,X_test, y_train, y_test = dame_train_test(c = c, mu_1 = mu_1, corr_1 = corr_1, corr_2 = corr_2, cant_ruido = cant_ruido, proporcion = proporcion)
        auc_lr[i], auc_lda[i], auc_qda[i], auc_gnb[i], auc_tree[i], auc_bag[i], auc_rf[i], auc_boost[i], auc_knn[i], auc_svm_l[i], auc_svm_poly[i], auc_svm_radial[i], accuracy_lr[i], accuracy_lda[i], accuracy_qda[i], accuracy_gnb[i], accuracy_tree[i], accuracy_bag[i], accuracy_rf[i], accuracy_boost[i], accuracy_knn[i], accuracy_svm_l[i], accuracy_svm_poly[i], accuracy_svm_radial[i],f1_lr[i], f1_lda[i], f1_qda[i], f1_gnb[i], f1_tree[i], f1_bag[i], f1_rf[i], f1_boost[i], f1_knn[i], f1_svm_l[i], f1_svm_poly[i], f1_svm_radial[i] =dame_metricas(X_train, X_test, y_train, y_test)

    tabla_metricas = pd.DataFrame({"Accuracy":[np.array(list(accuracy_lr.values())).mean(), np.array(list(accuracy_lda.values())).mean(), np.array(list(accuracy_qda.values())).mean(), np.array(list(accuracy_gnb.values())).mean(), np.array(list(accuracy_tree.values())).mean(), np.array(list(accuracy_bag.values())).mean(), np.array(list(accuracy_rf.values())).mean(), np.array(list(accuracy_boost.values())).mean(), np.array(list(accuracy_knn.values())).mean(), np.array(list(accuracy_svm_l.values())).mean(),np.array(list(accuracy_svm_poly.values())).mean(), np.array(list(accuracy_svm_radial.values())).mean()],
                                   "F1_score":[np.array(list(f1_lr.values())).mean(), np.array(list(f1_lda.values())).mean(), np.array(list(f1_qda.values())).mean(), np.array(list(f1_gnb.values())).mean(), np.array(list(f1_tree.values())).mean(), np.array(list(f1_bag.values())).mean(), np.array(list(f1_rf.values())).mean(), np.array(list(f1_boost.values())).mean(), np.array(list(f1_knn.values())).mean(), np.array(list(f1_svm_l.values())).mean(),np.array(list(f1_svm_poly.values())).mean(), np.array(list(f1_svm_radial.values())).mean()],
                                   "AUC":[np.array(list(auc_lr.values())).mean(), np.array(list(auc_lda.values())).mean(), np.array(list(auc_qda.values())).mean(), np.array(list(auc_gnb.values())).mean(), np.array(list(auc_tree.values())).mean(), np.array(list(auc_bag.values())).mean(), np.array(list(auc_rf.values())).mean(), np.array(list(auc_boost.values())).mean(), np.array(list(auc_knn.values())).mean(), np.array(list(auc_svm_l.values())).mean(),np.array(list(auc_svm_poly.values())).mean(), np.array(list(auc_svm_radial.values())).mean()]}).transpose()
    tabla_metricas.columns = ["LOGIST. REG.","LDA", "QDA", "GNB", "CART", "BAGGING","RANDOM FOREST", "XGBOOST", "KNN", "SVM lineal(SVC)", "SVM CUADRATICO", "SVM RADIAL"]
    return tabla_metricas 

# 1)

* PRIMERAS DOS VARIABLES INFORMATICAS, LA 3ERA NO INFORMATIVA

* RO = 0 para ambas poblaciones, misma varianza en ambas poblaciones
* Distribucion Normal de las features.
* Misma varianza en cada categoria


#### **c = 1.2**

In [6]:
dame_tabla_esperanzas(c = 1.2, mu_1 = mu_1, corr_1 = 0, corr_2 = 0, n_rep=n_rep)

100%|█████████████████████████████████████████| 500/500 [02:29<00:00,  3.34it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.74926,0.74898,0.74286,0.7475,0.66192,0.6933,0.71336,0.69772,0.71484,0.74712,0.74642,0.74772
F1_score,0.752469,0.752327,0.746375,0.751277,0.666462,0.680766,0.712973,0.701622,0.717044,0.749774,0.745205,0.747978
AUC,0.750904,0.750611,0.744464,0.749046,0.663206,0.697764,0.715684,0.699152,0.71656,0.748834,0.749046,0.749993


*Observamos que al estar mas cerca las 2 poblaciones, a los clasificadores, se les dificulta mucho mas distinguir entre una y otra poblacion*

#### **c = 2**

In [7]:
dame_tabla_esperanzas(c = 2, mu_1 = mu_1, corr_1 = 0, corr_2 = 0, n_rep=n_rep)

100%|█████████████████████████████████████████| 500/500 [02:18<00:00,  3.60it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.99976,0.99974,0.99956,0.99974,0.99132,0.99418,0.99628,0.99214,0.99974,0.9995,0.9995,0.99966
F1_score,0.999773,0.999755,0.999585,0.999755,0.991797,0.994444,0.996479,0.99256,0.999755,0.999528,0.999526,0.999679
AUC,0.999762,0.99974,0.999558,0.999738,0.991342,0.994408,0.99632,0.992175,0.99974,0.999499,0.999514,0.999658


#### **c = 10**

In [8]:
dame_tabla_esperanzas(c = 10, mu_1 = mu_1, corr_1 = 0, corr_2 = 0, n_rep=n_rep)

100%|█████████████████████████████████████████| 500/500 [02:19<00:00,  3.58it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
F1_score,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
AUC,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### **c = 100**

In [9]:
dame_tabla_esperanzas(c = 100, mu_1 = mu_1, corr_1 = 0, corr_2 = 0, n_rep=n_rep)

100%|█████████████████████████████████████████| 500/500 [02:20<00:00,  3.56it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
F1_score,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
AUC,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


*Todos los clasificadores distinguen sin dificultades a los individuos de cada poblacion*

##### **A partir de ahora trabajaremos con una distancia entre los centro c=1.2, para poder forzar a los clasificadores y ver mas claramente como se comportan al ir modificando parametros de las poblaciones a clasificar**

In [10]:
c = 1.2

# 2) 
* Mismas distribuciones que en el 1, pero con ro = [0.5, 0.85, 1]. Aumentamos la correlacion entre las poblaciones

* Se mantiene la misma varianza entre las categorias 
* Se mantiene la distribucion normal de las features


In [11]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = 0.5, corr_2 = 0.5, n_rep=n_rep)

100%|█████████████████████████████████████████| 500/500 [02:37<00:00,  3.18it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.70784,0.70722,0.69724,0.7128,0.61796,0.64828,0.67468,0.65312,0.66774,0.7056,0.70314,0.70656
F1_score,0.708999,0.708555,0.697976,0.717627,0.62151,0.630123,0.673151,0.655262,0.668524,0.70662,0.696091,0.70319
AUC,0.709853,0.709196,0.699253,0.714026,0.619248,0.653106,0.677021,0.654767,0.669633,0.707576,0.706648,0.709308


*Vemos que una correlacion del orden del 50%, reduce la performance en un 5% aproximadamente*

In [12]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = 0.85, corr_2 = 0.85, n_rep=n_rep)

100%|█████████████████████████████████████████| 500/500 [02:30<00:00,  3.32it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.68822,0.68638,0.67422,0.69438,0.60412,0.62708,0.65098,0.6287,0.64726,0.68536,0.68314,0.68786
F1_score,0.686624,0.685425,0.673458,0.699027,0.609556,0.604981,0.646683,0.62917,0.648339,0.684394,0.671154,0.682326
AUC,0.690703,0.688755,0.676452,0.69558,0.605142,0.632267,0.653668,0.630554,0.649075,0.687663,0.68738,0.69078


In [13]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = 1, corr_2 = 1, n_rep=n_rep)

100%|█████████████████████████████████████████| 500/500 [02:30<00:00,  3.33it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.6812,0.68122,0.65938,0.68502,0.59514,0.6127,0.6291,0.61796,0.6403,0.68024,0.67488,0.67716
F1_score,0.678859,0.678876,0.658396,0.689281,0.598334,0.587904,0.624958,0.617533,0.63823,0.677138,0.660492,0.668189
AUC,0.683791,0.683815,0.660162,0.686326,0.596494,0.61809,0.631675,0.619899,0.642595,0.682895,0.679455,0.680637


*Observamos que al tener una correlacion perfecta entre las 2 poblaciones, nos queda una matriz de correlacion no inversible*

*Observamos que al haber colinealidad entre las poblaciones, la performance de los clasificadores, decae un poco, en torno al 5/7%, para nuestro caso*

*Los mas afectados son CART y los ensambles que utilizan arboles en su metodo de clasificacion*

Esto ocurre debido a que al haber una alta correlacion entre las features de cada una de las poblaciones, luego del algoritmo realizar la primer seleccion de variable, la siguiente, pierde una gran cantidad de poder de clasificacion, ya que el anterior corte, utilizo la misma informacion para separar, y lo que no pudo separar la anterior, le costara mucho mas a la siguiente, si estan correlacionadas.

# 3)
* Misma distribucion que el item 1, pero ahora con diferentes matrices de correlacion.
* Distinta varianza de cada categoria (Cov_poblacion1(X1,X2,X3) != Cov_poblacion2(X1,X2,X3))
* Se mantiene la distribucion normal de las features

In [14]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = 0.05, corr_2 = 0.95, n_rep=n_rep)

100%|█████████████████████████████████████████| 500/500 [02:30<00:00,  3.32it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.70936,0.708,0.81756,0.71304,0.71524,0.74066,0.75082,0.74488,0.7504,0.7081,0.71714,0.72454
F1_score,0.70608,0.702085,0.828242,0.707539,0.722391,0.732243,0.749077,0.748148,0.76809,0.696129,0.700384,0.70427
AUC,0.712294,0.711449,0.816593,0.716375,0.715943,0.74472,0.753507,0.746439,0.748284,0.712637,0.722688,0.730743


In [40]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = -0.5, corr_2 = 0.9, n_rep=n_rep)

100%|███████████████████████████████████████████| 50/50 [00:14<00:00,  3.39it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.7504,0.7476,0.8278,0.7496,0.7656,0.7874,0.7946,0.7916,0.7914,0.7504,0.7622,0.772
F1_score,0.747333,0.735886,0.831779,0.739454,0.775348,0.783418,0.79314,0.795443,0.79998,0.733538,0.74589,0.748981
AUC,0.753617,0.752637,0.828996,0.754283,0.765428,0.790835,0.797459,0.792943,0.791285,0.756459,0.768386,0.779751


In [38]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = 0.4, corr_2 = 0.6, n_rep=n_rep)

100%|█████████████████████████████████████████| 500/500 [02:32<00:00,  3.27it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.70736,0.70626,0.7007,0.7113,0.62472,0.65366,0.67648,0.65696,0.67284,0.70492,0.70282,0.70514
F1_score,0.707765,0.705882,0.697865,0.713261,0.630103,0.634343,0.672741,0.657521,0.676507,0.703654,0.693918,0.697932
AUC,0.709516,0.708574,0.703457,0.713091,0.62579,0.658757,0.679204,0.658876,0.674245,0.707315,0.706652,0.708595


*Al tener distintas matrices de correlacion, y valores altos de correlacion entre features, vemos que el mas afectado es CART, por lo mencionado en el ejercicio anterior*

*Notamos que si bien no se cumple uno de los supuestos de LDA, clasifica bastante bien las categorias. Sin embargo, QDA performa hasta un 10% mejor en nuestro caso mas critico, dado que no necesita cumplir el supuesto de que ambas categorias tengan la misma varianza.*

# 4)
* Matrices de correlacion DISTINTAS (mismas que en el item 3)
* Metiendo ruido (variables no informativas p2 = 10, 50, 100)
* Distribucion de las features de cada poblacion normal

In [17]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = 0.05, corr_2 = 0.95, n_rep=n_rep, cant_ruido = 10)

100%|█████████████████████████████████████████| 500/500 [02:31<00:00,  3.30it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.67538,0.67166,0.73806,0.68726,0.6715,0.71054,0.71506,0.72098,0.6521,0.67334,0.68996,0.69316
F1_score,0.677282,0.670707,0.730496,0.685261,0.676718,0.689776,0.70338,0.720437,0.661385,0.670884,0.672149,0.674507
AUC,0.677243,0.674052,0.742002,0.689894,0.672612,0.716615,0.719649,0.723352,0.652354,0.675967,0.695389,0.698822


In [18]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = 0.05, corr_2 = 0.95, n_rep=n_rep, cant_ruido = 50)

100%|█████████████████████████████████████████| 500/500 [04:20<00:00,  1.92it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.61114,0.59362,0.54126,0.64042,0.65608,0.69338,0.6819,0.70348,0.5922,0.59646,0.64244,0.65124
F1_score,0.614938,0.598477,0.614857,0.63696,0.660544,0.664331,0.6568,0.700818,0.589433,0.601368,0.627085,0.619742
AUC,0.612475,0.594783,0.530639,0.64306,0.657359,0.700458,0.688339,0.706207,0.593896,0.597578,0.646929,0.658189


In [19]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = 0.05, corr_2 = 0.95, n_rep=n_rep, cant_ruido = 100)

100%|█████████████████████████████████████████| 500/500 [05:24<00:00,  1.54it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.60424,0.5295,0.53206,0.61406,0.65674,0.6811,0.64656,0.70188,0.57082,0.59022,0.62194,0.6266
F1_score,0.607782,0.537588,0.561765,0.608473,0.66139,0.649493,0.606435,0.699091,0.562637,0.596397,0.61143,0.570756
AUC,0.605672,0.530099,0.529111,0.616834,0.657963,0.6883,0.654566,0.70469,0.572987,0.591226,0.625585,0.636186


*Se puede observar el trade off sesgo varianza en funcion del kernel que utiliza SVM, en el caso de un kernel lineal (Support Vector Classifier), se asemeja mas a LDA y Logistic Regression, metodos que tienen mas sesgo que varianza, y la eleccion de un kernel RADIAL, se asemeja mas a un KNN, donde el clasificador mira de manera local en ambos casos*

*Tambien notamos que clasificadores que dependen de toda la muestra en general para clasificar, como QDA y LDA, se rompen mucho mas rapido con variables basura, mientras que clasificadores que son capaces de mirar de manera mas local y seleccionar entre variables, son mas robustos a estos problemas en su forma de clasificar*

*La adicion de muchas variables ruido, afecto de manera MUY significativa a LDA, QDA y KNN, llevando a los clasificadores a tener casi el mismo efecto que tirar una moneda. Tambien notamos como los ensambles y los algoritmos que utilizan CART son los que mejor se comportan y logran distinguir lo que es util de lo que no para clasificar.*

# 5)

* Matrices de correlacion DISTINTAS (mismas que en el item 3)
* Metiendo desbalance entre categorias(n1 != n2) con proporcion = [0.1, 0.25, 0.3, 0.5] (n1 = proporcion * n2)
* Distribucion de las features de cada poblacion normal


DATA IMBALANCEADA, `UTILIZAMOS F1-SCORE`, COMO METRICA, LA CUAL PONDERA RECALL/SENSITIVITY (CANTIDAD DE FALSOS NEGATIVOS) Y PRECISION (CANTIDAD DE FALSOS POSITIVOS)

In [20]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = 0.5, corr_2 = 0.9, n_rep=n_rep, proporcion = 0.1)

100%|█████████████████████████████████████████| 500/500 [01:56<00:00,  4.28it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.925818,0.922145,0.915127,0.906327,0.850691,0.893018,0.918145,0.874582,0.918145,0.925164,0.925236,0.927455
F1_score,0.148602,0.1634,0.204544,0.23522,0.199674,0.190188,0.168576,0.194018,0.123515,0.062735,0.073454,0.0076
AUC,0.552436,0.55898,0.573397,0.598373,0.578279,0.570922,0.559358,0.574343,0.542309,0.523975,0.527701,0.502402


In [21]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = 0.5, corr_2 = 0.9, n_rep=n_rep, proporcion = 0.25)

100%|█████████████████████████████████████████| 500/500 [01:58<00:00,  4.22it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.842254,0.837841,0.841397,0.815302,0.748317,0.811683,0.828,0.79019,0.813683,0.838889,0.842794,0.848635
F1_score,0.368015,0.375852,0.447675,0.427363,0.357075,0.367485,0.388354,0.378859,0.387263,0.250527,0.273056,0.178986
AUC,0.629017,0.634021,0.67167,0.668979,0.626409,0.629345,0.640098,0.637342,0.641487,0.589534,0.595911,0.558979


In [22]:
dame_tabla_esperanzas(c = c, mu_1 = mu_1, corr_1 = 0.5, corr_2 = 0.9, n_rep=n_rep, proporcion = 0.5)

100%|█████████████████████████████████████████| 500/500 [02:28<00:00,  3.37it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.74328,0.740213,0.7684,0.737253,0.668213,0.716667,0.73272,0.70168,0.720747,0.739253,0.742667,0.744987
F1_score,0.536589,0.536345,0.611015,0.576907,0.507304,0.505149,0.54227,0.523028,0.558804,0.499875,0.496944,0.463205
AUC,0.672485,0.671355,0.71789,0.692495,0.633591,0.649409,0.672088,0.652882,0.67813,0.660083,0.658292,0.647115


*mirando f1-score*

*Observamos que desbalances monstruosos (10 a 1), destruyen a los clasificadores, ya que terminan identificando todo como clase mayoritaria.*

*Los clasificadores del grupo SVM, son los mas afectados por desbalance entre categorias. QDA y NB, parece lograr muy buenos resultados en casos de desbalance entre clases,* 

*Si bien, no se observa notoriamente, Boosting deberia conseguir mejorar un poco el desbalance por su metodo de reasignacion de pesos a las observaciones mal clasificadas*

*Random Forest se confunde, ya que en la busqueda greedy del algoritmo, por conseguir nodos mas puros(intentando mejorar la métrica correspondiente), terminaría ignorando a la clase minoritaria, debido a que solo serian pocas observaciones contra la clase mayoritaria. Para solucionar esto, habría que ponerle una penalidad mayor para la métrica que utiliza en la separación, a clasificar mal una observación de la clase minoritaria.*

# 6) 
* Las features tienen distribucion Normal.
* Hay 2 features informativas, 1 feature ruido.
* Alpha = 0.5 (Misma cantidad de observaciones por categoria)
* rho 1 == rho 2 = 0

In [23]:
def dame_train_test_radial(p1, p2, n, alfa, rho1, rho2):

    c1 = np.sqrt(-2*np.log(1-alfa))
    matriz_correlacion1 = np.ones([p1, p1])*rho1
    matriz_correlacion2 = np.ones([p2, p2])*rho2
    np.fill_diagonal(matriz_correlacion1, np.ones(matriz_correlacion1.shape[0]))  
    np.fill_diagonal(matriz_correlacion2, np.ones(matriz_correlacion2.shape[0]))  

    X1=pd.DataFrame(np.random.multivariate_normal(np.zeros(p1),matriz_correlacion1,n))
    X2=pd.DataFrame(np.random.multivariate_normal(np.zeros(p2),matriz_correlacion2,n))
    X2["y"]=(X1**2).apply(lambda x: 0 if sum(x) > c1 else 1, axis = 1)

    data = pd.concat([X1, X2], axis = 1)
    X = data.drop('y', axis=1)
    y = data.y
    X_train,X_test, y_train, y_test=train_test_split(X, y, test_size=0.5, random_state=0)
    
    return X_train,X_test, y_train, y_test

In [24]:
def dame_tabla_esperanzas_radial(p1=2, p2=1, n=200, alfa=0.5, rho1 = 0, rho2 = 0):
    auc_lr, auc_lda, auc_qda, auc_gnb, auc_tree, auc_bag, auc_rf, auc_boost, auc_knn, auc_svm_l, auc_svm_poly, auc_svm_radial = {},{},{},{},{},{},{},{},{},{},{},{}
    accuracy_lr, accuracy_lda, accuracy_qda, accuracy_gnb, accuracy_tree, accuracy_bag, accuracy_rf, accuracy_boost, accuracy_knn, accuracy_svm_l, accuracy_svm_poly, accuracy_svm_radial = {},{},{},{},{},{},{},{},{},{},{},{}
    f1_lr, f1_lda, f1_qda, f1_gnb, f1_tree, f1_bag, f1_rf, f1_boost, f1_knn, f1_svm_l, f1_svm_poly, f1_svm_radial = {},{},{},{},{},{},{},{},{},{},{},{}
    
    for i in tqdm(range(n_rep)):    
        X_train,X_test, y_train, y_test = dame_train_test_radial(p1=p1, p2=p2, n=n, alfa=alfa, rho1 = rho1, rho2 = rho2)
        auc_lr[i], auc_lda[i], auc_qda[i], auc_gnb[i], auc_tree[i], auc_bag[i], auc_rf[i], auc_boost[i], auc_knn[i], auc_svm_l[i], auc_svm_poly[i], auc_svm_radial[i], accuracy_lr[i], accuracy_lda[i], accuracy_qda[i], accuracy_gnb[i], accuracy_tree[i], accuracy_bag[i], accuracy_rf[i], accuracy_boost[i], accuracy_knn[i], accuracy_svm_l[i], accuracy_svm_poly[i], accuracy_svm_radial[i],f1_lr[i], f1_lda[i], f1_qda[i], f1_gnb[i], f1_tree[i], f1_bag[i], f1_rf[i], f1_boost[i], f1_knn[i], f1_svm_l[i], f1_svm_poly[i], f1_svm_radial[i] =dame_metricas(X_train, X_test, y_train, y_test)

    tabla_metricas = pd.DataFrame({"Accuracy":[np.array(list(accuracy_lr.values())).mean(), np.array(list(accuracy_lda.values())).mean(), np.array(list(accuracy_qda.values())).mean(), np.array(list(accuracy_gnb.values())).mean(), np.array(list(accuracy_tree.values())).mean(), np.array(list(accuracy_bag.values())).mean(), np.array(list(accuracy_rf.values())).mean(), np.array(list(accuracy_boost.values())).mean(), np.array(list(accuracy_knn.values())).mean(), np.array(list(accuracy_svm_l.values())).mean(),np.array(list(accuracy_svm_poly.values())).mean(), np.array(list(accuracy_svm_radial.values())).mean()],
                                   "F1_score":[np.array(list(f1_lr.values())).mean(), np.array(list(f1_lda.values())).mean(), np.array(list(f1_qda.values())).mean(), np.array(list(f1_gnb.values())).mean(), np.array(list(f1_tree.values())).mean(), np.array(list(f1_bag.values())).mean(), np.array(list(f1_rf.values())).mean(), np.array(list(f1_boost.values())).mean(), np.array(list(f1_knn.values())).mean(), np.array(list(f1_svm_l.values())).mean(),np.array(list(f1_svm_poly.values())).mean(), np.array(list(f1_svm_radial.values())).mean()],
                                   "AUC":[np.array(list(auc_lr.values())).mean(), np.array(list(auc_lda.values())).mean(), np.array(list(auc_qda.values())).mean(), np.array(list(auc_gnb.values())).mean(), np.array(list(auc_tree.values())).mean(), np.array(list(auc_bag.values())).mean(), np.array(list(auc_rf.values())).mean(), np.array(list(auc_boost.values())).mean(), np.array(list(auc_knn.values())).mean(), np.array(list(auc_svm_l.values())).mean(),np.array(list(auc_svm_poly.values())).mean(), np.array(list(auc_svm_radial.values())).mean()]}).transpose()
    tabla_metricas.columns = ["LOGIST. REG.","LDA", "QDA", "GNB", "CART", "BAGGING","RANDOM FOREST", "XGBOOST", "KNN", "SVM lineal(SVC)", "SVM CUADRATICO", "SVM RADIAL"]
    return tabla_metricas 

In [25]:
dame_tabla_esperanzas_radial()

100%|█████████████████████████████████████████| 500/500 [02:32<00:00,  3.29it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.49502,0.49512,0.89242,0.90572,0.91288,0.92414,0.9236,0.92506,0.84456,0.5482,0.92778,0.91528
F1_score,0.245845,0.250175,0.867136,0.882966,0.89983,0.91108,0.91131,0.914898,0.840697,0.222608,0.924063,0.905429
AUC,0.46967,0.470147,0.884118,0.897477,0.910732,0.920713,0.920798,0.924213,0.851959,0.519116,0.932865,0.915577


* Observamos que al tener observaciones de dos categorias distribuidas de manera radial, una dentro de la otra, los clasificadores que tienen mayor sesgo, son los que performan peor. 

* Logistic Regression performa mal, dado que le cuesta encontrar con un hiperplano una separacion de categorias que tienen esta distribucion.

* Para el caso de LDA, si bien se cumple el supuesto de normalidad de las features, la varianza de cada categoria es distinta. Ademas, su metodo de clasificacion se basa en encontrar la distancia que separa el punto medio de cada categoria, midiendo sobre la direccion que maximiza esta distancia y minimiza la varianza. El asunto, es que los centros de ambas categorias estan muy proximos en todas las direcciones "w", que calcule LDA, ya que la geometria es circular.

* Para el caso de SVM (SUPPORT VECTOR CLASSIFIER), sucede lo mismo que con la regresion logistica, le cuesta clasificar data distribuida de manera radial, con un hiperplano.

# 7)
* Las features tienen distribucion Normal.
* Hay 2 features informativas, 1 feature ruido.
* Alpha = 0.5 (Misma cantidad de observaciones por categoria)
* rho 1 = [0.5, 0.95, 1]


In [26]:
dame_tabla_esperanzas_radial(rho1 = 0.5)

100%|█████████████████████████████████████████| 500/500 [02:30<00:00,  3.31it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.49084,0.49012,0.89596,0.92134,0.91644,0.92814,0.92822,0.92668,0.84674,0.5384,0.92152,0.91136
F1_score,0.362428,0.364314,0.88221,0.912344,0.911293,0.922557,0.923685,0.922827,0.853726,0.362985,0.92315,0.909045
AUC,0.485915,0.485322,0.893216,0.919152,0.916182,0.927545,0.927715,0.92663,0.850404,0.533326,0.92394,0.912273


In [27]:
dame_tabla_esperanzas_radial(rho1 = 0.95)

100%|█████████████████████████████████████████| 500/500 [02:28<00:00,  3.38it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.57022,0.56196,0.91244,0.9499,0.94488,0.9479,0.95552,0.9427,0.92408,0.61296,0.95444,0.94606
F1_score,0.650457,0.642407,0.91827,0.953359,0.950097,0.952673,0.959923,0.948254,0.933427,0.689741,0.960438,0.952269
AUC,0.543594,0.537096,0.914275,0.951894,0.943829,0.947662,0.95416,0.941299,0.919083,0.581717,0.949547,0.942783


In [28]:
dame_tabla_esperanzas_radial(rho1 = 1)

100%|█████████████████████████████████████████| 500/500 [02:23<00:00,  3.48it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.57754,0.57746,0.7815,0.94956,0.9902,0.98834,0.98902,0.9902,0.9272,0.62114,0.95466,0.94442
F1_score,0.665636,0.665508,0.786678,0.952709,0.991143,0.989425,0.990092,0.991143,0.935912,0.708021,0.960873,0.950886
AUC,0.547341,0.547424,0.781231,0.95219,0.989935,0.988224,0.988704,0.989935,0.922589,0.585797,0.949447,0.940697


*Aumentar la correlacion de las features informativas, mejora la clasificacion*

# 8)
* Las features tienen distribucion Normal.
* Hay 2 features informativas, p2 = [10, 50, 100] features ruido.
* Alpha = 0.5 (Misma cantidad de observaciones por categoria)
* rho 1 == rho 2 = 0

In [29]:
dame_tabla_esperanzas_radial(p2 = 10)

100%|█████████████████████████████████████████| 500/500 [02:24<00:00,  3.47it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.5058,0.50542,0.7289,0.8374,0.89902,0.91606,0.87118,0.92352,0.60236,0.5099,0.68218,0.63874
F1_score,0.396722,0.401248,0.636446,0.793016,0.883207,0.899854,0.842978,0.912697,0.616491,0.393313,0.637202,0.540096
AUC,0.494204,0.494442,0.711142,0.825859,0.896419,0.911109,0.864558,0.922086,0.615129,0.498548,0.682154,0.626818


In [30]:
dame_tabla_esperanzas_radial(p2 = 50)

100%|█████████████████████████████████████████| 500/500 [04:02<00:00,  2.06it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.50764,0.50712,0.53468,0.7386,0.88764,0.8742,0.71814,0.91834,0.51814,0.50956,0.54628,0.53972
F1_score,0.448659,0.454695,0.139858,0.65929,0.870069,0.837506,0.609773,0.906435,0.490983,0.45512,0.292417,0.293846
AUC,0.502471,0.503058,0.500784,0.723595,0.885155,0.866005,0.702095,0.916613,0.521097,0.505145,0.522283,0.5146


In [31]:
dame_tabla_esperanzas_radial(p2 = 100)

100%|█████████████████████████████████████████| 500/500 [04:58<00:00,  1.67it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.50364,0.50138,0.5054,0.69098,0.8859,0.80992,0.6678,0.91788,0.50706,0.50242,0.54158,0.53748
F1_score,0.43286,0.459468,0.482402,0.587441,0.866234,0.735055,0.494249,0.904508,0.452139,0.439685,0.169651,0.19858
AUC,0.497235,0.499741,0.507441,0.673114,0.882965,0.796251,0.643849,0.915608,0.504516,0.497058,0.506316,0.502794


Aunque cambio la geometria en la que estan distribuidas nuestras categorias, al haber muchas variables ruido, observamos resultados similares que en el caso anterior.

Notamos que clasificadores que dependen de toda la muestra en general para clasificar, como QDA y LDA, se rompen mucho mas rapido con variables basura, mientras que clasificadores que son capaces de mirar de manera mas local y seleccionar entre variables, son mas robustos a estos problemas en su forma de clasificar. 

Los ensambles y los algoritmos que utilizan CART son los que mejor se comportan y logran distinguir lo que es util de lo que no para clasificar. El caso de Random Forest que tiene una menor clasificacion incluso que CART, se debe a la gran proporcion de variables basura respecto a las variables que tienen informacion, que al momento de hacer el sampleo con reposicion en el nodo, lo perjudica.


# 9)

* Las features tienen distribucion Normal.
* Hay 2 features informativas, p2 = 1 features ruido.
* Alpha = [0.1, 0.25, 0.5] (n1!=n2, n1!=n2, n1==n2) [Desbalance muestral]
* rho 1 == rho 2 = 0


DATA DESBALANCEADA, `UTILIZAMOS F1-SCORE`, COMO METRICA, LA CUAL PONDERA RECALL/SENSITIVITY (CANTIDAD DE FALSOS NEGATIVOS) Y PRECISION (CANTIDAD DE FALSOS POSITIVOS)

In [32]:
dame_tabla_esperanzas_radial(alfa = 0.1)

100%|█████████████████████████████████████████| 500/500 [02:31<00:00,  3.31it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.79182,0.79144,0.89652,0.90106,0.93586,0.93636,0.92356,0.94664,0.88894,0.7941,0.82006,0.85718
F1_score,0.001212,0.001385,0.653179,0.665864,0.832163,0.818338,0.76408,0.859884,0.724515,0.0,0.199681,0.459143
AUC,0.498906,0.498711,0.754438,0.761009,0.888129,0.866956,0.825902,0.903085,0.833053,0.5,0.58435,0.689619


In [33]:
dame_tabla_esperanzas_radial(alfa = 0.25)

100%|█████████████████████████████████████████| 500/500 [02:30<00:00,  3.32it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.66444,0.663,0.88654,0.89286,0.92654,0.93322,0.92804,0.93822,0.86664,0.6838,0.91638,0.91012
F1_score,0.018703,0.020755,0.778638,0.790172,0.879306,0.886391,0.875363,0.899394,0.799573,0.001721,0.867785,0.852357
AUC,0.488804,0.488139,0.826578,0.833805,0.910111,0.910552,0.900207,0.925803,0.864007,0.500323,0.922268,0.894143


In [34]:
dame_tabla_esperanzas_radial(alfa = 0.5)

100%|█████████████████████████████████████████| 500/500 [02:29<00:00,  3.34it/s]


Unnamed: 0,LOGIST. REG.,LDA,QDA,GNB,CART,BAGGING,RANDOM FOREST,XGBOOST,KNN,SVM lineal(SVC),SVM CUADRATICO,SVM RADIAL
Accuracy,0.49586,0.4954,0.89354,0.90848,0.9152,0.92512,0.92436,0.9263,0.84002,0.54574,0.92582,0.91456
F1_score,0.257592,0.260623,0.868165,0.886375,0.90312,0.912717,0.9127,0.91622,0.836546,0.235908,0.921834,0.904775
AUC,0.472246,0.472138,0.885638,0.90062,0.913648,0.922202,0.922202,0.925521,0.8482,0.518504,0.931645,0.915492


En casos de desbalance, los clasificadores que mejor llegan a identificar a que grupo pertenecen, son los que tienen mas varianza, los clasificadores con mayor sesgo, Logistic Regression, LDA, SVC, les cuesta mucho clasificar, y su performance es pesima.