In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from Datos import Datos
import EstrategiaParticionado
import Clasificador

from tabulate import tabulate
import numpy as np
import random

from sklearn.preprocessing import OneHotEncoder

# <font color='red'>Entrenamiento y clasificación con Clasificador.py</font>


## <font color='blue'> Dataset tic-tac-toe </font>

In [3]:
dataset_ttt = Datos("../ConjuntosDatos/tic-tac-toe.data")
ndata, ncols = dataset_ttt.datos.shape # nfeat = ncols-1 
print("Numero de ejemplos de entrenamiento:", ndata, ", Numero de columnas:", ncols)

Numero de ejemplos de entrenamiento: 958 , Numero de columnas: 10


In [4]:
np.random.shuffle(dataset_ttt.datos)

NB = Clasificador.ClasificadorNaiveBayes()

### Validación simple

In [5]:
strat_simple = EstrategiaParticionado.ValidacionSimple(0.8)

In [6]:
NB.validacion(strat_simple, dataset_ttt, NB)

[0.29166666666666663]

### Validación simple con varias repeticiones (3 reps)

In [7]:
strat_simple_rep3 = EstrategiaParticionado.ValidacionSimple(0.8, nreps=3)

In [8]:
errores = NB.validacion(strat_simple_rep3, dataset_ttt, NB)
print("Errores:", errores)
print("Error medio:", np.mean(errores), "+-", np.std(errores))

Errores: [0.30208333333333337, 0.27083333333333337, 0.34375]
Error medio: 0.3055555555555556 +- 0.029869184955009107


### Validación Cruzada (5 folds)

In [9]:
strat_cross = EstrategiaParticionado.ValidacionCruzada(k_fold=5)

In [10]:
errores = NB.validacion(strat_cross, dataset_ttt, NB)
print("Errores:", errores)
print("Error medio:", np.mean(errores), "+-", np.std(errores))

Errores: [0.3612565445026178, 0.31937172774869105, 0.23560209424083767, 0.2722513089005235, 0.3036649214659686]
Error medio: 0.29842931937172773 +- 0.042534232485005034


## <font color='blue'> Dataset German </font>

In [11]:
dataset_ger = Datos("../ConjuntosDatos/german.data")
ndata, ncols = dataset_ger.datos.shape # nfeat = ncols-1 
print("Numero de ejemplos de entrenamiento:", ndata, ", Numero de columnas:", ncols)

Numero de ejemplos de entrenamiento: 1000 , Numero de columnas: 21


In [12]:
np.random.shuffle(dataset_ger.datos)

### Validación simple

In [13]:
strat_simple_ger = EstrategiaParticionado.ValidacionSimple(0.8)
NB.validacion(strat_simple_ger, dataset_ger, NB)

[0.19999999999999996]

### Validación simple con varias repeticiones (3 reps)

In [14]:
strat_simple_rep3_ger = EstrategiaParticionado.ValidacionSimple(0.8, nreps=3)
errores = NB.validacion(strat_simple_rep3_ger, dataset_ger, NB)
print("Errores:", errores)
print("Error medio:", np.mean(errores), "+-", np.std(errores))

Errores: [0.24, 0.24, 0.22999999999999998]
Error medio: 0.23666666666666666 +- 0.004714045207910321


### Validación cruzada (5 folds)

In [15]:
strat_cross_ger = EstrategiaParticionado.ValidacionCruzada(k_fold=3)
errores = NB.validacion(strat_cross_ger, dataset_ger, NB)
print("Errores:", errores)
print("Error medio:", np.mean(errores), "+-", np.std(errores))

Errores: [0.26126126126126126, 0.2582582582582582, 0.2492492492492493]
Error medio: 0.25625625625625625 +- 0.005104123637229987


## <font color='green'> Tablas de Resultados </font>

### <font color='grey'> Validación simple </font>

In [16]:
def val_simple(percentage):
    max_reps = 5
    L = []
    for i in range(1, max_reps+1):
        L.append([])
        strat = EstrategiaParticionado.ValidacionSimple(percentage, nreps=i)
        errores_ttt = NB.validacion(strat, dataset_ttt, NB)
        errores_ger = NB.validacion(strat, dataset_ger, NB)
        L[i-1].append(str(i))
        L[i-1].append("%.2f +- %.3f"% (np.mean(errores_ttt), np.std(errores_ttt)))
        L[i-1].append("%.2f +- %.3f"% (np.mean(errores_ger), np.std(errores_ger)))
    print("Validación Simple con " + str(int(percentage*100)) +   "% Train\n")
    print(tabulate(L, headers = ["N_REPS", "TTT_DB", "GER_DB"], tablefmt="grid")) 

In [17]:
val_simple(0.70)

Validación Simple con 70% Train

+----------+---------------+---------------+
|   N_REPS | TTT_DB        | GER_DB        |
|        1 | 0.31 +- 0.000 | 0.26 +- 0.000 |
+----------+---------------+---------------+
|        2 | 0.30 +- 0.005 | 0.23 +- 0.020 |
+----------+---------------+---------------+
|        3 | 0.30 +- 0.022 | 0.27 +- 0.013 |
+----------+---------------+---------------+
|        4 | 0.29 +- 0.020 | 0.26 +- 0.009 |
+----------+---------------+---------------+
|        5 | 0.32 +- 0.037 | 0.26 +- 0.013 |
+----------+---------------+---------------+


In [18]:
val_simple(0.75)

Validación Simple con 75% Train

+----------+---------------+---------------+
|   N_REPS | TTT_DB        | GER_DB        |
|        1 | 0.32 +- 0.000 | 0.26 +- 0.000 |
+----------+---------------+---------------+
|        2 | 0.33 +- 0.010 | 0.25 +- 0.034 |
+----------+---------------+---------------+
|        3 | 0.29 +- 0.039 | 0.26 +- 0.028 |
+----------+---------------+---------------+
|        4 | 0.32 +- 0.020 | 0.25 +- 0.015 |
+----------+---------------+---------------+
|        5 | 0.29 +- 0.014 | 0.26 +- 0.012 |
+----------+---------------+---------------+


In [19]:
val_simple(0.80)

Validación Simple con 80% Train

+----------+---------------+---------------+
|   N_REPS | TTT_DB        | GER_DB        |
|        1 | 0.32 +- 0.000 | 0.24 +- 0.000 |
+----------+---------------+---------------+
|        2 | 0.27 +- 0.016 | 0.25 +- 0.003 |
+----------+---------------+---------------+
|        3 | 0.32 +- 0.032 | 0.22 +- 0.031 |
+----------+---------------+---------------+
|        4 | 0.29 +- 0.049 | 0.26 +- 0.020 |
+----------+---------------+---------------+
|        5 | 0.29 +- 0.025 | 0.27 +- 0.038 |
+----------+---------------+---------------+


### <font color='grey'> Validación cruzada </font>

In [20]:
max_folds = 11
j = 0
L = []
for i in range(3, max_folds+1, 2):
    L.append([])
    strat = EstrategiaParticionado.ValidacionCruzada(k_fold=i)
    errores_ttt = NB.validacion(strat, dataset_ttt, NB)
    errores_ger = NB.validacion(strat, dataset_ger, NB)
    L[j].append(str(i))
    L[j].append("%.2f +- %.3f"% (np.mean(errores_ttt), np.std(errores_ttt)))
    L[j].append("%.2f +- %.3f"% (np.mean(errores_ger), np.std(errores_ger)))
    j+=1
print(tabulate(L, headers = ["K_FOLDS", "TTT_DB", "GER_DB"], tablefmt="grid"))

+-----------+---------------+---------------+
|   K_FOLDS | TTT_DB        | GER_DB        |
|         3 | 0.30 +- 0.014 | 0.25 +- 0.017 |
+-----------+---------------+---------------+
|         5 | 0.30 +- 0.022 | 0.26 +- 0.024 |
+-----------+---------------+---------------+
|         7 | 0.31 +- 0.034 | 0.25 +- 0.052 |
+-----------+---------------+---------------+
|         9 | 0.30 +- 0.047 | 0.25 +- 0.042 |
+-----------+---------------+---------------+
|        11 | 0.30 +- 0.043 | 0.25 +- 0.034 |
+-----------+---------------+---------------+


# <font color='red'>Entrenamiento y clasificación con funciones de sklearn (encapsuladas en Clasificador.py)</font>

In [16]:
# Las funciones de sklearn requieren una codificación de los atributos discretos diferente a la nuestra.
# Por lo tanto, realizamos un pequeño trabajo de preprocesado de datos para que sus funciones de sklearn
# puiedan realizar un entrenamiento y clasificación equiparable al nuestro

def preprocess_data(dataset):
    X = dataset.datos[:,:-1] # all rows, all columns but last one
    Y = dataset.datos[:,-1]  # all rows, just last column (class)
    
    # One hot encoding for discrete features
    enc = OneHotEncoder(sparse=False, categories='auto')
    X_enc = np.array(enc.fit_transform(X)) 
    # Concatenating encoded data matrix and classes
    datos_aux = np.concatenate((X_enc, Y[:,None]), axis=1)
    # Updating dataset
    dataset.datos = np.zeros(datos_aux.shape)
    dataset.datos = datos_aux
    return dataset 

## <font color='blue'> Dataset tic-tac-toe </font>

In [17]:
ttt_db = Datos('../ConjuntosDatos/tic-tac-toe.data')
ndata, ncols = ttt_db.datos.shape # nfeat = ncols-1 
print("Numero de ejemplos de entrenamiento:", ndata, ", Numero de columnas:", ncols)

Numero de ejemplos de entrenamiento: 958 , Numero de columnas: 10


In [18]:
ttt_db = preprocess_data(ttt_db)
np.random.shuffle(ttt_db.datos)

NBSK = Clasificador.ClasificadorNaiveBayesSK(gaussian_feat=False)

### Validación simple

In [19]:
strat_simple_sk = EstrategiaParticionado.ValidacionSimple(0.8)

errs = NBSK.validacion(strat_simple_sk, ttt_db, NBSK)

print(errs)

[0.30729166666666663]


### Validación simple con varias repeticiones (3 reps)

In [20]:
strat_simple_rep3_sk = EstrategiaParticionado.ValidacionSimple(0.8, nreps=3)

errs = NBSK.validacion(strat_simple_rep3_sk, ttt_db, NBSK)

print("Errores: ", errs)
print("Error medio:", np.mean(errs), "+-", np.std(errs))

Errores:  [0.29166666666666663, 0.26041666666666663, 0.30208333333333337]
Error medio: 0.2847222222222222 +- 0.01770492886664164


### Validacion cruzada (5 folds)

In [21]:
strat_cross_sk = EstrategiaParticionado.ValidacionCruzada(k_fold=5)

errs = NBSK.validacion(strat_cross_sk, ttt_db, NBSK)

print("Errores: ", errs)
print("Error medio:", np.mean(errs), "+-", np.std(errs))

Errores:  [0.2879581151832461, 0.2827225130890052, 0.2827225130890052, 0.3507853403141361, 0.31413612565445026]
Error medio: 0.30366492146596863 +- 0.026282513922536405


## <font color='blue'> Dataset German (todos los atributos se consideran discretos)</font>

In [22]:
ger_db = Datos('../ConjuntosDatos/german.data')
ndata, ncols = ger_db.datos.shape # nfeat = ncols-1 
print("Numero de ejemplos de entrenamiento:", ndata, ", Numero de columnas:", ncols)

Numero de ejemplos de entrenamiento: 1000 , Numero de columnas: 21


In [23]:
ger_db = preprocess_data(ger_db)
np.random.shuffle(ger_db.datos)

NBSK = Clasificador.ClasificadorNaiveBayesSK(gaussian_feat=False)

### Validación simple

In [24]:
strat_simple_sk = EstrategiaParticionado.ValidacionSimple(0.8)

errs = NBSK.validacion(strat_simple_sk, ger_db, NBSK)

print(errs)

[0.28500000000000003]


### Validación simple con varias repeticiones (3 reps)

In [25]:
strat_simple_rep3_sk = EstrategiaParticionado.ValidacionSimple(0.8, nreps=3)

errs = NBSK.validacion(strat_simple_rep3_sk, ger_db, NBSK)

print("Errores: ", errs)
print("Error medio:", np.mean(errs), "+-", np.std(errs))

Errores:  [0.255, 0.30000000000000004, 0.31000000000000005]
Error medio: 0.2883333333333334 +- 0.023921166824012227


### Validación cruzada (5 folds)

In [26]:
strat_cross_sk = EstrategiaParticionado.ValidacionCruzada(k_fold=5)

errs = NBSK.validacion(strat_cross_sk, ger_db, NBSK)

print("Errores: ", errs)
print("Error medio:", np.mean(errs), "+-", np.std(errs))

Errores:  [0.27, 0.22499999999999998, 0.29000000000000004, 0.275, 0.29500000000000004]
Error medio: 0.271 +- 0.024779023386727755


## <font color='blue'> Dataset German (todos los atributos se consideran continuos)</font>

In [27]:
ger_db_cont = Datos('../ConjuntosDatos/german.data')
ndata, ncols = ger_db_cont.datos.shape # nfeat = ncols-1 
print("Numero de ejemplos de entrenamiento:", ndata, ", Numero de columnas:", ncols)

Numero de ejemplos de entrenamiento: 1000 , Numero de columnas: 21


In [28]:
np.random.shuffle(ger_db.datos)

NBSK = Clasificador.ClasificadorNaiveBayesSK(gaussian_feat=True)

### Validación simple

In [29]:
strat_simple_sk = EstrategiaParticionado.ValidacionSimple(0.8)

errs = NBSK.validacion(strat_simple_sk, ger_db_cont, NBSK)

print(errs)

[0.275]


### Validación simple con varias repeticiones (3 reps)

In [30]:
strat_simple_rep3_sk = EstrategiaParticionado.ValidacionSimple(0.8, nreps=3)

errs = NBSK.validacion(strat_simple_rep3_sk, ger_db_cont, NBSK)

print("Errores: ", errs)
print("Error medio:", np.mean(errs), "+-", np.std(errs))

Errores:  [0.235, 0.245, 0.30500000000000005]
Error medio: 0.26166666666666666 +- 0.030912061651652372


### Validación cruzada (5 folds)

In [31]:
strat_cross_sk = EstrategiaParticionado.ValidacionCruzada(k_fold=5)

errs = NBSK.validacion(strat_cross_sk, ger_db_cont, NBSK)

print("Errores: ", errs)
print("Error medio:", np.mean(errs), "+-", np.std(errs))

Errores:  [0.28, 0.22499999999999998, 0.29500000000000004, 0.245, 0.30000000000000004]
Error medio: 0.269 +- 0.02922327839240494


## <font color='green'> Tablas de Resultados </font>

### <font color='grey'> Validación simple </font>

In [37]:
def val_simpleSK(percentage):
    dataset_ttt = Datos("../ConjuntosDatos/tic-tac-toe.data")
    dataset_ger = Datos("../ConjuntosDatos/german.data")
    NBSK = Clasificador.ClasificadorNaiveBayesSK(gaussian_feat=False) # Clasificador Para TTT
    NBSK_d = Clasificador.ClasificadorNaiveBayesSK(gaussian_feat=False) # Clasificador Para GER en caso Discreto
    NBSK_c = Clasificador.ClasificadorNaiveBayesSK(gaussian_feat=True) # Clasificador Para GER en caso Continuo
    max_reps = 5
    L = []
    for i in range(1, max_reps+1):
        L.append([])
        strat = EstrategiaParticionado.ValidacionSimple(percentage, nreps=i)
        errores_ttt = NBSK.validacion(strat, dataset_ttt, NBSK)
        errores_ger_d = NBSK_d.validacion(strat, preprocess_data(dataset_ger), NBSK_d)
        errores_ger_c = NBSK_c.validacion(strat, dataset_ger, NBSK_c)
        L[i-1].append(str(i))
        L[i-1].append("%.2f +- %.3f"% (np.mean(errores_ttt), np.std(errores_ttt)))
        L[i-1].append("%.2f +- %.3f"% (np.mean(errores_ger_d), np.std(errores_ger_d)))
        L[i-1].append("%.2f +- %.3f"% (np.mean(errores_ger_c), np.std(errores_ger_c)))
    print("Validación Simple con " + str(int(percentage*100)) +   "% Train\n")
    print(tabulate(L, headers = ["N_REPS", "TTT_DB", "GER_DB_D", "GER_DB_C"], tablefmt="grid")) 

In [38]:
val_simpleSK(0.7)

Validación Simple con 70% Train

+----------+---------------+---------------+---------------+
|   N_REPS | TTT_DB        | GER_DB_D      | GER_DB_C      |
|        1 | 0.35 +- 0.000 | 0.32 +- 0.000 | 0.64 +- 0.000 |
+----------+---------------+---------------+---------------+
|        2 | 0.32 +- 0.016 | 0.26 +- 0.013 | 0.61 +- 0.007 |
+----------+---------------+---------------+---------------+
|        3 | 0.36 +- 0.003 | 0.22 +- 0.036 | 0.65 +- 0.029 |
+----------+---------------+---------------+---------------+
|        4 | 0.34 +- 0.024 | 0.27 +- 0.014 | 0.63 +- 0.009 |
+----------+---------------+---------------+---------------+
|        5 | 0.35 +- 0.011 | 0.27 +- 0.007 | 0.63 +- 0.019 |
+----------+---------------+---------------+---------------+


In [39]:
val_simpleSK(0.75)

Validación Simple con 75% Train

+----------+---------------+---------------+---------------+
|   N_REPS | TTT_DB        | GER_DB_D      | GER_DB_C      |
|        1 | 0.32 +- 0.000 | 0.29 +- 0.000 | 0.61 +- 0.000 |
+----------+---------------+---------------+---------------+
|        2 | 0.36 +- 0.023 | 0.30 +- 0.044 | 0.62 +- 0.008 |
+----------+---------------+---------------+---------------+
|        3 | 0.32 +- 0.018 | 0.28 +- 0.022 | 0.62 +- 0.013 |
+----------+---------------+---------------+---------------+
|        4 | 0.35 +- 0.019 | 0.26 +- 0.024 | 0.65 +- 0.003 |
+----------+---------------+---------------+---------------+
|        5 | 0.35 +- 0.045 | 0.27 +- 0.027 | 0.62 +- 0.023 |
+----------+---------------+---------------+---------------+


In [40]:
val_simpleSK(0.80)

Validación Simple con 80% Train

+----------+---------------+---------------+---------------+
|   N_REPS | TTT_DB        | GER_DB_D      | GER_DB_C      |
|        1 | 0.38 +- 0.000 | 0.21 +- 0.000 | 0.66 +- 0.000 |
+----------+---------------+---------------+---------------+
|        2 | 0.33 +- 0.005 | 0.28 +- 0.013 | 0.63 +- 0.008 |
+----------+---------------+---------------+---------------+
|        3 | 0.33 +- 0.021 | 0.27 +- 0.035 | 0.59 +- 0.012 |
+----------+---------------+---------------+---------------+
|        4 | 0.36 +- 0.018 | 0.26 +- 0.022 | 0.62 +- 0.021 |
+----------+---------------+---------------+---------------+
|        5 | 0.34 +- 0.017 | 0.24 +- 0.015 | 0.63 +- 0.032 |
+----------+---------------+---------------+---------------+


### <font color='grey'> Validación cruzada </font>

In [98]:
dataset_ger_d = preprocess_data(dataset_ger)
NBSK = Clasificador.ClasificadorNaiveBayesSK(gaussian_feat=False) # Clasificador Para TTT
NBSK_d = Clasificador.ClasificadorNaiveBayesSK(gaussian_feat=False) # Clasificador Para GER en caso Discreto
NBSK_c = Clasificador.ClasificadorNaiveBayesSK(gaussian_feat=True) # Clasificador Para GER en caso Continuo
max_folds = 11
j = 0
L = []
for i in range(3, max_folds+1, 2):
    L.append([])
    strat = EstrategiaParticionado.ValidacionCruzada(k_fold=i)
    errores_ttt = NBSK.validacion(strat, dataset_ttt, NBSK)
    errores_ger_d = NBSK_d.validacion(strat, dataset_ger_d, NBSK_d)
    errores_ger_c = NBSK_c.validacion(strat, dataset_ger, NBSK_c)
    L[j].append(str(i))
    L[j].append("%.2f +- %.3f"% (np.mean(errores_ttt), np.std(errores_ttt)))
    L[j].append("%.2f +- %.3f"% (np.mean(errores_ger_d), np.std(errores_ger_d)))
    L[j].append("%.2f +- %.3f"% (np.mean(errores_ger_c), np.std(errores_ger_c)))
    j+=1
print(tabulate(L, headers = ["K_FOLDS", "TTT_DB", "GER_DB_D", "GER_DB_C"], tablefmt="grid")) 

+-----------+---------------+---------------+---------------+
|   K_FOLDS | TTT_DB        | GER_DB_D      | GER_DB_C      |
|         3 | 0.34 +- 0.017 | 0.27 +- 0.028 | 0.62 +- 0.010 |
+-----------+---------------+---------------+---------------+
|         5 | 0.34 +- 0.036 | 0.26 +- 0.031 | 0.63 +- 0.022 |
+-----------+---------------+---------------+---------------+
|         7 | 0.34 +- 0.026 | 0.27 +- 0.034 | 0.64 +- 0.025 |
+-----------+---------------+---------------+---------------+
|         9 | 0.34 +- 0.047 | 0.28 +- 0.048 | 0.63 +- 0.050 |
+-----------+---------------+---------------+---------------+
|        11 | 0.34 +- 0.034 | 0.26 +- 0.037 | 0.62 +- 0.072 |
+-----------+---------------+---------------+---------------+


In [145]:
def create_confMatrix(pred, real):
    i = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    L = []
    L.append([])
    L.append([])
    
    for pred_val in pred:
        if pred_val == 0 and pred_val==real[i]:
            tn += 1
        elif pred_val == 0 and pred_val!=real[i]:
            fn += 1
        elif pred_val == 1 and pred_val==real[i]:
            tp += 1
        elif pred_val == 1 and pred_val!=real[i]:
            fp += 1
        i += 1
        
    L[0].append(tp)
    L[0].append(fn)
    L[1].append(fp)
    L[1].append(tn) 
    
    return L
    
    

In [146]:
def print_confMatrix(m):
    print(tabulate([["", "POS", "NEG"], ["POS", m[0][0], m[0][1]], ["NEG", m[1][0], m[1][1]]], headers="firstrow", tablefmt="grid")) 

In [147]:
def ROC_Analysis(dataset, clf, particionado):
    seed = random.seed(0)
    np.random.shuffle(dataset.datos)
    particionado.creaParticiones(dataset.datos, seed)
    pred = []
    real = []
    for particion in particionado.particiones:

        datostrain = dataset.datos[particion.indicesTrain, :]
        datostest = dataset.datos[particion.indicesTest, :]
            
        clf.entrenamiento(datostrain, dataset.nominalAtributos, dataset.diccionario)
            
        pred = clf.clasifica(datostest, dataset.nominalAtributos, dataset.diccionario)
        real = datostest[:,-1] 
        
        m = create_confMatrix(pred, real)
        rtp = int(m[0][0]) / (int(m[0][0]) + int(m[0][1]))
        rtn = int(m[1][1]) / (int(m[1][1]) + int(m[1][0]))
        
    print_confMatrix(m)
    print(rtp, rtn)
 

In [170]:
dataset_ttt = Datos("../ConjuntosDatos/tic-tac-toe.data")
dataset_ger = Datos("../ConjuntosDatos/german.data")
particionado = EstrategiaParticionado.ValidacionSimple(0.8)

clf = Clasificador.ClasificadorNaiveBayesSK(gaussian_feat=False)



ROC_Analysis(dataset_ger, clf, particionado)

+-----+-------+-------+
|     |   POS |   NEG |
| POS |    24 |    39 |
+-----+-------+-------+
| NEG |    33 |   104 |
+-----+-------+-------+
0.38095238095238093 0.7591240875912408
