# Proyecto Equipo 5
# Uso del Módulo 4 - Machine Learning
## División del dataset por K-fold y evaluación de clasificadores
## Decision Tree Classifier y Naïve Bayes Classifier

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings("ignore")

## Lectura y organización de los datos

In [None]:
df = pd.read_csv('../archive/data_features.csv')
df

In [None]:
#Asignación de características y etiquetas a X y Y respectivamente
X = df[['FLAG_OWN_CAR','FLAG_OWN_REALTY','AMT_INCOME_TOTAL','MONTHS_EMPLOYED','timespan']].values
y = df['bin_labels'].values
print(f'Tamaño X: {X.shape}\nTamaño y: {y.shape}')

Tamaño X: (36457, 5)
Tamaño y: (36457,)


## Selección de sets de entrenamiento y prueba por K-fold, 5 iteraciones

In [None]:
#Arreglos para guardar los datos de K-fold
list_X_train = []
list_X_test = []
list_y_train = []
list_y_test = []
#División de índices por k-fold
num_pliegues = 5
kf = KFold(n_splits=num_pliegues, shuffle=True, random_state=42)
for i, (train_indices, test_indices) in enumerate(kf.split(X)):
    X_entrenamiento, X_prueba = X[train_indices], X[test_indices]
    y_entrenamiento, y_prueba = y[train_indices], y[test_indices]

    # Imprimir información sobre el pliegue actual
    print(f"Pliegue {i + 1}:")
    print(f"Tamaño del conjunto de entrenamiento: {len(X_entrenamiento)}")
    print(f"Tamaño del conjunto de prueba: {len(X_prueba)}")
    print("\n")
    list_X_train.append(X_entrenamiento)
    list_X_test.append(X_prueba)
    list_y_train.append(y_entrenamiento)
    list_y_test.append(y_prueba)

Pliegue 1:
Tamaño del conjunto de entrenamiento: 29165
Tamaño del conjunto de prueba: 7292


Pliegue 2:
Tamaño del conjunto de entrenamiento: 29165
Tamaño del conjunto de prueba: 7292


Pliegue 3:
Tamaño del conjunto de entrenamiento: 29166
Tamaño del conjunto de prueba: 7291


Pliegue 4:
Tamaño del conjunto de entrenamiento: 29166
Tamaño del conjunto de prueba: 7291


Pliegue 5:
Tamaño del conjunto de entrenamiento: 29166
Tamaño del conjunto de prueba: 7291




## Funciones para entrenar modelos y obtener métricas

In [None]:
#Función de entrenamiento y prueba para el modelo
def train_test_modelo(modelo, X_train, y_train, X_test, y_test):
	modelo.fit(X_train,y_train)
	pred_train = modelo.predict(X_train)
	confmat_train = confusion_matrix(y_train, pred_train)
	pred_test = modelo.predict(X_test)
	confmat_test = confusion_matrix(y_test, pred_test)
	return confmat_train, confmat_test


In [None]:
def obtener_metricas_confmat(confmat):
	TP = confmat[1, 1]
	TN = confmat[0, 0]
	FP = confmat[0, 1]
	FN = confmat[1, 0]
	accuracy = (TP + TN) / (TP + TN + FP + FN)
	precision = TP / (TP + FP)
	recall = TP / (TP + FN)
	specificity = TN / (TN + FP)
	nombres_columnas = ['TP','TN','FP','FN','accuracy','precision','recall','specificity']
	metricas = pd.DataFrame([[TP,TN,FP,FN,accuracy,precision,recall,specificity]], columns=nombres_columnas)
	return metricas


## Árbol de decisión (Decision Tree Classifier)

In [None]:
#árbol de decisión
modelo_arbol = DecisionTreeClassifier(random_state=42)
modelo_arbol

In [None]:
#Entrenamiento y prueba del árbol para todos los sets
metricas_train = pd.DataFrame([], columns=['TP','TN','FP','FN','accuracy','precision','recall','specificity'])
metricas_test = pd.DataFrame([], columns=['TP','TN','FP','FN','accuracy','precision','recall','specificity'])
for i in range(5):
	(confmat_train, confmat_test) = train_test_modelo(modelo_arbol, list_X_train[i], list_y_train[i], list_X_test[i], list_y_test[i])
	metricas_train_aux = obtener_metricas_confmat(confmat_train)
	metricas_test_aux = obtener_metricas_confmat(confmat_test)
	metricas_train = pd.concat([metricas_train,metricas_train_aux], ignore_index=True)
	metricas_test = pd.concat([metricas_test,metricas_test_aux], ignore_index=True)


In [None]:
print(f'Métricas de predicción con datos de entrenamiento:')
metricas_train

Métricas de predicción con datos de entrenamiento:


Unnamed: 0,TP,TN,FP,FN,accuracy,precision,recall,specificity
0,11475,16878,144,668,0.972158,0.987607,0.944989,0.99154
1,11573,16770,146,676,0.971816,0.987542,0.944812,0.991369
2,11577,16765,150,674,0.971748,0.987209,0.944984,0.991132
3,11594,16750,148,674,0.971816,0.987396,0.94506,0.991242
4,11578,16772,137,679,0.972022,0.988306,0.944603,0.991898


In [None]:
print(f'Estadísticas de las métricas de predicción con datos de entrenamiento')
metricas_train.describe()

Estadísticas de las métricas de predicción con datos de entrenamiento


Unnamed: 0,accuracy,precision,recall,specificity
count,5.0,5.0,5.0,5.0
mean,0.971912,0.987612,0.94489,0.991436
std,0.000172,0.000417,0.000184,0.000299
min,0.971748,0.987209,0.944603,0.991132
25%,0.971816,0.987396,0.944812,0.991242
50%,0.971816,0.987542,0.944984,0.991369
75%,0.972022,0.987607,0.944989,0.99154
max,0.972158,0.988306,0.94506,0.991898


In [None]:
print(f'\nMétricas de predicción con datos de prueba:')
metricas_test


Métricas de predicción con datos de prueba:


Unnamed: 0,TP,TN,FP,FN,accuracy,precision,recall,specificity
0,2335,3532,611,814,0.80458,0.7926,0.741505,0.852522
1,2258,3598,651,785,0.803072,0.776212,0.742031,0.846787
2,2316,3596,654,725,0.810863,0.779798,0.761592,0.846118
3,2217,3610,657,807,0.799204,0.771399,0.733135,0.846028
4,2303,3558,698,732,0.803868,0.767411,0.758814,0.835996


In [None]:
print(f'Estadísticas de las métricas de predicción con datos de entrenamiento')
metricas_test.describe()

Estadísticas de las métricas de predicción con datos de entrenamiento


Unnamed: 0,accuracy,precision,recall,specificity
count,5.0,5.0,5.0,5.0
mean,0.804317,0.777484,0.747415,0.84549
std,0.004207,0.009669,0.012235,0.005957
min,0.799204,0.767411,0.733135,0.835996
25%,0.803072,0.771399,0.741505,0.846028
50%,0.803868,0.776212,0.742031,0.846118
75%,0.80458,0.779798,0.758814,0.846787
max,0.810863,0.7926,0.761592,0.852522


## Naïve Bayes Classifier

In [None]:
#Naive Bayes
modelo_NaiveBayes = GaussianNB()

In [None]:
#Entrenamiento y prueba de la SVM para todos los sets
metricas_train = pd.DataFrame([], columns=['TP','TN','FP','FN','accuracy','precision','recall','specificity'])
metricas_test = pd.DataFrame([], columns=['TP','TN','FP','FN','accuracy','precision','recall','specificity'])
for i in range(5):
	(confmat_train, confmat_test) = train_test_modelo(modelo_NaiveBayes, list_X_train[i], list_y_train[i], list_X_test[i], list_y_test[i])
	metricas_train_aux = obtener_metricas_confmat(confmat_train)
	metricas_test_aux = obtener_metricas_confmat(confmat_test)
	metricas_train = pd.concat([metricas_train,metricas_train_aux], ignore_index=True)
	metricas_test = pd.concat([metricas_test,metricas_test_aux], ignore_index=True)


In [None]:
print(f'Métricas de predicción con datos de entrenamiento:')
metricas_train

Métricas de predicción con datos de entrenamiento:


Unnamed: 0,TP,TN,FP,FN,accuracy,precision,recall,specificity
0,6577,13811,3211,5566,0.699057,0.671945,0.541629,0.811362
1,6614,13715,3201,5635,0.697034,0.673867,0.539962,0.810771
2,6675,13679,3236,5576,0.697867,0.673494,0.544853,0.808691
3,6651,13643,3255,5617,0.69581,0.671411,0.542142,0.807374
4,6646,13699,3210,5611,0.697559,0.67431,0.542221,0.81016


In [None]:
print(f'Estadísticas de las métricas de predicción con datos de entrenamiento')
metricas_train.describe()

Estadísticas de las métricas de predicción con datos de entrenamiento


Unnamed: 0,accuracy,precision,recall,specificity
count,5.0,5.0,5.0,5.0
mean,0.697466,0.673005,0.542162,0.809671
std,0.001186,0.00126,0.001759,0.001624
min,0.69581,0.671411,0.539962,0.807374
25%,0.697034,0.671945,0.541629,0.808691
50%,0.697559,0.673494,0.542142,0.81016
75%,0.697867,0.673867,0.542221,0.810771
max,0.699057,0.67431,0.544853,0.811362


In [None]:
print(f'\nMétricas de predicción con datos de prueba:')
metricas_test


Métricas de predicción con datos de prueba:


Unnamed: 0,TP,TN,FP,FN,accuracy,precision,recall,specificity
0,1667,3385,758,1482,0.692814,0.687423,0.529374,0.817041
1,1699,3413,836,1344,0.701042,0.670217,0.558331,0.803248
2,1635,3448,802,1406,0.697161,0.670907,0.537652,0.811294
3,1608,3493,774,1416,0.69963,0.675063,0.531746,0.818608
4,1682,3391,865,1353,0.695789,0.660385,0.554201,0.796758


In [None]:
print(f'Estadísticas de las métricas de predicción con datos de entrenamiento')
metricas_test.describe()

Estadísticas de las métricas de predicción con datos de entrenamiento


Unnamed: 0,accuracy,precision,recall,specificity
count,5.0,5.0,5.0,5.0
mean,0.697287,0.672799,0.542261,0.80939
std,0.003235,0.00979,0.013216,0.009279
min,0.692814,0.660385,0.529374,0.796758
25%,0.695789,0.670217,0.531746,0.803248
50%,0.697161,0.670907,0.537652,0.811294
75%,0.69963,0.675063,0.554201,0.817041
max,0.701042,0.687423,0.558331,0.818608
