# Preprocesamiento y selección de características


##Cargar el conjunto de datos

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

datos = pd.read_csv('default_of_credit_card_clients_original.csv')
datos.shape

(30000, 25)

##1. Valores faltantes
###No hay valores faltante

##2. Seleccionar caracteristicas y discretizar datos

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

X = datos.drop(['ID','SEX', 'BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6'], axis=1)
Y = datos['default payment next month'].values.astype(int)

enc = KBinsDiscretizer(n_bins=10)
X_binned = enc.fit_transform(X)

##3. Agregación y eliminación de características irrelevantes.

###No realizamos agregación en nuestro conjunto de datos

#Minando los datos

In [0]:
#Porcentaje de división
from sklearn.model_selection import train_test_split
X_train_binned, X_test_binned, y_train_binned, y_test_binned = train_test_split(X_binned, Y, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=0)

## Funciones Generales

In [0]:
from sklearn import metrics
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def print_binary_confusion_matrix(y_true, y_pred):
  confusion_matrix = metrics.confusion_matrix(y_true, y_pred)
  TN = confusion_matrix[0,0]
  FN = confusion_matrix[1,0]
  FP = confusion_matrix[0,1]
  TP = confusion_matrix[1,1]
  
  ### INICIO DE TU CÓDIGO
  exactitud = (TP+TN)/(TP+FP+TN+FN)
  precision = TP/(TP+FP)
  exhaustividad = TP/(TP+FN)
  ### FIN DE TU CÓDIGO

  print ('              +-----------------+')
  print ('              |   Predicción    |')
  print ('              +-----------------+')
  print ('              |    +   |    -   |')
  print ('+-------+-----+--------+--------+')
  print ('| Valor |  +  |  {:5d} |  {:5d} |'.format(TP, FN) )
  print ('| real  +-----+--------+--------+')
  print ('|       |  -  |  {:5d} |  {:5d} |'.format(FP, TN) )
  print ('+-------+-----+--------+--------+')
  print('Exactitud     : {:.3f}'.format(exactitud))
  print('Precisión     : {:.3f}'.format(precision))
  print('Exhaustividad : {:.3f}'.format(exhaustividad))
  print()

##1. Naive Bayes

In [0]:
#Creando model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore")

pipeline = Pipeline([('binarizer', Binarizer()), ('classifier', BernoulliNB())])
params = {'binarizer__threshold': np.logspace(0, 5, 10)}

naive_bayes_cv = GridSearchCV(pipeline, param_grid=params, cv=10, refit=True)


###a. Con datos originales

In [32]:
warnings.filterwarnings("ignore")

#Entrenar modelo
naive_bayes_cv.fit(X_test,y_test)

#Predicciones
y_predicciones_regresion = naive_bayes_cv.predict(X_test)

#Matriz confusion 
print_binary_confusion_matrix(y_test, y_predicciones_regresion)

#Validacion cruzada 
scores = cross_val_score(modelo_regresion_logistica_cv,X_test, y_test, cv=10, scoring='f1_macro')

print("Validacion cruzada : %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |   1018 |   1117 |
| real  +-----+--------+--------+
|       |  -  |    965 |   6800 |
+-------+-----+--------+--------+
Exactitud     : 0.790
Precisión     : 0.513
Exhaustividad : 0.477

Validacion cruzada : 0.44 (+/- 0.00)


###b. Con datos discretizados y columnas eliminadas

In [33]:
warnings.filterwarnings("ignore")

#Entrenar modelo
naive_bayes_cv.fit(X_test_binned,y_test_binned)

#Predicciones
y_predicciones_regresion_binned = naive_bayes_cv.predict(X_test_binned)

#Matriz confusion 
print_binary_confusion_matrix(y_test_binned, y_predicciones_regresion_binned)


#Validacion cruzada 
scores = cross_val_score(modelo_regresion_logistica_cv,X_test_binned, y_test_binned, cv=10, scoring='f1_macro')

print("Validacion cruzada : %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |      0 |   2135 |
| real  +-----+--------+--------+
|       |  -  |      0 |   7765 |
+-------+-----+--------+--------+
Exactitud     : 0.784
Precisión     : nan
Exhaustividad : 0.000

Validacion cruzada : 0.68 (+/- 0.05)


##2. Regresion logistica

In [0]:
#Creando model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score

modelo_regresion_logistica = LogisticRegression (penalty = "l1", solver = "saga", warm_start = True)
modelo_regresion_logistica_cv = GridSearchCV (modelo_regresion_logistica, param_grid = {"C": np.logspace(-4, 4, 10)}, scoring = "f1_macro")

###a. Con datos originales

In [35]:
warnings.filterwarnings("ignore")

#Entrenar modelo
modelo_regresion_logistica_cv.fit(X_test,y_test)

#Predicciones
y_predicciones_regresion = modelo_regresion_logistica_cv.predict(X_test)

#Matriz confusion 
print_binary_confusion_matrix(y_test, y_predicciones_regresion)

#Validacion cruzada 
scores = cross_val_score(modelo_regresion_logistica_cv,X_test, y_test, cv=10, scoring='f1_macro')

print("Validacion cruzada : %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |      2 |   2133 |
| real  +-----+--------+--------+
|       |  -  |      5 |   7760 |
+-------+-----+--------+--------+
Exactitud     : 0.784
Precisión     : 0.286
Exhaustividad : 0.001

Validacion cruzada : 0.44 (+/- 0.00)


### b. Con datos discretizados y columnas eliminadas

In [36]:
warnings.filterwarnings("ignore")

#Entrenar modelo
modelo_regresion_logistica_cv.fit(X_test_binned,y_test_binned)

#Predicciones
y_predicciones_regresion = modelo_regresion_logistica_cv.predict(X_test_binned)

#Matriz confusion 
print_binary_confusion_matrix(y_test_binned, y_predicciones_regresion)

#Validacion cruzada 
scores = cross_val_score(modelo_regresion_logistica_cv,X_test_binned, y_test_binned, cv=10, scoring='f1_macro')

print("Validacion cruzada : %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |    762 |   1373 |
| real  +-----+--------+--------+
|       |  -  |    355 |   7410 |
+-------+-----+--------+--------+
Exactitud     : 0.825
Precisión     : 0.682
Exhaustividad : 0.357

Validacion cruzada : 0.68 (+/- 0.05)


#Conclusiones

*   El algoritmo de regresión logística presenta mejor rendimiento.


*   Discreteando datos y reduciendo características se mejora el rendimiento de los algoritmos