## Importamos datos
Dataset de Kaggle: https://www.kaggle.com/c/GiveMeSomeCredit

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the training CSV of the Kaggle "Give Me Some Credit" competition.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('data/cs-training.csv')

# Preview the first rows as a quick sanity check of the load.
df.head()

## Limpieza de datos

In [None]:
# Print each column's dtype and non-null count to spot missing values before cleaning.
df.info()

In [None]:
# Nos cargamos columnas inútiles


In [None]:
# Nos cargamos duplicados


## Matriz de correlación
Vamos a cargarnos algunas columnas que no estén muy relacionadas con el target

In [None]:
# Pairwise correlations, rounded once so the in-cell annotations stay short.
corr_rounded = df.corr().round(2)

# Seven-step diverging palette; paired with the symmetric [-1, 1] colour range below.
corr_palette = sns.diverging_palette(145, 280, s=85, l=25, n=7)

# Draw on an explicit square 10x10 Axes instead of relying on the implicit current figure.
_, corr_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr_rounded,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap=corr_palette,
    square=True,
    linewidths=.5,
    ax=corr_ax,
);

## Dividimos el dataset

## Modelo

In [None]:
# Print the fitted attributes in a fixed order: feature weights, bias term, class labels.
for fitted_attr in ("coef_", "intercept_", "classes_"):
    print(getattr(log_reg, fitted_attr))

## Feature importance

In [None]:
intercept = log_reg.intercept_
# Flatten the coefficient array so there is one weight per feature column.
coefs = log_reg.coef_.ravel()

# One row per feature, holding the magnitude of its coefficient (the sign is discarded).
features = pd.DataFrame({'coefficient': np.abs(coefs)}, index=X_train.columns)

# Display the five features with the largest absolute coefficient.
features.sort_values('coefficient', ascending=False).head()

In [None]:
# Standard deviation of every feature, taken from the training split the model was
# fitted on (the target's deviation below also comes from the training split).
stdevs = X_train.std()

# Assigning a Series aligns on the feature names in the index, so the values land on
# the right rows even if this cell is re-run after `features` has been sorted.
features["stdev"] = stdevs

# Scale each |coefficient| by its feature's spread, then express it relative to the
# spread of the target so the values are comparable across features.
features["importance"] = features["coefficient"] * features["stdev"]
features['importance_standarized'] = features['importance'] / y_train.std()

# Ascending order puts the most important feature at the top of the horizontal bar chart.
features = features.sort_values('importance_standarized', ascending=True)
plt.barh(features.index, features.importance_standarized)
plt.xlabel('|coefficient| * std(feature) / std(target)')
plt.title('Standardized feature importance');

## Accuracy

## Radial chart

In [None]:
#!pip install yellowbrick

In [None]:
from yellowbrick.features.radviz import radviz

# Specify the target classes
# NOTE(review): presumably ordered as label 0, label 1 — confirm against the target encoding.
classes = ["Pays", "Default"]

# Call the radviz function on the feature matrix and the raw target values.
# The trailing semicolon suppresses the returned object's repr in the cell output.
# NOTE(review): X and y are not defined in the visible cells — confirm they are
# created in the "Dividimos el dataset" section.
radviz(X, y.values, classes=classes);

## Confusion matrix

In [None]:
# Show every cell of the confusion matrix as a share of ALL samples.
# The grand total is taken from the raw values: np.sum() applied directly to a
# DataFrame reduces column by column (and that behaviour is deprecated), which
# would normalise each column separately instead of by the overall total.
sns.heatmap(c_matrix_df / np.asarray(c_matrix_df).sum(), annot=True,
            fmt='.2%', cmap='Blues');

## Prediction errors

In [None]:
# NOTE(review): train_test_split is imported here but not used in this cell.
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassPredictionError

# Display names passed to the visualizer for the target classes.
classes = ["Pays", "Default"]

# Instantiate the classification model and visualizer
# NOTE(review): LogisticRegression is not imported in the visible cells — confirm it
# is imported in the "Modelo" section.
visualizer = ClassPredictionError(
    LogisticRegression(max_iter = 5000), classes=classes
)

# Fit the training data to the visualizer
visualizer.fit(X_train, y_train)

# Evaluate the model on the test data
visualizer.score(X_test, y_test)

# Draw visualization
visualizer.show();

## Classification report

In [None]:
# NOTE(review): TimeSeriesSplit, GaussianNB and load_occupancy are imported but not
# used in this cell.
from sklearn.model_selection import TimeSeriesSplit
from sklearn.naive_bayes import GaussianNB

from yellowbrick.classifier import ClassificationReport
from yellowbrick.datasets import load_occupancy

# Specify the target classes
classes = ["Pays", "Default"]

# Wrap a new LogisticRegression in the report visualizer; support=True is passed so
# the report is built with the support option enabled.
visualizer = ClassificationReport(LogisticRegression(max_iter = 5000), classes=classes, support=True)

visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show();                       # Finalize and show the figure

## ROC Curve

In [None]:
from yellowbrick.classifier import ROCAUC

# Instantiate the visualizer with the classification model
# `model` is a new, unfitted LogisticRegression; it is fitted through the visualizer below.
model = LogisticRegression(max_iter = 5000)
classes = ["Pays", "Default"]

visualizer = ROCAUC(model, classes = classes)

visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show();                       # Finalize and show the figure

## Threshold
Lo que realmente hace `model.predict(X_train)` es calcular la probabilidad de que cada observación pertenezca a la clase 1 y, si esa probabilidad supera el 50 %, clasificarla como 1. Este umbral del 50 % lo podremos ir variando dependiendo de si nos interesa reducir los falsos positivos o los falsos negativos.

In [None]:
from yellowbrick.classifier import DiscriminationThreshold
import warnings

# Instantiate the classification model and visualizer.
# `multi_class` is omitted: "auto" is already the default and the parameter is deprecated.
model = LogisticRegression(solver="liblinear")
# Same label order as every other cell in the notebook.
classes = ["Pays", "Default"]

visualizer = DiscriminationThreshold(model)

# Silence warnings only while this visualizer runs, instead of disabling them for
# the rest of the kernel session (which would hide problems in later cells).
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    visualizer.fit(X, y)        # Fit the data to the visualizer
    visualizer.show()           # Finalize and render the figure

In [None]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# Fit a new logistic regression on the training split.
model = LogisticRegression(max_iter = 5000)
model.fit(X_train, y_train)

# Predicted probabilities for the test split; the function f defined below reads
# this global and thresholds its column at index 1.
pred=model.predict_proba(X_test)


def f(punto_corte=0.5):
    """Draw the test-set confusion matrix for a given probability cut-off.

    Reads the notebook globals ``pred`` (output of ``predict_proba`` on the test
    split) and ``y_test``.

    Parameters
    ----------
    punto_corte : float, default 0.5
        A sample is predicted as 1 when the probability in column 1 of ``pred``
        is strictly greater than this value, and as 0 otherwise.
    """
    # Threshold only the column that is actually used (index 1 of predict_proba's output).
    y_pred = np.where(pred[:, 1] > punto_corte, 1, 0)

    conf_mat = pd.crosstab(y_test,
                           y_pred,
                           rownames=['Actual'],
                           colnames=['Predicted'])

    # At extreme cut-offs every sample is predicted as the same class and crosstab
    # returns a single column; always show both predicted classes so the matrix
    # keeps the same layout while the slider moves.
    conf_mat = conf_mat.reindex(columns=pd.Index([0, 1], name='Predicted'),
                                fill_value=0)

    sns.heatmap(conf_mat, annot=True, fmt='g')



In [None]:
# Build an interactive control for punto_corte from the (min, max, step) tuple
# (0 to 1 in steps of 0.01) and call f with the selected value.
interact(f, punto_corte=(0, 1, 0.01));

## Shap
La librería `shap` también es muy útil para interpretar los modelos. Puedes encontrar más información en estos dos artículos:

https://towardsdatascience.com/explain-your-model-with-the-shap-values-bc36aac4de3d

https://towardsdatascience.com/shap-how-to-interpret-machine-learning-models-with-python-2323f5af4be9
