In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("../Data/bank-additional-full.csv", sep = ";")
data.head(3)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
data.y.value_counts()

no     36548
yes     4640
Name: y, dtype: int64

# Preparación del dataset

## Preparación en train y test

**Seleccion de variable target**

In [4]:
y = data.y
X = data.drop(["y"], axis = 1)

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 161, stratify= y)

In [6]:
display(y_train.value_counts(normalize=True))
display(y_test.value_counts(normalize=True))

no     0.887344
yes    0.112656
Name: y, dtype: float64

no     0.887351
yes    0.112649
Name: y, dtype: float64

In [7]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

**Data train y test con undersampling**

In [8]:
# Balanceado de clases de target con RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler
undersampler = RandomUnderSampler(sampling_strategy= 0.3 ,random_state=123); # iguala las clases
X_train_us,y_train_us = undersampler.fit_resample(X_train,y_train)

# Selección de la data de train y test en tipo numérica y categórica
X_train_numerical_us = X_train_us.select_dtypes(exclude = "object")
X_test_numerical_us = X_test.select_dtypes(exclude = "object")

X_train_categorical_us = X_train_us.select_dtypes(include='object')
X_test_categorical_us = X_test.select_dtypes(include='object')

# Transformación de data de train y test categórica en numérica con OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

OH_encoder = OneHotEncoder(handle_unknown = "ignore", sparse = False)
X_train_categorical_us_OH =  pd.DataFrame (OH_encoder.fit_transform(X_train_categorical_us))
X_train_categorical_us_OH.index = X_train_categorical_us.index

X_test_categorical_us_OH = pd.DataFrame (OH_encoder.transform(X_test_categorical_us))
X_test_categorical_us_OH.index = X_test_categorical_us.index

#Estandarización de variables
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train_numerical_us_std = pd.DataFrame(sc_X.fit_transform(X_train_numerical_us))
X_train_numerical_us_std.index = X_train_numerical_us.index

X_test_numerical_us_std = pd.DataFrame(sc_X.transform(X_test_numerical_us))
X_test_numerical_us_std.index = X_test_numerical_us.index

# Concatenación de data numerica con categórica transformada con OneHotEncoder sin estandarizar
X_train_us = pd.concat([X_train_numerical_us, X_train_categorical_us_OH], axis = 1)
X_test_us = pd.concat([X_test_numerical_us, X_test_categorical_us_OH], axis = 1)

# Concatenación de data numerica con categórica transformada con OneHotEncoder estandarizada
X_train_us_std = pd.concat([X_train_numerical_us_std, X_train_categorical_us_OH], axis = 1)
X_test_us_std = pd.concat([X_test_numerical_us_std, X_test_categorical_us_OH], axis = 1)

## **F2 Score**

In [11]:
from sklearn.metrics import make_scorer, fbeta_score

In [12]:
# Creacion de score f2 
ftwo_scorer = make_scorer(fbeta_score, beta=2)

## **Logistic regression**

## **Regresion Logistica con regularización**

In [37]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, recall_score, fbeta_score, make_scorer, f1_score
from sklearn.linear_model import LogisticRegression

In [14]:
# Seteado de grid search
k_range = np.logspace(-4, 4, 20)
logreg = LogisticRegression(max_iter = 1e3)
param_grid = {'C': k_range}
k_fold =  StratifiedKFold(n_splits=3, shuffle=True, random_state=10)

#Instanciado de GridSearchCV
gridlogreg = GridSearchCV ( logreg, param_grid, cv = k_fold, verbose=0, scoring = ftwo_scorer)
# Fiteado del modelo de regresión logistica con grid search
gridlogreg = gridlogreg.fit(X_train_us_std,y_train_us)

print(gridlogreg.best_estimator_)
print("La media del f2 score del modelo es:", gridlogreg.best_score_)
print("El mejor hiperparametro C del modelo es: ", gridlogreg.best_params_)

LogisticRegression(C=78.47599703514607, max_iter=1000.0)
La media del f2 score del modelo es: 0.6740707027530043
El mejor hiperparametro C del modelo es:  {'C': 78.47599703514607}


In [15]:
pd.DataFrame(gridlogreg.cv_results_).head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.034013,0.015567,0.004,6.257699e-07,0.0001,{'C': 0.0001},0.053881,0.065659,0.065776,0.061772,0.00558,20
1,0.024338,0.000471,0.004001,4.899036e-07,0.000263665,{'C': 0.00026366508987303583},0.288298,0.306428,0.303173,0.299299,0.007892,19
2,0.030005,0.00217,0.004001,1.469711e-06,0.000695193,{'C': 0.0006951927961775605},0.427745,0.445162,0.445785,0.439564,0.008362,18


In [17]:
from sklearn.linear_model import LogisticRegression
# Fiteado de modelo de regresion logistica con mejor hiperparametro C
best_hip = 78.47599703514607
logmodel = LogisticRegression( C = best_hip, max_iter=1e3) 
logmodel.fit(X_train_us_std,y_train_us)

LogisticRegression(C=78.47599703514607, max_iter=1000.0)

In [38]:
logpredtrain = logmodel.predict(X_train_us_std)
logpredtest = logmodel.predict(X_test_us_std)

# Creación de las variables score para la posterior evaluación de modelos
LG_ftwo_train = ftwo_scorer(logmodel,X_train_us_std,y_train_us)
LG_ftwo_test = ftwo_scorer(logmodel,X_test_us_std,y_test)

LG_fone_train = f1_score(y_train_us, logpredtrain)
LG_fone_test = f1_score(y_test, logpredtest)

LG_accuracy_train = accuracy_score(y_train_us, logpredtrain)
LG_accuracy_test = accuracy_score(y_test, logpredtest)

LG_ROC_train = roc_auc_score(y_train_us, logpredtrain)
LG_ROC_test = roc_auc_score(y_test, logpredtest)

LG_recall_train = recall_score(y_train_us, logpredtrain)
LG_recall_test = recall_score(y_test, logpredtest)


print(confusion_matrix(y_train_us, logpredtrain))
print(confusion_matrix(y_test, logpredtest))
print(LG_ftwo_train)
print(LG_ftwo_test)

[[11649   724]
 [ 1253  2459]]
[[6867  443]
 [ 315  613]]
0.6818812045920912
0.6428271812080537


## **KNN**

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [20]:
knn = KNeighborsClassifier()
k_range = list(range(1, 31, 2))
param_grid = dict(n_neighbors=k_range)
folds=StratifiedKFold(n_splits=3, random_state=19, shuffle=True)

gridknn = GridSearchCV(knn, param_grid, cv=folds, scoring=ftwo_scorer)
gridknn.fit(X_train_us_std, y_train_us)

print(gridknn.best_estimator_)
print("La media del f2 score del modelo es:", gridknn.best_score_)
print("El mejor hiperparametro n_neighbors del modelo es: ", gridknn.best_params_)

KNeighborsClassifier(n_neighbors=9)
La media del f2 score del modelo es: 0.6568895298275658
El mejor hiperparametro n_neighbors del modelo es:  {'n_neighbors': 9}


In [21]:
# Fiteado de modelo de knn con mejor hiperparametro n_neighbors
best_hip = 9
knnmodel = KNeighborsClassifier( n_neighbors = best_hip) 
knnmodel.fit(X_train_us_std,y_train_us)

KNeighborsClassifier(n_neighbors=9)

In [40]:
knnpredtrain = knnmodel.predict(X_train_us_std)
knnpredtest = knnmodel.predict(X_test_us_std)

# Creación de las variables score para la posterior evaluación de modelos
KNN_ftwo_train = ftwo_scorer(knnmodel,X_train_us_std,y_train_us)
KNN_ftwo_test = ftwo_scorer(knnmodel,X_test_us_std,y_test)

KNN_fone_train = f1_score(y_train_us, knnpredtrain)
KNN_fone_test = f1_score(y_test, knnpredtest)

KNN_accuracy_train = accuracy_score(y_train_us, knnpredtrain)
KNN_accuracy_test = accuracy_score(y_test, knnpredtest)

KNN_ROC_train = roc_auc_score(y_train_us, knnpredtrain)
KNN_ROC_test = roc_auc_score(y_test, knnpredtest)

KNN_recall_train = recall_score(y_train_us, knnpredtrain)
KNN_recall_test = recall_score(y_test, knnpredtest)


print(confusion_matrix(y_train_us, knnpredtrain))
print(confusion_matrix(y_test, knnpredtest))
print(KNN_ftwo_train)
print(KNN_ftwo_test)

[[11614   759]
 [ 1071  2641]]
[[6762  548]
 [ 326  602]]
0.7236409469530907
0.6190867955573837


## **Naive Bayes**

In [41]:
from sklearn.naive_bayes import GaussianNB
gaussiannb = GaussianNB()
gaussiannb.fit(X_train_us_std, y_train_us)

GaussianNB()

In [42]:
gaussiannbpredtrain = gaussiannb.predict(X_train_us_std)
gaussianpredtest = gaussiannb.predict(X_test_us_std)

# Creación de las variables score para la posterior evaluación de modelos
GAU_ftwo_train = ftwo_scorer(gaussiannb,X_train_us_std,y_train_us)
GAU_ftwo_test = ftwo_scorer(gaussiannb,X_test_us_std,y_test)

GAU_fone_train = f1_score(y_train_us, gaussiannbpredtrain)
GAU_fone_test = f1_score(y_test, gaussianpredtest)

GAU_accuracy_train = accuracy_score(y_train_us, gaussiannbpredtrain)
GAU_accuracy_test = accuracy_score(y_test, gaussianpredtest)

GAU_ROC_train = roc_auc_score(y_train_us, gaussiannbpredtrain)
GAU_ROC_test = roc_auc_score(y_test, gaussianpredtest)

GAU_recall_train = recall_score(y_train_us, gaussiannbpredtrain)
GAU_recall_test = recall_score(y_test, gaussianpredtest)


print(confusion_matrix(y_train_us, gaussiannbpredtrain))
print(confusion_matrix(y_test, gaussianpredtest))
print(GAU_ftwo_train)
print(GAU_ftwo_test)

[[10069  2304]
 [ 1115  2597]]
[[5927 1383]
 [ 282  646]]
0.6575016456529443
0.5626197526563317


## **Decision TreeClassifier**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from IPython.display import Image
import pydotplus
from sklearn.tree import DecisionTreeClassifier

In [25]:
dtree = DecisionTreeClassifier(criterion='gini', max_depth = 5) #criterion = entopy, gini
dtree.fit(X_train_us, y_train_us)

DecisionTreeClassifier(max_depth=5)

In [43]:
dtreepredtrain = dtree.predict(X_train_us)
dtreepredtest = dtree.predict(X_test_us)

# Creación de las variables score para la posterior evaluación de modelos
DTREE_ftwo_train = ftwo_scorer(dtree,X_train_us,y_train_us)
DTREE_ftwo_test = ftwo_scorer(dtree,X_test_us,y_test)

DTREE_fone_train = f1_score(y_train_us, dtreepredtrain)
DTREE_fone_test = f1_score(y_test, dtreepredtest)

DTREE_accuracy_train = accuracy_score(y_train_us, dtreepredtrain)
DTREE_accuracy_test = accuracy_score(y_test, dtreepredtest)

DTREE_ROC_train = roc_auc_score(y_train_us, dtreepredtrain)
DTREE_ROC_test = roc_auc_score(y_test, dtreepredtest)

DTREE_recall_train = recall_score(y_train_us, dtreepredtrain)
DTREE_recall_test = recall_score(y_test, dtreepredtest)


print(confusion_matrix(y_train_us, dtreepredtrain))
print(confusion_matrix(y_test, dtreepredtest))
print(DTREE_ftwo_train)
print(DTREE_ftwo_test)

[[11401   972]
 [  881  2831]]
[[6713  597]
 [ 230  698]]
0.7589405393812665
0.6970241661673657


In [None]:
dot_data = tree.export_graphviz(dtree, out_file=None,feature_names= X_train_us.columns, filled = True)
graph2 = pydotplus.graph_from_dot_data(dot_data)
Image(graph2.create_png())

In [None]:
clftree = tree.DecisionTreeClassifier(random_state=40)
params_grid = { "criterion" : ["gini", "entropy"],
                "min_samples_leaf": [5,10,15,20,100,300,800, None], 
                "max_depth" : [1,2,3,4,5,6,8,9,10,11,12,13,14,20,25,None],
                "min_samples_split": [2, 3, 4,None]}
grid_search = GridSearchCV(clftree, params_grid,
                           n_jobs=-1, scoring= ftwo_scorer)
grid_search.fit(X_train_us, y_train_us)

In [None]:
#Score durante el entrenamiento.
grid_search.best_score_

In [None]:
grid_search.best_params_ 

In [None]:
cv_clf = grid_search.best_estimator_

In [None]:
cv_clf

In [None]:
accuracy_score(y_test, cv_clf.predict(X_test_us))

In [None]:
ftwo_scorer(cv_clf,X_test_us,y_test)

In [None]:
confusion_matrix(y_test, cv_clf.predict(X_test_us))

In [None]:
dot_data = tree.export_graphviz(cv_clf, out_file=None,feature_names= X_train_us.columns, filled = True)
graph2 = pydotplus.graph_from_dot_data(dot_data)
Image(graph2.create_png())

In [None]:
pd.DataFrame({'atributo':X.columns, 'importancia':cv_clf.feature_importances_}).sort_values('importancia', ascending = False)

## **Comparación de los modelos**

In [44]:
models = pd.DataFrame({'Models': ['Logistic Model', 'K-Near Neighbors',
                                   'Gausian NB', 'Decision Tree Classifier'],
                       'F2 Score Train':  [LG_ftwo_train, KNN_ftwo_train, GAU_ftwo_train, DTREE_ftwo_train],
                       'F2 Score Test': [LG_ftwo_test, KNN_ftwo_test, GAU_ftwo_test, DTREE_ftwo_test],
                       'F1 Score Train':  [LG_fone_train, KNN_fone_train, GAU_fone_train, DTREE_fone_train],
                       'F1 Score Test': [LG_fone_test, KNN_fone_test, GAU_fone_test, DTREE_fone_test],                       
                       'Accuracy Score Train':  [LG_accuracy_train, KNN_accuracy_train, GAU_accuracy_train, DTREE_accuracy_train],
                       'Accuracy Score Test': [LG_accuracy_test, KNN_accuracy_test, GAU_accuracy_test, DTREE_accuracy_test],
                       'ROC Score Train':  [LG_ROC_train, KNN_ROC_train, GAU_ROC_train, DTREE_ROC_train],
                       'ROC Score Test': [LG_ROC_test, KNN_ROC_test, GAU_ROC_test, DTREE_ROC_test],
                       'Recall Score Train':  [LG_recall_train, KNN_recall_train, GAU_recall_train, DTREE_recall_train],
                       'Recall Score Test': [LG_recall_test, KNN_recall_test, GAU_recall_test, DTREE_recall_test]})
        
models.sort_values(by='F2 Score Test', ascending=False)

Unnamed: 0,Models,F2 Score Train,F2 Score Test,F1 Score Train,F1 Score Test,Accuracy Score Train,Accuracy Score Test,ROC Score Train,ROC Score Test,Recall Score Train,Recall Score Test
3,Decision Tree Classifier,0.758941,0.697024,0.753426,0.62798,0.8848,0.899612,0.842052,0.835243,0.762662,0.752155
0,Logistic Model,0.681881,0.642827,0.71327,0.617944,0.87709,0.907987,0.801966,0.799979,0.662446,0.66056
1,K-Near Neighbors,0.723641,0.619087,0.742688,0.579403,0.886229,0.893906,0.825067,0.786871,0.711476,0.648707
2,Gausian NB,0.657502,0.56262,0.603042,0.436929,0.787442,0.797888,0.756705,0.753464,0.699623,0.696121


# graficos para ver si quedan

In [None]:
lr_false_positive_rate,lr_true_positive_rate,lr_threshold = roc_curve(y_test,lr_predict)
nb_false_positive_rate,nb_true_positive_rate,nb_threshold = roc_curve(y_test,nbpred)
rf_false_positive_rate,rf_true_positive_rate,rf_threshold = roc_curve(y_test,rf_predicted)                                                             
xgb_false_positive_rate,xgb_true_positive_rate,xgb_threshold = roc_curve(y_test,xgb_predicted)
knn_false_positive_rate,knn_true_positive_rate,knn_threshold = roc_curve(y_test,knn_predicted)
dt_false_positive_rate,dt_true_positive_rate,dt_threshold = roc_curve(y_test,dt_predicted)
svc_false_positive_rate,svc_true_positive_rate,svc_threshold = roc_curve(y_test,svc_predicted)


sns.set_style('whitegrid')
plt.figure(figsize=(10,5))
plt.title('Reciver Operating Characterstic Curve')
plt.plot(lr_false_positive_rate,lr_true_positive_rate,label='Logistic Regression')
plt.plot(nb_false_positive_rate,nb_true_positive_rate,label='Naive Bayes')
plt.plot(rf_false_positive_rate,rf_true_positive_rate,label='Random Forest')
plt.plot(xgb_false_positive_rate,xgb_true_positive_rate,label='Extreme Gradient Boost')
plt.plot(knn_false_positive_rate,knn_true_positive_rate,label='K-Nearest Neighbor')
plt.plot(dt_false_positive_rate,dt_true_positive_rate,label='Desion Tree')
plt.plot(svc_false_positive_rate,svc_true_positive_rate,label='Support Vector Classifier')
plt.plot([0,1],ls='--')
plt.plot([0,0],[1,0],c='.5')
plt.plot([1,1],c='.5')
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_curve
# Gradient Boosting Classifier
# Neural Classifier
# Naives Bayes Classifier
grd_fpr, grd_tpr, thresold = roc_curve(y_train, y_scores)
neu_fpr, neu_tpr, neu_threshold = roc_curve(y_train, neural_y_scores)
nav_fpr, nav_tpr, nav_threshold = roc_curve(y_train, naives_y_scores)

In [None]:
def graph_roc_curve(false_positive_rate, true_positive_rate, label=None):
    plt.figure(figsize=(10,6))
    plt.title('ROC Curve \n Gradient Boosting Classifier', fontsize=18)
    plt.plot(false_positive_rate, true_positive_rate, label=label)
    plt.plot([0, 1], [0, 1], '#0C8EE0')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.annotate('ROC Score of 91.73% \n (Not the best score)', xy=(0.25, 0.9), xytext=(0.4, 0.85),
            arrowprops=dict(facecolor='#F75118', shrink=0.05),
            )
    plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
                arrowprops=dict(facecolor='#F75118', shrink=0.05),
                )
    
    
graph_roc_curve(grd_fpr, grd_tpr, threshold)
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

print('Gradient Boost Classifier Score: ', roc_auc_score(y_train, y_scores))
print('Neural Classifier Score: ', roc_auc_score(y_train, neural_y_scores))
print('Naives Bayes Classifier: ', roc_auc_score(y_train, naives_y_scores))

In [None]:
def graph_roc_curve_multiple(grd_fpr, grd_tpr, neu_fpr, neu_tpr, nav_fpr, nav_tpr):
    plt.figure(figsize=(8,6))
    plt.title('ROC Curve \n Top 3 Classifiers', fontsize=18)
    plt.plot(grd_fpr, grd_tpr, label='Gradient Boosting Classifier (Score = 91.72%)')
    plt.plot(neu_fpr, neu_tpr, label='Neural Classifier (Score = 91.54%)')
    plt.plot(nav_fpr, nav_tpr, label='Naives Bayes Classifier (Score = 80.33%)')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
                arrowprops=dict(facecolor='#6E726D', shrink=0.05),
                )
    plt.legend()
    
graph_roc_curve_multiple(grd_fpr, grd_tpr, neu_fpr, neu_tpr, nav_fpr, nav_tpr)
plt.show()