In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
import pickle
from sklearn.feature_selection import f_classif, SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



Los depósitos a largo plazo son un producto importante para el banco porque permiten retener el dinero de los clientes durante un tiempo determinado. Esto le da al banco la posibilidad de usar esos fondos para invertir y mejorar su rentabilidad.

Actualmente, el banco portugués está pasando por una etapa donde sus ingresos están disminuyendo, y una de las causas es la baja contratación de depósitos a largo plazo. Por eso, el banco quiere mejorar su estrategia de marketing para enfocarse en los clientes que realmente tienen más probabilidades de contratar este producto.

Las campañas de marketing se hacen principalmente por llamadas telefónicas. Si el cliente no atiende, se vuelve a llamar más tarde. Este método implica mucho tiempo y recursos, sobre todo cuando se llama a personas que casi con seguridad no van a contratar el depósito.

La idea es crear un modelo de predicción que ayude a saber qué clientes tienen más posibilidades de decir “sí” al depósito a largo plazo. Así el banco podrá:

Enfocar sus esfuerzos en los clientes más propensos a aceptar.

Evitar gastar dinero y tiempo en llamadas poco efectivas.

Aumentar la tasa de éxito de las campañas.

El modelo se entrenará con datos de campañas anteriores, donde tenemos información del cliente (edad, trabajo, educación, si tiene préstamos, etc.), cómo se le contactó, y también algunos indicadores económicos del momento.

El objetivo final es que el banco pueda usar este modelo para priorizar a quién llamar primero y mejorar los resultados de sus campañas.

En este proyecto, lo más importante no es solo tener un modelo con alta precisión general, sino que logre identificar bien a los clientes que sí van a contratar. Es preferible llamar a alguien que finalmente no acepte, que dejar pasar a alguien que sí lo hubiera hecho.

In [21]:
df = pd.read_csv('/workspaces/marcoakai_machine_learning/data/raw/bank-marketing-campaign-data.csv', sep=';')
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [23]:
df.drop_duplicates()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [24]:
df['housing'].value_counts()

housing
yes        21576
no         18622
unknown      990
Name: count, dtype: int64

In [25]:
df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

Variable objetivo

In [26]:
y = df["y"].map({"no": 0, "yes": 1})
X = df.drop("y", axis=1)

# split

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Encoding

In [28]:
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

X_train_cat = encoder.fit_transform(X_train[cat_cols])
X_test_cat  = encoder.transform(X_test[cat_cols])

# Escalado

In [29]:
scaler = MinMaxScaler()

X_train_num = scaler.fit_transform(X_train[num_cols])
X_test_num  = scaler.transform(X_test[num_cols])

 # Dataset final

In [30]:
X_train_final = np.hstack([X_train_num, X_train_cat])
X_test_final  = np.hstack([X_test_num, X_test_cat])

# Modelo

In [31]:
model = LogisticRegression(max_iter=1000, class_weight="balanced")

model.fit(X_train_final, y_train)

y_pred = model.predict(X_test_final)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)
conf_matrix = confusion_matrix(y_test, y_pred)

accuracy, report, conf_matrix

(0.8514202476329206,
 {'0': {'precision': 0.9843998726520217,
   'recall': 0.8459644322845418,
   'f1-score': 0.9099470276633314,
   'support': 3655.0},
  '1': {'precision': 0.42433537832310836,
   'recall': 0.8943965517241379,
   'f1-score': 0.5755894590846047,
   'support': 464.0},
  'accuracy': 0.8514202476329206,
  'macro avg': {'precision': 0.704367625487565,
   'recall': 0.8701804920043399,
   'f1-score': 0.742768243373968,
   'support': 4119.0},
  'weighted avg': {'precision': 0.9213093348106487,
   'recall': 0.8514202476329206,
   'f1-score': 0.8722820818462571,
   'support': 4119.0}},
 array([[3092,  563],
        [  49,  415]]))

# OPTIMIZACIÓN

In [32]:
param_grid = {"C": [0.01, 0.1, 1, 10],
              "penalty": ["l2"],
              "solver": ["lbfgs"]}

grid = GridSearchCV(LogisticRegression(max_iter=1000, class_weight="balanced"), param_grid, scoring="recall", cv=5, n_jobs=-1)

grid.fit(X_train_final, y_train)

best_model = grid.best_estimator_
y_pred_opt = best_model.predict(X_test_final)

accuracy_opt = accuracy_score(y_test, y_pred_opt)
report_opt = classification_report(y_test, y_pred_opt, output_dict=True)
conf_matrix_opt = confusion_matrix(y_test, y_pred_opt)

accuracy_opt, report_opt, conf_matrix_opt

(0.8521485797523671,
 {'0': {'precision': 0.9844147582697201,
   'recall': 0.8467852257181943,
   'f1-score': 0.9104280041182526,
   'support': 3655.0},
  '1': {'precision': 0.4256410256410256,
   'recall': 0.8943965517241379,
   'f1-score': 0.5767894371091036,
   'support': 464.0},
  'accuracy': 0.8521485797523671,
  'macro avg': {'precision': 0.7050278919553729,
   'recall': 0.8705908887211661,
   'f1-score': 0.7436087206136781,
   'support': 4119.0},
  'weighted avg': {'precision': 0.9214696230573594,
   'recall': 0.8521485797523671,
   'f1-score': 0.8728440528941096,
   'support': 4119.0}},
 array([[3095,  560],
        [  49,  415]]))

# GUARDADO

In [33]:
with open("../models/logistic_no_onehot_corrected.sav", "wb") as f:
    pickle.dump(best_model, f)