## Importar librerias

In [1]:
# Importar las bibliotecas necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
import joblib

##Cargar datos

In [2]:
# Cargar el conjunto de datos
url = '/content/drive/MyDrive/2.Proyectos ML/Análisis prescriptivo/bank-full.csv'
data = pd.read_csv(url, sep=';')

In [3]:
data.shape

(45211, 17)

In [4]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
data.isna().sum()

Unnamed: 0,0
age,0
job,0
marital,0
education,0
default,0
balance,0
housing,0
loan,0
contact,0
day,0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


## Preprocesamiento de datos

In [7]:
# Eliminar columnas irrelevantes
data = data.drop(columns=['duration', 'balance', 'pdays', 'previous'])

In [8]:
# Manejar valores desconocidos
data.replace('unknown', np.nan, inplace=True)
data.dropna(inplace=True)

In [9]:
# Codificar variables categóricas
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
  le = LabelEncoder()
  data[column] = le.fit_transform(data[column])
  label_encoders[column] = le

In [10]:
# Separar características y variable objetivo
X = data.drop(columns=['y'])
y = data['y']

In [11]:
# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Estandarizar las características
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Entrenar modelo

In [13]:
# Entrenar modelos

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_acc = accuracy_score(y_test, xgb_pred)

# LightGBM
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)
lgb_acc = accuracy_score(y_test, lgb_pred)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 1424, number of negative: 4849
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 163
[LightGBM] [Info] Number of data points in the train set: 6273, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.227005 -> initscore=-1.225303
[LightGBM] [Info] Start training from score -1.225303


## Evaluar modelo

In [14]:
evaluaciones = {'Random Forest': rf_acc, 'XGBoost': xgb_acc, 'LightGBM': lgb_acc}
mejor_modelo = max(evaluaciones, key=evaluaciones.get)

print("Exactitud de los modelos:")
print(f"Random Forest: {rf_acc:.4f}")
print(f"XGBoost: {xgb_acc:.4f}")
print(f"LightGBM: {lgb_acc:.4f}")
print(f"\n El mejor modelo es: {mejor_modelo}")

Exactitud de los modelos:
Random Forest: 0.8171
XGBoost: 0.8126
LightGBM: 0.8196

 El mejor modelo es: LightGBM


In [15]:
# Mostrar reporte de clasificación del mejor modelo
if mejor_modelo == 'Random Forest':
  print("\nReporte de clasificación para Random Forest:")
  print(classification_report(y_test, rf_pred))
  joblib.dump(rf_model, 'mejor_modelo.pkl')
elif mejor_modelo == 'XGBoost':
  print("\nReporte de clasificación para XGBoost:")
  print(classification_report(y_test, xgb_pred))
  joblib.dump(xgb_model, 'mejor_modelo.pkl')
else:
  print("\nReporte de clasificación para LightGBM:")
  print(classification_report(y_test, lgb_pred))
  joblib.dump(lgb_model, 'mejor_modelo.pkl')


Reporte de clasificación para LightGBM:
              precision    recall  f1-score   support

           0       0.86      0.91      0.89      1207
           1       0.64      0.51      0.57       362

    accuracy                           0.82      1569
   macro avg       0.75      0.71      0.73      1569
weighted avg       0.81      0.82      0.81      1569



## Guardar modelo

In [16]:
joblib.dump(lgb_model, 'modelo_marketing.pkl') #se especifica el modelo mejor calificado
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

['label_encoders.pkl']

## Probar el modelo

In [17]:
# Función para recomendar acciones

def recomendar_accion(nuevo_cliente):
  # Cargar el modelo y los objetos de preprocesamiento
  modelo = joblib.load('modelo_marketing.pkl')
  scaler = joblib.load('scaler.pkl')
  label_encoders = joblib.load('label_encoders.pkl')

  # Crear un DataFrame para el nuevo cliente
  nuevo_cliente_df = pd.DataFrame([nuevo_cliente])

  # Codificar variables categóricas
  for column, le in label_encoders.items():
    if column in nuevo_cliente_df.columns:
      nuevo_cliente_df[column] = le.transform(nuevo_cliente_df[column])

  # Estandarizar las características
  nuevo_cliente_scaled = scaler.transform(nuevo_cliente_df)

  # Predecir la probabilidad de suscripción
  probabilidad = modelo.predict_proba(nuevo_cliente_scaled)[0][1]

  # Recomendar acción basada en un umbral
  umbral = 0.7
  if probabilidad >= umbral:
    return f"SI contactar al cliente. Probabilidad de suscripción: {probabilidad:.2f}"
  else:
    return f"NO contactar al cliente. Probabilidad de suscripción: {probabilidad:.2f}"

In [18]:
# Datos de entrada de un potencial cliente
cliente_nuevo = {
  'age': 30,
  'job': 'management',
  'marital': 'married',
  'education': 'secondary',
  'default': 'no',
  'housing':  'yes',
  'loan': 'no',
  'contact': 'cellular',
  'day': '7',
  'month': 'may',
  'campaign': 1,
  'poutcome': 'success'
}

print(recomendar_accion(cliente_nuevo))

NO contactar al cliente. Probabilidad de suscripción: 0.40


