---
title: "Análisis Predictivo: Desistimiento de Clientes"
author: "Fabio Marulanda"
format:
  html:
    theme: zephyr      
    toc: true          
    toc-location: left 
    code-fold: true  
execute:
  echo: true
  warning: false
  message: false
---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import lightgbm as lgb
import joblib
import plotly.express as px
import plotly.io as pio
import plotly.express as px
# Use the "notebook_connected" Plotly renderer for every figure shown below.
pio.renderers.default = "notebook_connected"
import warnings
# Silence FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)
# Apply seaborn's "whitegrid" style.
sns.set_style("whitegrid")

In [None]:
# Load the raw dataset from the Excel workbook into a DataFrame.
df = pd.read_excel(io="Base PRUEBA - ANALITICA (1).xlsx")

In [None]:
# EDA (exploratory data analysis): preview the first rows of the dataset.

df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# Descriptive statistics of the DataFrame.
df.describe()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count the rows flagged as duplicates.
df.duplicated().sum()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# List the distinct values of TIPO_CONTRATO.
df["TIPO_CONTRATO"].unique()

In [None]:
# Replace missing contract types with the explicit category "Otra".
df = df.assign(TIPO_CONTRATO=df["TIPO_CONTRATO"].fillna(value="Otra"))

In [None]:
# Re-count missing values per column after filling TIPO_CONTRATO.
df.isna().sum()

In [None]:
# List the distinct values of Estado.
df["Estado"].unique()


In [None]:
# Keep only the rows whose Estado is one of the final states listed here;
# .copy() detaches the result from the original frame.
estados_finales = ["Desistida", "Negada", "Aprobada","Anulada"]
df = df.loc[df["Estado"].isin(estados_finales)].copy()

In [None]:
# Collapse the four final states into the binary target labels and count
# how many rows fall in each one.
etiqueta_por_estado = {
    estado: "No Desiste" for estado in ("Aprobada", "Negada", "Anulada")
}
etiqueta_por_estado["Desistida"] = "Desiste"

counts = (
    df["Estado"]
    .map(etiqueta_por_estado)
    .value_counts()
    .rename_axis("Estado")
    .reset_index(name="Cantidad")
)

# Bar chart of the class balance, one fixed colour per label.
fig_balance = px.bar(
    counts,
    x="Estado",
    y="Cantidad",
    title="Balance de la Variable Objetivo",
    color="Estado",
    color_discrete_map={"Desiste": "#EF553B", "No Desiste": "#636EFA"},
    template="plotly_white",
)

fig_balance.show()

In [None]:
# Histogram of the row count per Estado, coloured by state.
fig_estados = px.histogram(
    df,
    x="Estado",
    color="Estado",
    title="<b>Distribución de Estados Finales</b>",
    template="plotly_white",
    color_discrete_sequence=px.colors.sequential.Viridis,
)

# Axis titles, bars ordered by descending total, and no legend.
fig_estados.update_layout(
    xaxis=dict(
        title="Estado de la Solicitud",
        categoryorder="total descending",
    ),
    yaxis=dict(title="Cantidad de Clientes"),
    showlegend=False,
)

fig_estados.show()

In [None]:
# Remove the columns that are not used as model inputs.
columnas_quitar = ["SOLICITUD", "FECHA_INICIO", "GENERO", "Marca producto"]
df = df.drop(columns=columnas_quitar)

In [None]:
# Binary target: 1 = customer withdrew ("Desistida"), 0 = any other state.
df["DESISTE"] = df["Estado"].eq("Desistida").astype(int)
# The raw state column is no longer needed once the target exists.
df = df.drop(columns="Estado")

In [None]:
# Feature engineering: derived financial indicators.
# The 1e-6 added to INGRESOS keeps the ratios defined when income is zero.
df = df.assign(
    # Income left after expenses.
    CAPACIDAD_PAGO=lambda d: d["INGRESOS"] - d["EGRESOS"],
    # Expenses relative to income.
    RATIO_ENDEUDAMIENTO=lambda d: d["EGRESOS"] / (d["INGRESOS"] + 1e-6),
    # Requested amount relative to income.
    RATIO_SOLICITUD_INGRESO=lambda d: d["VALOR_SOLICITADO"] / (d["INGRESOS"] + 1e-6),
    # Sum of the two ratios above.
    ESTRES_FINANCIERO=lambda d: d["RATIO_ENDEUDAMIENTO"] + d["RATIO_SOLICITUD_INGRESO"],
)



In [None]:
# Pre-processing: split the data and declare the column-wise transformers.

from sklearn.compose import ColumnTransformer, make_column_selector

# Target vector and feature matrix.
y = df["DESISTE"]
X = df.drop(columns="DESISTE")

# First split: hold out 20% of the rows as the test set, stratified on y.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Second split: hold out 20% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp
)

# Non-object columns: fill missing values with the median.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Object columns: fill missing values with "Desconocido", then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Desconocido")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column to its transformer according to its dtype.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, make_column_selector(dtype_exclude="object")),
    ("cat", categorical_transformer, make_column_selector(dtype_include="object")),
])



In [None]:

# Weight for the positive class (1 = Desiste): ratio of negatives to
# positives in the training split. The class counts are computed once.
class_counts = y_train.value_counts()
count_class_0 = class_counts[0]
count_class_1 = class_counts[1]
scale_pos_weight = count_class_0 / count_class_1

print(f"Peso aplicado a la clase 1 (Desiste): {scale_pos_weight:.2f}")

# Base LightGBM classifier.
# subsample_freq=1 is required for the `subsample` values explored below to
# take effect: LightGBM only performs row subsampling (bagging) when the
# bagging frequency is non-zero, and the default is 0.
lgbm = lgb.LGBMClassifier(
    objective="binary",
    metric="binary_logloss",
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
    subsample_freq=1,
)

# Pre-processing and classifier are chained so that the imputers and the
# encoder are re-fitted inside every cross-validation fold.
clf_pipeline_lgbm = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", lgbm),
])

# Candidate values sampled by the randomized search.
param_dist_lgbm = {
    "classifier__n_estimators": [300, 500, 800],
    "classifier__learning_rate": [0.01, 0.03, 0.1],
    "classifier__num_leaves": [20, 31, 50],
    "classifier__max_depth": [-1, 7, 10],
    "classifier__subsample": [0.7, 0.9, 1.0],
    "classifier__colsample_bytree": [0.7, 0.9, 1.0],
    "classifier__min_child_samples": [20, 50, 100],
    "classifier__reg_lambda": [0, 1, 5],
}

print("Entrenando y buscando mejores parámetros de LightGBM...")
# 30 sampled configurations, 5-fold cross-validation, ranked by F1.
random_search_lgbm = RandomizedSearchCV(
    clf_pipeline_lgbm,
    param_distributions=param_dist_lgbm,
    n_iter=30,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

random_search_lgbm.fit(X_train, y_train)

# Best pipeline found by the search.
best_model_lgbm = random_search_lgbm.best_estimator_
print(f"Mejores parámetros LightGBM: {random_search_lgbm.best_params_}")

In [None]:


# Tune the decision threshold on the validation split.
y_proba_val = best_model_lgbm.predict_proba(X_val)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_val, y_proba_val)

# F1 at every point of the precision-recall curve; nan_to_num maps the
# undefined 0/0 points to 0.
f1_scores = np.nan_to_num(2 * (precisions * recalls) / (precisions + recalls))

# Pick the threshold at the point with the highest F1.
best_threshold_idx = f1_scores.argmax()
best_threshold_lgbm = thresholds[best_threshold_idx]

print(
    f"\n--- 6. Umbral Óptimo LightGBM (validación) encontrado: {best_threshold_lgbm:.4f} ---",
    f"Precision @umbral_opt: {precisions[best_threshold_idx]:.4f}",
    f"Recall    @umbral_opt: {recalls[best_threshold_idx]:.4f}",
    f"F1        @umbral_opt: {f1_scores[best_threshold_idx]:.4f}",
    sep="\n",
)

In [None]:

# Evaluate on the held-out test split: positive-class probabilities are
# binarized with the threshold tuned on the validation split.
y_proba_test = best_model_lgbm.predict_proba(X_test)[:, 1]
y_pred_test_opt = np.greater_equal(y_proba_test, best_threshold_lgbm).astype(int)

print(classification_report(y_true=y_test, y_pred=y_pred_test_opt))

In [None]:
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix

# Confusion matrix on the test split. labels=[0, 1] keeps the matrix 2x2
# even when one class is missing from the test labels or the predictions,
# so the cell indexing below cannot fail.
cm = confusion_matrix(y_test, y_pred_test_opt, labels=[0, 1])

x_labels = ["Predicho: No Desiste", "Predicho: Desiste"]
y_labels = ["Real: No Desiste", "Real: Desiste"]


# Reverse the row order (and the row labels with it) so that the
# "Real: Desiste" row comes first in the data passed to the heatmap.
z = cm[::-1]
y_labels_adj = y_labels[::-1]

# Business-oriented annotation for every cell, laid out like z:
# first row = real "Desiste", second row = real "No Desiste".
z_text = [
    [f"Clientes Perdidos (FN): {z[0][0]}", f"Clientes Salvados (VP): {z[0][1]}"],
    [f"Correctos (VN): {z[1][0]}", f"Falsas Alarmas (FP): {z[1][1]}"]
]

# The descriptive annotations are the cell text; previously the raw counts
# were passed and z_text was built but never used.
fig_cm = ff.create_annotated_heatmap(
    z,
    x=x_labels,
    y=y_labels_adj,
    annotation_text=z_text,
    colorscale='Greens'
)

fig_cm.update_layout(
    title_text=f'<b>Impacto del Modelo LightGBM (Umbral: {best_threshold_lgbm:.2f})</b>',
    xaxis_title="Predicción del Modelo",
    yaxis_title="Realidad del Cliente",
    width=600,
    height=500,
    template="plotly_white"
)

fig_cm.show()

In [None]:
# Split the feature names with the same dtype rule used by the
# pre-processor's column selectors (object vs. everything else).
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

In [None]:

# Column names after pre-processing: numeric columns first, then the
# one-hot encoded columns (the order of the transformers in the
# ColumnTransformer).
feature_names_num = list(num_cols)

ohe = (
    best_model_lgbm.named_steps["preprocessor"]
    .named_transformers_["cat"]
    .named_steps["onehot"]
)
feature_names_cat = ohe.get_feature_names_out(cat_cols)

all_feature_names = np.concatenate((feature_names_num, feature_names_cat))

# Importances reported by the fitted classifier; keep the 15 largest.
importances = best_model_lgbm.named_steps["classifier"].feature_importances_
feature_imp = pd.Series(importances, index=all_feature_names)
feature_imp = feature_imp.sort_values(ascending=False).head(15)

df_imp = feature_imp.rename_axis("Variable").reset_index(name="Importancia")

fig_imp = px.bar(
    df_imp,
    x="Importancia",
    y="Variable",
    orientation="h",
    title="Top 15 Variables Predictoras (LightGBM)",
    color="Importancia",
    color_continuous_scale="Greens",
    template="plotly_white",
)

# Order the bars by importance.
fig_imp.update_layout(yaxis=dict(categoryorder="total ascending"))
fig_imp.show()


In [None]:
import shap

# Pull the fitted steps out of the best pipeline; SHAP is run on the
# classifier alone, so its input must be pre-processed first.
preprocessor = best_model_lgbm.named_steps["preprocessor"]
lgbm_model = best_model_lgbm.named_steps["classifier"]

# Explain at most 1000 test rows (reproducible sample).
X_test_sample = X_test.sample(n=min(1000, len(X_test)), random_state=42)
X_transformed = preprocessor.transform(X_test_sample)

explainer = shap.TreeExplainer(lgbm_model)
shap_values = explainer.shap_values(X_transformed)

# When shap_values is a list, use its element at index 1; otherwise plot
# the returned object as-is.
shap_to_plot = shap_values[1] if isinstance(shap_values, list) else shap_values

# Plot the SHAP summary.
shap.summary_plot(shap_to_plot, X_transformed, feature_names=all_feature_names)

In [None]:
# Bundle the fitted pipeline, the tuned decision threshold and the input
# column names into a single object and persist it with joblib.
modelo, best_threshold = best_model_lgbm, best_threshold_lgbm

artifact = dict(
    modelo=modelo,
    best_threshold=best_threshold,
    feature_cols=X.columns.to_list(),
)

joblib.dump(artifact, "modelo_desistimiento_lgbm.joblib")
print("Modelo guardado en 'modelo_desistimiento_lgbm.joblib'")