In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2
from scipy.spatial.distance import mahalanobis
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

In [41]:
# Location of the cleaned e-commerce dataset, relative to this notebook
ruta = "../datos/dataset_ecommerce_limpio.csv"
df = pd.read_csv(filepath_or_buffer=ruta)

# Leave `df` exactly as loaded; later cells work on this independent copy
datos = df.copy(deep=True)

In [42]:
# Preview the first five rows of the working copy
datos.head(n=5)

Unnamed: 0,Antiguedad,Distancia_Almacen,Numero_Dispositivos,Categoria_Preferida,Nivel_Satisfaccion,Estado_Civil,Numero_Direcciones,Queja,Dias_Ultima_Compra,Monto_Cashback,Target
0,15.0,29.0,4,laptop & accessory,3,single,2,0,7.0,143.32,0
1,7.0,25.0,4,mobile,1,married,2,0,7.0,129.29,0
2,27.0,13.0,3,laptop & accessory,1,married,5,0,7.0,168.54,0
3,20.0,25.0,4,fashion,3,divorced,7,0,3.0,230.27,0
4,30.0,15.0,4,others,4,single,8,0,8.0,322.17,0


In [43]:
# Preview the last five rows of the working copy
datos.tail(n=5)

Unnamed: 0,Antiguedad,Distancia_Almacen,Numero_Dispositivos,Categoria_Preferida,Nivel_Satisfaccion,Estado_Civil,Numero_Direcciones,Queja,Dias_Ultima_Compra,Monto_Cashback,Target
3936,28.0,9.0,5,fashion,3,married,8,0,1.0,231.86,0
3937,8.0,7.0,2,mobile phone,2,single,4,0,4.0,157.8,0
3938,30.0,6.0,5,laptop & accessory,3,married,3,1,2.0,156.6,0
3939,6.0,14.0,4,mobile,3,married,10,1,0.0,124.37,1
3940,2.0,7.0,3,laptop & accessory,5,married,1,0,2.0,153.73,0


# Procesamiento del dataset

In [44]:
# Multivariate outlier removal based on the Mahalanobis distance.
# NOTE: this cell rebinds `datos`, so re-running it without re-running the load
# cell applies the filter again on already-filtered data.

# Numeric columns that take part in the distance computation
vars_mahalanobis = ['Antiguedad', 'Monto_Cashback', 'Distancia_Almacen', 'Dias_Ultima_Compra']
# Rows whose p-value falls below this threshold are treated as outliers
umbral_p = 0.001

datos_temp = datos.copy()

# Null handling: fill each column with its median so the distance is defined for
# every row. The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the column selection: that chained in-place form
# raises a FutureWarning and stops modifying the frame in pandas 3.0.
# NOTE: the imputed values are kept in the frame that ends up bound to `datos`.
for col in vars_mahalanobis:
    median_val = datos[col].median()
    datos_temp[col] = datos_temp[col].fillna(median_val)

# Covariance matrix, its inverse and the mean vector (all from the imputed frame)
df_mahal_ref = datos_temp[vars_mahalanobis]
cov_matrix = np.cov(df_mahal_ref.values.T)
inv_cov_matrix = np.linalg.inv(cov_matrix)
mean_dist = df_mahal_ref.mean().values

# Squared Mahalanobis distance for all rows at once:
#   d_i^2 = (x_i - mean)^T * inv_cov * (x_i - mean)
# This replaces a row-by-row `apply` with a single vectorised contraction.
desviaciones = df_mahal_ref.to_numpy(dtype=float) - mean_dist
dist_cuadrada = np.einsum('ij,jk,ik->i', desviaciones, inv_cov_matrix, desviaciones)
datos_temp['Mahalanobis_Dist'] = np.sqrt(dist_cuadrada)

# P-value: upper tail of a chi-square distribution with as many degrees of
# freedom as variables. `chi2.sf` is the survival function (1 - cdf) evaluated
# directly, which avoids the cancellation of `1 - chi2.cdf(...)` for tiny tails.
datos_temp['Mahalanobis_P'] = chi2.sf(dist_cuadrada, len(vars_mahalanobis))

# Flag the outliers
outliers_mask = datos_temp['Mahalanobis_P'] < umbral_p
num_outliers = outliers_mask.sum()

# The "before" shape still includes the two auxiliary Mahalanobis columns
print(f"Dimensiones ANTES de la limpieza: {datos_temp.shape}")
print(f"Se encontraron y eliminarán {num_outliers} outliers multivariados.")

# Keep only the rows that are NOT outliers and drop the auxiliary columns;
# `drop` returns a new frame, so `datos_temp` itself is left untouched.
datos_limpios = datos_temp.loc[~outliers_mask].drop(columns=['Mahalanobis_Dist', 'Mahalanobis_P'])

print(f"Dimensiones DESPUÉS de la limpieza: {datos_limpios.shape}")

# Point the main working variable at the cleaned version
datos = datos_limpios

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  datos_temp[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  datos_temp[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

Dimensiones ANTES de la limpieza: (3941, 13)
Se encontraron y eliminarán 15 outliers multivariados.
Dimensiones DESPUÉS de la limpieza: (3926, 11)


In [45]:
# Row count per preferred product category
(
    datos.loc[:, "Categoria_Preferida"]
    .astype(dtype="category")
    .value_counts()
)

Categoria_Preferida
laptop & accessory    1453
mobile phone           884
fashion                584
mobile                 558
grocery                268
others                 179
Name: count, dtype: int64

In [46]:
# Row count per marital status
(
    datos.loc[:, "Estado_Civil"]
    .astype(dtype="category")
    .value_counts()
)

Estado_Civil
married     2046
single      1307
divorced     573
Name: count, dtype: int64

# XGBOOST

In [47]:
# Columns kept for modelling: four predictors plus the `Target` label
cols_elegidas = [
    'Categoria_Preferida',
    'Estado_Civil',
    'Queja',
    'Antiguedad',
    'Target',
]
# Independent copy so later dtype changes do not touch `datos`
df_model = datos.loc[:, cols_elegidas].copy()

In [53]:
# Cast the two text columns to pandas' `category` dtype in a single call
df_model = df_model.astype({
    'Categoria_Preferida': 'category',
    'Estado_Civil': 'category',
})

In [55]:
# Label vector and feature matrix (every column except `Target`)
y = df_model.loc[:, 'Target']
X = df_model.drop(columns=['Target'])

In [75]:
# 70/30 train/test split, stratified on the label, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [76]:
# Formula: number of non-churn rows (0) divided by number of churn rows (1)
n_no_churn = y_train.eq(0).sum()
n_churn = y_train.eq(1).sum()
ratio_balanceo = n_no_churn / n_churn
print(f"Ratio de balanceo calculado: {ratio_balanceo:.2f}")

Ratio de balanceo calculado: 4.83


In [77]:
# Hyper-parameters collected in one mapping so they can be inspected before fitting
parametros_xgb = {
    "tree_method": "hist",               # needed for efficient handling of categorical features
    "enable_categorical": True,          # accept pandas `category` columns directly
    "scale_pos_weight": ratio_balanceo,  # positive-class weight taken from `ratio_balanceo`
    "max_depth": 4,
    "random_state": 42,
}
modelo = xgb.XGBClassifier(**parametros_xgb)
modelo.fit(X_train, y_train)

In [78]:
# Print a header, then the per-class metrics of the fitted model on the test split
print("\n--- XGBoost con Soporte Nativo de Categorías ---")
predicciones_test = modelo.predict(X_test)
reporte = classification_report(y_test, predicciones_test)
print(reporte)


--- XGBoost con Soporte Nativo de Categorías ---
              precision    recall  f1-score   support

           0       0.95      0.82      0.88       976
           1       0.47      0.78      0.59       202

    accuracy                           0.81      1178
   macro avg       0.71      0.80      0.73      1178
weighted avg       0.87      0.81      0.83      1178

