In [11]:
import sys
import os
sys.path.append(os.path.abspath(".."))
import src.support_functions as sf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                           confusion_matrix, classification_report, roc_auc_score, roc_curve)

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

# Global plotting defaults. The style sheet is applied first because it
# overwrites rcParams; the figure size and colour palette are set on top of it.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_palette("husl")

In [12]:
# Columns removed before modelling: seller / campaign / location identifiers,
# campaign numbers, fine-grained geography, the year and the seller's age.
# NOTE(review): the `.1` ... `.5` suffixes look like pandas' de-duplication of
# repeated CSV headers -- confirm against the file that produces the CSV.
COLUMNS_TO_DROP = [
    'ID_VENDEDOR',
    'ID_CAMPANA',
    'ID_VENDEDOR.1',
    'ID_CAMPANA.1',
    'ID_VENDEDOR.2',
    'ID_CAMPANA.2',
    'ID_UBICACION',
    'ID_VENDEDOR.3',
    'ID_CAMPANA.3',
    'ID_VENDEDOR.4',
    'ID_CAMPANA.4',
    'ID_VENDEDOR.5',
    'ID_CAMPANA.5',
    'NUMERO_CAMPANA',
    'NUMERO_CAMPANA.1',
    'PROVINCIA',
    'DISTRITO',
    'ANIO',
    'EDAD_VENDEDORA',
]

# Load and clean in a single chain so the cell is idempotent (no in-place
# mutation): drop unused columns, floor negative tenure at 0, then discard
# every row that still has a missing value.
data = (
    pd.read_csv('../data/CHURN_FEATURES.csv')
    .drop(columns=COLUMNS_TO_DROP)
    .assign(ANTIGUEDAD_MESES=lambda df: df['ANTIGUEDAD_MESES'].clip(lower=0))
    .dropna()
)

In [13]:
print("\n🔧 PREPARACIÓN DE DATOS:")

# One-hot encode the department, keeping every level (drop_first=False).
data = pd.get_dummies(data, columns=['DEPARTAMENTO'], drop_first=False)

# Binary-encode sex; any value other than 'F' / 'M' falls back to 0.
data['SEXO'] = data['SEXO'].map({'F': 0, 'M': 1}).fillna(0)

# Binary-encode the seller type. The key must be the correctly encoded
# 'Líder': a mis-encoded key never matches and silently yields NaN, so any
# value left unmapped is reported here instead of surfacing later as a NaN
# error inside a model fit.
tipo_vendedor = data['TIPO_VENDEDOR'].map({'Asesora': 0, 'Líder': 1})
unmapped = data.loc[tipo_vendedor.isna(), 'TIPO_VENDEDOR'].unique().tolist()
if unmapped:
    raise ValueError(f"Valores de TIPO_VENDEDOR sin mapear: {unmapped}")
data['TIPO_VENDEDOR'] = tipo_vendedor

# Split features from the target.
X = data.drop(columns='TARGET_CHURN')
y = data['TARGET_CHURN']

# 80/20 train/test split with a fixed seed.
# NOTE(review): the split is not stratified on `y`; consider `stratify=y`
# (this would change the reported numbers).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print(f"   • Tamaño conjunto entrenamiento: {X_train.shape}")
print(f"   • Tamaño conjunto prueba: {X_test.shape}")

# Standardise features: statistics are fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("   • Características escaladas ✅")



🔧 PREPARACIÓN DE DATOS:
   • Tamaño conjunto entrenamiento: (13648, 46)
   • Tamaño conjunto prueba: (3412, 46)
   • Características escaladas ✅


In [14]:
# =================================
# ⚖️ CLASS BALANCING
# =================================

print("\n⚖️  BALANCEADO DE DATOS:")

# Minority/majority ratio below which the training set is resampled.
BALANCE_THRESHOLD = 0.8

# Ratio between the rarest and the most frequent class in the training target.
class_counts = y_train.value_counts()
balance_ratio = class_counts.min() / class_counts.max()
print(f"   • Ratio de balance: {balance_ratio:.3f}")

if balance_ratio < BALANCE_THRESHOLD:
    print("   • Aplicando SMOTE + Tomek Links...")

    # Resample the scaled training split only; the test split is not touched.
    smote_tomek = SMOTETomek(random_state=42)
    X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train_scaled, y_train)

    print("   • Datos balanceados:")
    print(f"     - Antes: {y_train.value_counts().to_dict()}")
    print(f"     - Después: {pd.Series(y_train_balanced).value_counts().to_dict()}")

    # Plot the class distribution after resampling.
    sf.plot_class_balance(pd.Series(y_train_balanced), "Distribución Después del Balanceado")

    # Downstream cells train on the resampled data.
    X_train_final = X_train_balanced
    y_train_final = y_train_balanced
else:
    # Classes are close enough: train on the scaled data as-is.
    print("   • No es necesario balancear los datos")
    X_train_final = X_train_scaled
    y_train_final = y_train




⚖️  BALANCEADO DE DATOS:
   • Ratio de balance: 0.812
   • No es necesario balancear los datos


# ENTRENAMIENTO

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd

print("\n⚖️  OPTIMIZACIÓN DE HIPERPARÁMETROS:")

# Hyper-parameter grid per model; keys match those of `modelos_gridsearch`.
params = {
    "SVM": {
        "kernel": ["rbf"],
        "C": [0.1, 1, 10],
        "gamma": ["scale", "auto", 0.1, 0.01]
    },
    "RF": {
        "n_estimators": [100, 200],
        "max_depth": [5, 10, None],
        "min_samples_split": [2, 5, 10]
    },
    "XGB": {
        "n_estimators": [100, 200],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.1, 0.2],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0]
    }
}

# Base estimators, all seeded for reproducibility.
modelos_gridsearch = {
    # probability=True so the fitted SVM exposes predict_proba.
    "SVM": SVC(random_state=42, probability=True),
    "RF": RandomForestClassifier(random_state=42),
    # `use_label_encoder` is no longer passed: it is deprecated in recent
    # XGBoost releases and newer ones ignore it with a warning.
    "XGB": XGBClassifier(random_state=42, eval_metric='logloss')
}

# Best estimator, its parameters and its test-set predictions, per model name.
models_results = {}

# 5-fold grid search per model, selecting on F1; the refitted best estimator
# then predicts the held-out (scaled) test split.
for name, model in modelos_gridsearch.items():
    print(f"\n🔍 Modelo: {name}")
    print(f"   Parámetros: {params[name]}")
    gs = GridSearchCV(model, params[name], cv=5, n_jobs=-1, scoring='f1')
    gs.fit(X_train_final, y_train_final)

    best_model = gs.best_estimator_
    best_params = gs.best_params_
    preds = best_model.predict(X_test_scaled)

    models_results[name] = {
        "model": best_model,
        "params": best_params,
        "predictions": preds
    }

    print(f"✅ Mejor combinación: {best_params}")
    print(f"📈 Mejor F1-Score (cv): {gs.best_score_:.4f}")



⚖️  OPTIMIZACIÓN DE HIPERPARÁMETROS:

🔍 Modelo: SVM
   Parámetros: {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto', 0.1, 0.01]}
✅ Mejor combinación: {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
📈 Mejor F1-Score (cv): 0.7491

🔍 Modelo: RF
   Parámetros: {'n_estimators': [100, 200], 'max_depth': [5, 10, None], 'min_samples_split': [2, 5, 10]}
✅ Mejor combinación: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
📈 Mejor F1-Score (cv): 0.7393

🔍 Modelo: XGB
   Parámetros: {'n_estimators': [100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.2], 'subsample': [0.8, 1.0], 'colsample_bytree': [0.8, 1.0]}
✅ Mejor combinación: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
📈 Mejor F1-Score (cv): 0.7480


In [None]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix, ConfusionMatrixDisplay
)
import pandas as pd
import matplotlib.pyplot as plt

# =================================
# 📊 MODEL EVALUATION
# =================================

print("\n📊 EVALUACIÓN DE MODELOS:")

# One metrics dict per evaluated model, in evaluation order.
evaluation_results = []

def evaluate_model(model, X_test, y_test, preds, name):
    """Compute the test-set metrics of one fitted classifier.

    Args:
        model: Fitted estimator; must expose ``predict_proba`` or
            ``decision_function`` (used for ROC-AUC only).
        X_test: Test features passed to the model to obtain ranking scores.
        y_test: True test labels.
        preds: Predicted labels for ``X_test``.
        name: Label stored under the ``"Model"`` key.

    Returns:
        dict with keys ``"Model"``, ``"Accuracy"``, ``"Precision"``,
        ``"Recall"``, ``"F1-Score"`` and ``"ROC-AUC"``.
    """
    summary = {"Model": name}

    # Label-based metrics come straight from the supplied predictions.
    summary["Accuracy"] = accuracy_score(y_test, preds)
    summary["Precision"] = precision_score(y_test, preds)
    summary["Recall"] = recall_score(y_test, preds)
    summary["F1-Score"] = f1_score(y_test, preds)

    # ROC-AUC needs a continuous score: use the second predict_proba column
    # when the model offers it, otherwise the decision function.
    if hasattr(model, "predict_proba"):
        ranking_scores = model.predict_proba(X_test)[:, 1]
    else:
        ranking_scores = model.decision_function(X_test)
    summary["ROC-AUC"] = roc_auc_score(y_test, ranking_scores)

    return summary

# Metrics reported for every model, in display order.
METRIC_NAMES = ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC')

for model_name, results in models_results.items():
    model = results['model']
    preds = results['predictions']

    # Collect the overall metrics for the summary table.
    metrics = evaluate_model(model, X_test_scaled, y_test, preds, model_name)
    evaluation_results.append(metrics)

    # === Overall metrics ===
    print(f"\n🔸 {model_name}:")
    for metric in METRIC_NAMES:
        # Label (with colon) padded to 10 characters so the values line up.
        print(f"   • {metric + ':':<10} {metrics[metric]:.4f}")

    # === Detailed per-class report ===
    print(f"\n📄 Classification Report ({model_name}):")
    print(classification_report(y_test, preds, digits=4))


# === Summary table: one row per model, metrics rounded to 4 decimals ===
results_df = pd.DataFrame(evaluation_results).set_index('Model').round(4)

print("\n📋 TABLA COMPARATIVA DE RESULTADOS:")
print("=" * 60)
print(results_df)

# === Best model per metric ===
# Note: after this loop `best_model` holds the NAME of the model with the
# highest value of the last metric (ROC-AUC), not an estimator object.
print("\n🏆 MEJORES MODELOS POR MÉTRICA:")
for metric in METRIC_NAMES:
    best_model = results_df[metric].idxmax()
    best_score = results_df[metric].max()
    print(f"   • {metric:<10}: {best_model} ({best_score:.4f})")



📊 EVALUACIÓN DE MODELOS:

🔸 SVM:
   • Accuracy:  0.6761
   • Precision: 0.6515
   • Recall:    0.8797
   • F1-Score:  0.7486
   • ROC-AUC:   0.7472

📄 Classification Report (SVM):
              precision    recall  f1-score   support

           0     0.7463    0.4293    0.5451      1542
           1     0.6515    0.8797    0.7486      1870

    accuracy                         0.6761      3412
   macro avg     0.6989    0.6545    0.6468      3412
weighted avg     0.6944    0.6761    0.6566      3412


🔸 RF:
   • Accuracy:  0.6782
   • Precision: 0.6731
   • Recall:    0.8027
   • F1-Score:  0.7322
   • ROC-AUC:   0.7504

📄 Classification Report (RF):
              precision    recall  f1-score   support

           0     0.6878    0.5272    0.5969      1542
           1     0.6731    0.8027    0.7322      1870

    accuracy                         0.6782      3412
   macro avg     0.6805    0.6650    0.6646      3412
weighted avg     0.6797    0.67