In [1]:
# ===== 0) Imports y utilidades =====
import os, json, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Seed shared by the train/test split and the estimators defined further down
RANDOM_STATE = 42

# (Optional) class balancing: imbalanced-learn is imported only when enabled
USE_SMOTE = False
if USE_SMOTE:
    from imblearn.pipeline import Pipeline as ImbPipeline
    from imblearn.over_sampling import SMOTE

def metrics_report(y_true, y_pred, y_proba=None, title="Model"):
    """Print classification metrics and plot the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_proba : array-like, optional
        Scores passed to ``roc_auc_score``; ROC-AUC is printed only when
        this argument is given.
    title : str
        Label used in the printed header and in the figure title.

    Returns
    -------
    None
        Everything is reported through ``print`` and a matplotlib figure.
    """
    print(f"\n=== {title} ===")
    print(classification_report(y_true, y_pred, digits=3))

    # precision / recall / F1 are all computed with zero_division=0
    zero_div = {"zero_division": 0}
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, **zero_div)
    rec = recall_score(y_true, y_pred, **zero_div)
    f1 = f1_score(y_true, y_pred, **zero_div)
    summary_lines = (
        f"Accuracy:  {acc:.4f}",
        f"Precision: {prec:.4f}",
        f"Recall:    {rec:.4f}",
        f"F1-score:  {f1:.4f}",
    )
    for line in summary_lines:
        print(line)

    if y_proba is not None:
        auc = roc_auc_score(y_true, y_proba)
        print(f"ROC-AUC:   {auc:.4f}")

    # Confusion matrix rendered as an annotated heatmap
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt="d", cbar=False, ax=ax)
    ax.set_title(f"Confusion Matrix - {title}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    fig.tight_layout()
    plt.show()

def column_drop_if_exists(df, cols):
    """Drop the requested columns that are actually present in ``df``.

    Names in ``cols`` that are not columns of ``df`` are ignored.

    Returns
    -------
    tuple
        ``(new_df, removed)`` where ``new_df`` is the result of
        ``df.drop`` and ``removed`` is the list of names that were dropped,
        in the order they appear in ``cols``.
    """
    removed = list(filter(lambda name: name in df.columns, cols))
    trimmed = df.drop(columns=removed)
    return trimmed, removed


In [None]:
# ===== 1) Upload and load the treated CSV =====
from google.colab import files
import pandas as pd

# Upload a file from the local machine
uploaded = files.upload()

# A cancelled upload leaves nothing to index; fail with an explicit message
# instead of an IndexError on the first key.
if not uploaded:
    raise ValueError("No se subió ningún archivo; vuelve a ejecutar la celda y selecciona un CSV.")

# Name under which the first uploaded file is keyed
csv_filename = next(iter(uploaded))

# Load the DataFrame
df = pd.read_csv(csv_filename)

print("Dimensiones:", df.shape)
df.head()


In [None]:
# ===== 2) Drop irrelevant columns =====
# Adjust this list to your dataset
irrelevantes = [
    "customerid", "customer_customerid", "id", "customer_id",
    # if both a textual 'churn' and a numeric 'churn_bin' remain, churn_bin is used
]

df, dropped = column_drop_if_exists(df, irrelevantes)
print("Columnas eliminadas:", dropped)

# Build the binary target (0/1). Labels are coerced first and cast to int only
# after rows without a usable label are removed: casting NaN to int raises.
if "churn_bin" in df.columns:
    target = pd.to_numeric(df["churn_bin"], errors="coerce")
elif "churn" in df.columns:
    target = df["churn"].map({"Yes": 1, "No": 0})
else:
    raise ValueError("No se encontró columna objetivo ('churn_bin' o 'churn').")

valid_target = target.notna()
if not valid_target.any():
    raise ValueError(
        "La columna objetivo no contiene valores válidos (0/1 o 'Yes'/'No')."
    )

n_invalid = int((~valid_target).sum())
if n_invalid:
    # Keep df and the target aligned by removing the same rows from both
    print(f"Filas eliminadas por objetivo faltante o no reconocido: {n_invalid}")
    df = df.loc[valid_target].copy()
    target = target.loc[valid_target]

y = target.astype(int)
if "churn_bin" not in df.columns:
    # optional: keep churn_bin alongside the textual column
    df["churn_bin"] = y

# Remove the target column(s) from the feature set
X = df.drop(columns=[c for c in ["churn", "churn_bin"] if c in df.columns])

print("X shape:", X.shape, " | y shape:", y.shape)


In [None]:
# ===== 3) Encoding and churn proportion =====
def make_one_hot_encoder():
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn renamed the ``sparse`` argument to ``sparse_output``; the
    old keyword raises ``TypeError`` on releases that removed it. The new
    keyword is tried first and the old one is used only as a fallback for
    older installations.
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)


cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number, "bool"]).columns.tolist()

print("Categóricas:", cat_cols)
print("Numéricas:", num_cols)

# Class proportions
print("\nProporción de churn:")
print(y.value_counts(normalize=True).rename({0:"No", 1:"Yes"}).round(3))

# Preprocessors:
# - with scaling (for LR / KNN / SVM)
preprocess_norm = ColumnTransformer(
    transformers=[
        ("cat", make_one_hot_encoder(), cat_cols),
        ("num", StandardScaler(), num_cols),
    ],
    remainder="drop"
)

# - without scaling (for trees / RF / XGB)
preprocess_no_norm = ColumnTransformer(
    transformers=[
        ("cat", make_one_hot_encoder(), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop"
)


In [None]:
# ===== 4) (Optional) Class balancing =====
# When SMOTE is enabled it is applied ONLY to the training set (through the pipeline).
# For a quick baseline, class_weight='balanced' can be used on linear models.

# True applies class_weight to LR; RF does not use it (adjust if needed).
USE_CLASS_WEIGHT = True

balancing_summary = ("SMOTE activado:", USE_SMOTE, "| class_weight en LR:", USE_CLASS_WEIGHT)
print(*balancing_summary)


In [None]:
# ===== 5) Correlation and targeted analysis =====
# Correlation heatmap (numeric columns only)
corr = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Matriz de correlación (numéricas)")
fig.tight_layout()
plt.show()

# Targeted analysis: example columns, restricted to those present in the CSV
candidate_names = (
    "customer_tenure",
    "account_charges_monthly",
    "account_charges_total",
    "cuentas_diarias",
)
cand_num = [c for c in candidate_names if c in df.columns]

# One boxplot per candidate column, split by the churn label
for col in cand_num:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.boxplot(x=y, y=df[col], ax=ax)
    ax.set_title(f"{col} vs Churn (0=No, 1=Yes)")
    fig.tight_layout()
    plt.show()


In [None]:
# ===== 6) Train/Test split =====
# Stratified 80/20 split with a fixed seed
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = split_parts
print(*(part.shape for part in split_parts))


In [None]:
# ===== 7) Models =====

# (A) Scale-sensitive model: Logistic Regression
# The estimator is the same in both branches; only the surrounding pipeline changes.
lr_estimator = LogisticRegression(
    max_iter=1000,
    class_weight=("balanced" if USE_CLASS_WEIGHT else None),
    random_state=RANDOM_STATE,
)

if USE_SMOTE:
    # Pipeline with a SMOTE step between preprocessing and the classifier
    from imblearn.pipeline import Pipeline as ImbPipeline
    model_lr = ImbPipeline(steps=[
        ("prep", preprocess_norm),
        ("smote", SMOTE(random_state=RANDOM_STATE)),
        ("clf", lr_estimator),
    ])
else:
    model_lr = Pipeline(steps=[
        ("prep", preprocess_norm),
        ("clf", lr_estimator),
    ])

# (B) Scale-insensitive model: Random Forest
# (n_estimators / max_depth can be tuned)
rf_estimator = RandomForestClassifier(
    n_estimators=300,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
model_rf = Pipeline(steps=[
    ("prep", preprocess_no_norm),
    ("clf", rf_estimator),
])

# Fit both pipelines on the training split
for fitted_pipeline in (model_lr, model_rf):
    fitted_pipeline.fit(X_train, y_train)

# Test-set predictions and the second column of predict_proba
pred_lr = model_lr.predict(X_test)
proba_lr = model_lr.predict_proba(X_test)[:, 1]

pred_rf = model_rf.predict(X_test)
proba_rf = model_rf.predict_proba(X_test)[:, 1]


In [None]:
# ===== 8) Evaluation =====
# (predictions, scores, report title, ROC figure title) for each fitted model
evaluations = (
    (pred_lr, proba_lr, "Logistic Regression", "ROC - Logistic Regression"),
    (pred_rf, proba_rf, "Random Forest", "ROC - Random Forest"),
)

for predicted, scores, report_title, roc_title in evaluations:
    metrics_report(y_test, predicted, scores, title=report_title)
    RocCurveDisplay.from_predictions(y_test, scores)
    plt.title(roc_title)
    plt.show()


In [None]:
# ===== 9) Feature importance =====
def get_feature_names(preprocessor, input_features=None):
    """Return the feature names produced by a fitted ColumnTransformer.

    Walks ``preprocessor.transformers_`` in order and concatenates the names
    each entry contributes.

    Parameters
    ----------
    preprocessor : object
        Fitted transformer exposing ``transformers_`` as an iterable of
        ``(name, transformer, columns)`` triples.
    input_features : sequence, optional
        Column names of the data the preprocessor was fitted on. When given,
        integer column specifiers are translated to the name at that position.

    Returns
    -------
    numpy.ndarray
        Feature names in output order.
    """
    names = []
    for _name, trans, cols in preprocessor.transformers_:
        # Dropped columns never reach the estimator, whatever the entry is called.
        if isinstance(trans, str) and trans == "drop":
            continue

        # A single column given as a bare string must not be iterated per character.
        cols = [cols] if isinstance(cols, str) else list(cols)

        # Entries with no columns are skipped: they contribute nothing, and
        # asking an unfitted transformer for its output names raises.
        if len(cols) == 0:
            continue

        if input_features is not None:
            # Positional specifiers -> names (bool is excluded: it subclasses int).
            cols = [
                input_features[c]
                if isinstance(c, (int, np.integer)) and not isinstance(c, bool)
                else c
                for c in cols
            ]

        if hasattr(trans, "get_feature_names_out"):
            feats = trans.get_feature_names_out(cols)
        else:
            # e.g. the "passthrough" marker: names are unchanged
            feats = cols
        names.extend(feats)
    return np.array(names)

# (A) Logistic Regression coefficients, ranked by absolute value
pre_lr = model_lr.named_steps["prep"]
clf_lr = model_lr.named_steps["clf"]
feat_names_lr = get_feature_names(pre_lr, X.columns)

coef_lr = pd.Series(clf_lr.coef_.ravel(), index=feat_names_lr)
coef_lr = coef_lr.sort_values(key=np.abs, ascending=False)
top_coef_lr = coef_lr.head(15)

print("\nTop 15 | Coeficientes (LR) por magnitud:")
display(top_coef_lr)

# Reverse the order so the largest coefficient is drawn at the top of the chart
fig, ax = plt.subplots(figsize=(6, 5))
top_coef_lr.iloc[::-1].plot(kind="barh", ax=ax)
ax.set_title("Top 15 | Importancia (coef.) - Logistic Regression")
fig.tight_layout()
plt.show()

# (B) Random Forest importances, sorted ascending so the top ones come last
pre_rf = model_rf.named_steps["prep"]
clf_rf = model_rf.named_steps["clf"]
feat_names_rf = get_feature_names(pre_rf, X.columns)

imp_rf = pd.Series(clf_rf.feature_importances_, index=feat_names_rf)
imp_rf = imp_rf.sort_values(ascending=True)
top_imp_rf = imp_rf.tail(15)

print("\nTop 15 | Importancias (RF):")
display(top_imp_rf)

fig, ax = plt.subplots(figsize=(6, 5))
top_imp_rf.plot(kind="barh", ax=ax)
ax.set_title("Top 15 | Importancia - Random Forest")
fig.tight_layout()
plt.show()


In [None]:
# ===== 10) Optional persistence =====
from joblib import dump

# Serialize both fitted pipelines under /content/models
os.makedirs("/content/models", exist_ok=True)
model_targets = (
    ("/content/models/model_lr.joblib", model_lr),
    ("/content/models/model_rf.joblib", model_rf),
)
for destination, fitted_model in model_targets:
    dump(fitted_model, destination)

# Test-set predictions: true label plus score and predicted label per model
prediction_columns = {
    "y_true": y_test,
    "proba_lr": proba_lr,
    "pred_lr": pred_lr,
    "proba_rf": proba_rf,
    "pred_rf": pred_rf,
}
out = pd.DataFrame(prediction_columns)
out.to_csv("/content/predicciones_test.csv", index=False)

print("Modelos y predicciones guardadas en /content")
