## MODELING AND PRE PROCESSING

*Dado todo lo investigado, y conociendo las relaciones que existen entre las features del dataset, podríamos usar solo las variables que mejor se nos ajusten; sin embargo, para seguir el mismo concepto de Quick & Dirty, vamos a hacer uso de todas las features disponibles.*

Dividir los datos en **train / validation / test** para poder:

* entrenar el modelo (`train`),
* ajustar hiperparámetros (`validation`),
* y evaluar el desempeño final (`test`).


### Opciones

* Si solo necesitás **train/test**, basta con un solo `train_test_split`.
* Si vas a usar algo como **GridSearchCV o RandomizedSearchCV**, no necesitás `validation` explícito: esas clases ya hacen validación cruzada.
* Para problemas de **churn** (clasificación con clases desbalanceadas) recomiendo **usar `stratify=y`** para que la proporción de churn/no-churn se mantenga igual en los splits.



In [None]:
import sqlite3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from datetime import datetime
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer, StandardScaler

# setting display
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

## Obtener los datos

In [None]:
def pandas_sqlite_read(db_path, query, params=None):
    """Run a SQL query against a SQLite file and return the rows as a DataFrame.

    Args:
        db_path: Path of the SQLite database, passed to ``sqlite3.connect``.
        query: SQL text to execute; may contain ``?`` placeholders.
        params: Optional sequence of values bound to the placeholders.
            ``None`` or an empty sequence means "no parameters".

    Returns:
        pandas.DataFrame holding the query result.
    """
    # Local import keeps the function self-contained.
    from contextlib import closing

    # sqlite3's own context manager only commits / rolls back the transaction;
    # closing() is what actually releases the connection when the block exits.
    with closing(sqlite3.connect(db_path)) as conn:
        return pd.read_sql_query(query, conn, params=params or None)

In [None]:
class ChurnDatabase:
    """Read-only query helpers over the ``telco_customer_churn`` SQLite table.

    Every method opens the database through ``pandas_sqlite_read`` and returns
    a pandas DataFrame.
    """

    def __init__(self, db_path):
        """Store the path of the SQLite file; no connection is opened here."""
        self.db_path = db_path

    def get_all_customers(self):
        """Return every row and column of the table."""
        query = "SELECT * FROM telco_customer_churn"
        return pandas_sqlite_read(self.db_path, query)

    def get_churn_customers(self):
        """Return only the rows whose ``Churn`` value is 'Yes'."""
        query = "SELECT * FROM telco_customer_churn WHERE Churn = 'Yes'"
        return pandas_sqlite_read(self.db_path, query)

    def get_customers_by_contract(self, contract_type):
        """Return the rows whose ``Contract`` equals ``contract_type`` (bound as a parameter)."""
        query = "SELECT * FROM telco_customer_churn WHERE Contract = ?"
        return pandas_sqlite_read(self.db_path, query, params=[contract_type])

    def get_high_value_customers(self, min_charges):
        """Return customers with ``TotalCharges`` above ``min_charges``, highest first.

        NOTE(review): the cleaning step elsewhere in this file treats
        TotalCharges as text with blank entries; if the column is stored as
        TEXT this comparison and ordering are textual, not numeric - confirm
        against the table schema.
        """
        query = """
        SELECT customerID, gender, tenure, Contract, 
               MonthlyCharges, TotalCharges, Churn
        FROM telco_customer_churn
        WHERE TotalCharges > ?
        ORDER BY TotalCharges DESC
        """
        return pandas_sqlite_read(self.db_path, query, params=[min_charges])

    def get_churn_analysis_data(self):
        """Return the modeling frame: raw features, a 0/1 ``Churn`` flag and derived features.

        Derived columns computed in SQL:
          * ``customer_segment``: 'New' (tenure <= 12), 'Regular' (<= 36), else 'Loyal'.
          * ``price_segment``: 'Low' (MonthlyCharges < 35), 'Medium' (< 65), else 'High'.
          * ``additional_services``: how many of the six add-on service columns equal 'Yes'.
        """
        # 'Yes' is single-quoted on purpose: SQLite reads a double-quoted token
        # as an identifier first and only falls back to a string literal as a
        # legacy quirk that can be switched off at build/run time.
        query = """
        SELECT 
            gender,
            SeniorCitizen,
            Partner,
            Dependents,
            tenure,
            PhoneService,
            MultipleLines,
            InternetService,
            OnlineSecurity,
            OnlineBackup,
            DeviceProtection,
            TechSupport,
            StreamingTV,
            StreamingMovies,
            Contract,
            PaperlessBilling,
            PaymentMethod,
            MonthlyCharges,
            TotalCharges,
            Churn = 'Yes' AS Churn,
            -- Features derivadas
            CASE 
                WHEN tenure <= 12 THEN 'New'
                WHEN tenure <= 36 THEN 'Regular' 
                ELSE 'Loyal'
            END as customer_segment,
            
            CASE 
                WHEN MonthlyCharges < 35 THEN 'Low'
                WHEN MonthlyCharges < 65 THEN 'Medium'
                ELSE 'High'
            END as price_segment,
            
            -- Número de servicios adicionales
            (CASE WHEN OnlineSecurity = 'Yes' THEN 1 ELSE 0 END +
             CASE WHEN OnlineBackup = 'Yes' THEN 1 ELSE 0 END +
             CASE WHEN DeviceProtection = 'Yes' THEN 1 ELSE 0 END +
             CASE WHEN TechSupport = 'Yes' THEN 1 ELSE 0 END +
             CASE WHEN StreamingTV = 'Yes' THEN 1 ELSE 0 END +
             CASE WHEN StreamingMovies = 'Yes' THEN 1 ELSE 0 END) as additional_services
             
        FROM telco_customer_churn
        """
        return pandas_sqlite_read(self.db_path, query)

    def get_churn_summary_stats(self):
        """Return churn statistics per (Contract, PaymentMethod) group.

        Columns: total customers, churned customers, churn rate in percent
        (2 decimals), average monthly charges and average tenure; groups are
        ordered by churn rate, highest first.
        """
        query = """
        SELECT 
            Contract,
            PaymentMethod,
            COUNT(*) as total_customers,
            SUM(CASE WHEN Churn = 'Yes' THEN 1 ELSE 0 END) as churn_customers,
            ROUND(
                100.0 * SUM(CASE WHEN Churn = 'Yes' THEN 1 ELSE 0 END) / COUNT(*), 
                2
            ) as churn_rate,
            ROUND(AVG(MonthlyCharges), 2) as avg_monthly_charges,
            ROUND(AVG(tenure), 2) as avg_tenure
        FROM telco_customer_churn
        GROUP BY Contract, PaymentMethod
        ORDER BY churn_rate DESC
        """
        return pandas_sqlite_read(self.db_path, query)

    def get_dataframe_by_query(self, query: str):
        """Run an arbitrary SQL string as-is and return the result.

        The text is executed without any parameter binding, so only pass
        trusted, hand-written queries.
        """
        return pandas_sqlite_read(self.db_path, query)

In [None]:
# Point the helper at the local SQLite file (path is relative to the notebook's working directory).
churn_objetct = ChurnDatabase("../database/telco_customer_churn.sqlite.db")
# Load the modeling frame: raw features, 0/1 Churn flag and the SQL-derived segments.
churn = churn_objetct.get_churn_analysis_data()
# Preview the first rows.
churn.head()

## Pipe Modeling PreProcessing

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# --- Global cleaning function ---
def clean_columns_global(df):
    """Return a cleaned copy of the churn frame; the input frame is left untouched.

    * ``PaymentMethod``: the " (automatic)" suffix is removed.
    * ``TotalCharges``: empty / whitespace-only entries become NaN, the column
      is cast to float and the NaNs are filled with the column median.

    A column that is not present is simply skipped.

    NOTE: the median is taken over whatever frame is passed in, so calling
    this before a train/test split lets test rows influence the fill value.
    """
    df = df.copy()
    if "PaymentMethod" in df.columns:
        df["PaymentMethod"] = df["PaymentMethod"].str.replace(r" \(automatic\)", "", regex=True)

    if "TotalCharges" in df.columns:
        charges = df["TotalCharges"]
        if not pd.api.types.is_numeric_dtype(charges):
            # Any blank string marks a missing value, not only "" and " ";
            # otherwise a longer run of spaces would crash the float cast.
            charges = charges.replace(r"^\s*$", np.nan, regex=True)
        charges = charges.astype(float)
        df["TotalCharges"] = charges.fillna(charges.median())
    return df

# --- Global transformers ---
# Two-level categorical labels and the 0/1 code each one is replaced with.
mapper_columns_bins = {"Yes": 1, "No": 0, "Male": 1, "Female": 0}

def map_bins(X):
    """Recode every cell of DataFrame ``X`` through ``mapper_columns_bins``.

    String cells found in the mapping become their 0/1 code; any other string
    and every non-string cell are returned unchanged. A new frame is returned.
    """
    def _encode(value):
        if isinstance(value, str):
            return mapper_columns_bins.get(value, value)
        return value

    # DataFrame.applymap was renamed to DataFrame.map (pandas 2.1) and the old
    # name is deprecated; checking the class avoids mistaking a column called
    # "map" for the method on older pandas versions.
    elementwise = X.map if hasattr(pd.DataFrame, "map") else X.applymap
    return elementwise(_encode)

def to_numeric(X):
    """Coerce every column of ``X`` to numbers, turning unparseable cells into 0.

    A NumPy array comes back as a NumPy array; any other input (a DataFrame
    in this file) comes back as the pandas object produced by ``apply``.
    """
    from_array = isinstance(X, np.ndarray)
    # Arrays are wrapped so the same column-wise conversion can be used.
    frame = pd.DataFrame(X) if from_array else X
    coerced = frame.apply(pd.to_numeric, errors='coerce').fillna(0)
    if from_array:
        return coerced.values
    return coerced

# --- Build preprocessor ---
def build_preprocessor(X):
    """Build the (unfitted) ColumnTransformer for the churn feature frame.

    Each column of ``X`` is routed to exactly one branch:
      * "bin"   - two-level columns, recoded to 0/1 by ``map_bins``;
      * "ord"   - ordered segments, ``OrdinalEncoder`` with an explicit order;
      * "multi" - multi-level categoricals, one-hot encoded (unknown levels ignored);
      * "scale" - numeric columns, standardised with ``StandardScaler``;
      * remainder - any column not listed above, coerced by ``to_numeric``.

    Only the columns actually present in ``X`` are wired into a branch.

    Args:
        X: DataFrame whose ``columns`` decide which branches are populated.

    Returns:
        sklearn ``ColumnTransformer`` (not fitted).
    """
    bin_transformer = FunctionTransformer(func=map_bins, validate=False)
    numeric_transformer = FunctionTransformer(func=to_numeric, validate=False)

    # "Churn" only matters if the target is still in X; the callers in this
    # file drop it beforehand.
    bin_cols = ["gender","Partner","Dependents","PhoneService","PaperlessBilling","Churn"]
    # Column -> category order, kept together so filtering the columns also
    # filters the matching category list.
    ordinal_spec = {
        "customer_segment": ["New", "Regular", "Loyal"],
        "price_segment": ["Low", "Medium", "High"],
        "additional_services": [0, 1, 2, 3, 4, 5, 6],
    }
    num_cols = ["tenure", "SeniorCitizen", "MonthlyCharges", "TotalCharges"]
    multi_cols_for_dummies = [
        "MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
        "DeviceProtection","TechSupport","StreamingTV","StreamingMovies",
        "Contract","PaymentMethod"
    ]

    # --- Keep only the columns present in X ---
    bin_cols_present = [c for c in bin_cols if c in X.columns]
    ordinal_cols_present = [c for c in ordinal_spec if c in X.columns]
    multi_cols_present = [c for c in multi_cols_for_dummies if c in X.columns]
    num_cols_present = [c for c in num_cols if c in X.columns]

    # One category list per ordinal column that is really there; a length
    # mismatch between columns and category lists makes OrdinalEncoder fail.
    ordinal_categories = [ordinal_spec[c] for c in ordinal_cols_present]
    ordinal_transformer = OrdinalEncoder(categories=ordinal_categories or "auto")
    multi_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    scaling = StandardScaler()

    # --- ColumnTransformer ---
    # Branches are not chained: each one reads the original input columns and
    # the outputs are concatenated. The numeric coercion is therefore attached
    # as the remainder; listing it as a branch over X.columns would emit every
    # column a second time (with string columns coerced to 0).
    col_transformer = ColumnTransformer(
        transformers=[
            ("bin", bin_transformer, bin_cols_present),
            ("ord", ordinal_transformer, ordinal_cols_present),
            ("multi", multi_transformer, multi_cols_present),
            ("scale", scaling, num_cols_present),
        ],
        remainder=numeric_transformer
    )

    return col_transformer


In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.combine import SMOTETomek
from sklearn.model_selection import StratifiedKFold, cross_validate

def train_single_model(X, y, balance=True, with_model=True, model=None):
    """Assemble the preprocessing/balancing/model pipeline, cross-validate it and fit it.

    Args:
        X: Feature DataFrame (passed to ``build_preprocessor`` and to the fit).
        y: Target vector.
        balance: When True a ``SMOTETomek(random_state=42)`` step is inserted
            after the preprocessor.
        with_model: When True ``model`` is appended as the final step and the
            pipeline is cross-validated before the final fit.
        model: Estimator used as final step; required when ``with_model`` is True.

    Returns:
        The imblearn pipeline fitted on the full ``X``/``y``.

    Raises:
        ValueError: if ``with_model`` is True but no ``model`` was given.
    """
    if with_model and model is None:
        # A None final step would silently become a passthrough and only fail
        # later, inside the scorers.
        raise ValueError("with_model=True requires a model instance")

    # --- Preprocessor ---
    steps = [("preprocessor", build_preprocessor(X))]

    # --- Balancing ---
    if balance:
        steps.append(("balance", SMOTETomek(random_state=42)))

    # --- Model ---
    if with_model:
        steps.append(("model", model))

    # --- Pipeline ---
    pipe = ImbPipeline(steps=steps)

    # --- Cross-validation ---
    # The classifier metrics need a final estimator, so a preprocessing-only
    # pipeline skips this part.
    if with_model:
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

        scores = cross_validate(
            pipe,
            X, y,
            cv=skf,
            scoring=['accuracy', 'f1', 'roc_auc'],
            return_train_score=True,
            n_jobs=-1
        )

        # --- Mean of every score / timing array returned by cross_validate ---
        mean_scores = {key: np.mean(val) for key, val in scores.items()}

        print("Media de scores CV ")
        for key, val in mean_scores.items():
            print(f"{key}: {val:.4f}")

        print("--------------------------------------")

    # Final fit on all the data that was passed in.
    pipe.fit(X, y)

    return pipe


In [None]:
def reconstruct_transformed_df(pipe, df: pd.DataFrame, target="Churn"):
    """Run the fitted preprocessor of ``pipe`` on ``df`` and label the output columns.

    Args:
        pipe: Fitted pipeline exposing ``named_steps["preprocessor"]``.
        df: Frame to transform; the ``target`` column is dropped when present.
        target: Name of the target column to exclude.

    Returns:
        DataFrame with the transformed values, indexed like ``df``. Column
        names follow the branch order bin -> ordinal -> one-hot -> numeric;
        any output column beyond those gets a generic ``extra_<i>`` name.
    """
    # The target may already have been split off (e.g. X_train), so a missing
    # column is not an error.
    X = df.drop(columns=[target], errors="ignore")

    preprocessor = pipe.named_steps["preprocessor"]

    transformed = preprocessor.transform(X)
    # A sparse result (possible with one-hot encoding) is densified.
    if hasattr(transformed, "toarray"):
        transformed = transformed.toarray()

    def _present(candidates):
        # Keep only the candidate columns that exist in X, in the given order.
        return [c for c in candidates if c in X.columns]

    # --- Column names ---
    # Binary
    all_features = _present(["gender","Partner","Dependents","PhoneService","PaperlessBilling"])

    # Ordinal
    all_features += _present(["customer_segment", "price_segment", "additional_services"])

    # One-hot: expanded names come from the fitted encoder
    multi_cols = _present([
        "MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
        "DeviceProtection","TechSupport","StreamingTV","StreamingMovies",
        "Contract","PaymentMethod"])

    if "multi" in preprocessor.named_transformers_:
        onehot_cols = preprocessor.named_transformers_["multi"].get_feature_names_out(multi_cols)
        all_features += list(onehot_cols)

    # Numeric
    all_features += _present(["tenure", "SeniorCitizen", "MonthlyCharges", "TotalCharges"])

    # If the array has more columns than names, generate generic names
    extra_cols = transformed.shape[1] - len(all_features)
    if extra_cols > 0:
        all_features += [f"extra_{i}" for i in range(extra_cols)]

    # Only the preprocessor runs here (no resampling), so rows map one-to-one
    # onto the input; keeping its index keeps the result aligned with y.
    return pd.DataFrame(transformed, columns=all_features, index=X.index)


In [None]:
# --- Función para evaluar modelos entrenados ---
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score


def evaluate_models(trained_models, X, y):
    """Score every fitted model on ``X``/``y`` with ``eval_model``.

    ``trained_models`` maps a display name to a fitted model. The result is a
    DataFrame with one row per model and one column per metric.
    """
    per_model = {
        label: eval_model(fitted, X, y)
        for label, fitted in trained_models.items()
    }
    # Dict-of-dicts gives models as columns; transpose so they become rows.
    return pd.DataFrame(per_model).T

def eval_model(model, X, y):
    """Compute the classification metrics of a fitted binary model on ``X``/``y``.

    Hard predictions feed accuracy, precision, recall and F1; a continuous
    score feeds ROC AUC and PR AUC (average precision).

    Returns:
        dict with the keys accuracy, precision, recall, f1, roc_auc, pr_auc.
    """
    y_pred = model.predict(X)

    # Ranking metrics accept either the positive-class probability or a raw
    # decision value, so models without predict_proba can still be scored.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X)[:, 1]
    else:
        y_score = model.decision_function(X)

    return {
        "accuracy": accuracy_score(y, y_pred),
        "precision": precision_score(y, y_pred),
        "recall": recall_score(y, y_pred),
        "f1": f1_score(y, y_pred),
        "roc_auc": roc_auc_score(y, y_score),
        "pr_auc": average_precision_score(y, y_score)
    }

## Split Train Validation Test

* `SMOTE` → genera nuevas muestras **sintéticas** para la clase minoritaria.
* `TomekLinks` → elimina ejemplos de la clase mayoritaria que están muy cerca de la minoritaria (ruido/solapamiento).
* `SMOTETomek` → combina ambos pasos en una sola estrategia.

In [None]:
from sklearn.model_selection import train_test_split
# Data split

# Clean the whole frame first.
# NOTE(review): clean_columns_global fills TotalCharges with the median of the
# frame it receives, so doing this before the split lets validation/test rows
# influence the fill value.
churn = clean_columns_global(churn)

# Separate the features (X) from the target (y)
X = churn.drop("Churn", axis=1)
y = churn["Churn"]

# First split train+val vs test (20% for test), keeping the churn ratio via stratify
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Then split train+val again: train (60%), val (20%)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)
# (0.25 of 0.8 = 0.20 -> i.e. 60% train, 20% val, 20% test)

# Report the size of every split
print("Train:", X_train.shape, y_train.shape)
print("Validation:", X_val.shape, y_val.shape)
print("Test:", X_test.shape, y_test.shape)

# Column names and dtypes that reach the preprocessor
print(X_train.columns)
print(X_train.dtypes)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score, roc_auc_score, average_precision_score
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek

# Define stratified folds
# NOTE(review): `cv` is not referenced by the cells visible here;
# train_single_model builds its own 10-fold StratifiedKFold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Baseline models, keyed by the display name used in reports and file names
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",   # adjusts class weights
        penalty="l2",              # L2 regularization to limit overfitting
        C=0.1,                     # stronger regularization (smaller C = more regularization)
        solver="liblinear",        # good for small/medium datasets
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=4,
        max_features='sqrt',
        class_weight='balanced_subsample',
        bootstrap=True,
        random_state=42
    ),
    "LightGBM":LGBMClassifier(
        random_state=42,
        class_weight="balanced",
        n_estimators=100
    )
}


In [None]:
# One fitted pipeline per baseline model, trained on the train split only.
# train_single_model prints the mean cross-validation scores as a side effect.
models_pipes = {
    name:train_single_model(X=X_train, y=y_train, model=model)
    for name,model in models.items()
}

In [None]:
# Score on the held-out validation split only: X_train_val also contains the
# X_train rows the pipelines were fitted on, which would inflate every metric.
validacion = evaluate_models(models_pipes, X_val, y_val)
validacion

In [None]:
# Metrics of every fitted pipeline on the untouched test split.
test = evaluate_models(models_pipes, X_test, y_test)
test

In [None]:
def evaluate_metric_differences(eval_df: pd.DataFrame, test_df: pd.DataFrame):
    """Return ``eval_df - test_df`` with every column label suffixed by "_diff".

    The subtraction is aligned on index and column labels.
    """
    gap = eval_df - test_df
    return gap.add_suffix("_diff")


In [None]:
# Per-model, per-metric gap between the two score tables (first minus second).
evaluate_metric_differences(validacion,test)

In [None]:
import plotly.express as px
import pandas as pd

def plot_metric_differences(eval_df: pd.DataFrame, test_df: pd.DataFrame):
    """Show the metric gaps between two score tables as a grouped bar chart and a heatmap.

    Args:
        eval_df: Scores, one row per model and one column per metric.
        test_df: Scores in the same layout; subtracted from ``eval_df``.

    Returns:
        None. Both figures are displayed with ``fig.show()``.
    """
    # Same computation as the tabular report, so reuse it instead of duplicating it
    diff_df = evaluate_metric_differences(eval_df, test_df)

    # Long format for Plotly. reset_index names the new column after the
    # index, so force the name "index" that melt and the chart rely on.
    diff_long = (
        diff_df.rename_axis("index")
        .reset_index()
        .melt(id_vars='index', var_name='Metric', value_name='Difference')
    )

    # Grouped bar chart: one group per model, one bar per metric
    fig = px.bar(diff_long, x='index', y='Difference', color='Metric', barmode='group',
                 title="Diferencias entre métricas de evaluación y test",
                 labels={'index':'Modelo'})

    fig.show()

    # Heatmap of the same table
    fig = px.imshow(
        diff_df,
        text_auto=True,
        color_continuous_scale='RdYlGn_r',  # red = larger difference, green = smaller
        aspect="auto",
        labels=dict(x="Métrica", y="Modelo", color="Diferencia"),
        title="Heatmap de diferencias entre métricas de evaluación y test"
    )

    fig.show()


In [None]:
# Plot the same gaps as a grouped bar chart and a heatmap.
plot_metric_differences(validacion,test)

## Métricas


In [None]:
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix, roc_curve, auc
import numpy as np

def plot_model_comparison(trained_models, X_val, y_val, X_test, y_test):
    """
    Draw one subplot row per model with four panels:
      - ROC curve on validation and on test
      - confusion matrix on validation and on test

    Args:
        trained_models (dict): {model_name: fitted_model}; each model must
            provide ``predict`` and ``predict_proba``.
        X_val, y_val: Validation dataset
        X_test, y_test: Test dataset

    Raises:
        ValueError: if ``trained_models`` is empty.
    """
    from plotly.subplots import make_subplots

    if not trained_models:
        raise ValueError("trained_models is empty: nothing to plot")

    n_models = len(trained_models)
    datasets = [(X_val, y_val, 'Val'), (X_test, y_test, 'Test')]

    subplot_titles = [
        title
        for name in trained_models
        for title in (f"{name} ROC Val", f"{name} ROC Test", f"{name} CM Val", f"{name} CM Test")
    ]

    # make_subplots rejects a vertical spacing above 1 / (rows - 1); cap it so
    # that more than 7 models still render. Up to 7 rows the value stays 0.15.
    vertical_spacing = 0.15 if n_models == 1 else min(0.15, 0.9 / (n_models - 1))

    fig = make_subplots(
        rows=n_models, cols=4,
        subplot_titles=subplot_titles,
        horizontal_spacing=0.1,
        vertical_spacing=vertical_spacing
    )

    for row, (model_name, model) in enumerate(trained_models.items(), start=1):
        for offset, (X, y, label) in enumerate(datasets):
            # Columns 1-2 hold the ROC curves, columns 3-4 the confusion matrices
            roc_col, cm_col = 1 + offset, 3 + offset

            # --- ROC curve ---
            y_proba = model.predict_proba(X)[:, 1]
            fpr, tpr, _ = roc_curve(y, y_proba)
            roc_auc = auc(fpr, tpr)
            fig.add_trace(
                go.Scatter(x=fpr, y=tpr, mode='lines', name=f"{model_name} {label} (AUC={roc_auc:.2f})"),
                row=row, col=roc_col
            )
            fig.update_xaxes(title_text="FPR", row=row, col=roc_col)
            fig.update_yaxes(title_text="TPR", row=row, col=roc_col)

            # --- Confusion matrix ---
            y_pred = model.predict(X)
            cm = confusion_matrix(y, y_pred)
            fig.add_trace(
                go.Heatmap(
                    z=cm,
                    x=[f"Pred {c}" for c in range(cm.shape[1])],
                    y=[f"True {c}" for c in range(cm.shape[0])],
                    showscale=False,
                    colorscale='Blues',
                    text=cm,
                    texttemplate="%{text}"
                ),
                row=row, col=cm_col
            )
            # Heatmap rows are drawn bottom-up; flip the axis so "True 0" is
            # the top row, matching the usual confusion-matrix layout.
            fig.update_yaxes(autorange="reversed", row=row, col=cm_col)

    fig.update_layout(height=300*n_models, width=1400, title_text="Comparación de Modelos: ROC y Matriz de Confusión")
    fig.show()


In [None]:
# ROC curves and confusion matrices of every fitted pipeline, validation vs test.
plot_model_comparison(
    models_pipes,
    X_val, y_val,
    X_test, y_test
    )


## Guardado de Modelos entrenados

In [None]:
import joblib
import os
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def save_model_with_score(model, model_name, X_test, y_test, folder="../models"):
    """
    Evaluate a fitted model and persist it with joblib, bundled with its
    scores and the feature names it was evaluated on.

    Parameters
    ----------
    model : estimator
        Already fitted model.
    model_name : str
        Display name. The file name is this name lower-cased, with spaces
        replaced by underscores, plus ".pkl".
    X_test : DataFrame or array
        Test data used for the evaluation.
    y_test : Series or array
        True labels.
    folder : str
        Folder where the model is stored; created if it does not exist.

    Returns
    -------
    dict
        The metrics computed by ``eval_model``.
    """

    # Create the folder if it does not exist
    os.makedirs(folder, exist_ok=True)

    # Compute metrics (eval_model runs the predictions itself)
    scores = eval_model(model, X_test, y_test)

    # Take the column names from a DataFrame, otherwise generate generic ones
    if hasattr(X_test, "columns"):
        feature_names = list(X_test.columns)
    else:
        feature_names = [f"feature_{i}" for i in range(X_test.shape[1])]

    # File name derived from the display name
    _name = "_".join(model_name.split(" ")).lower() + ".pkl"

    # Bundle model + scores + columns
    obj_to_save = {
        "name": model_name,
        "model": model,
        "scores": scores,
        "features": feature_names
    }

    filepath = os.path.join(folder, _name)
    joblib.dump(obj_to_save, filepath)

    print(f"✅ Modelo guardado en {filepath} con scores: {scores}")
    print(f"📌 Columnas guardadas: {feature_names}")
    return scores


In [None]:
# Persist every fitted pipeline together with its test-set scores.
for name,model in models_pipes.items():
    print(f"Guardando el modelo: {name}")
    save_model_with_score(model,name,X_test,y_test)
