# Telecom X — ETL + EDA + Modelado

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/flacoca1970/Desafio_2/blob/main/notebooks/TelecomX_LATAM_inyectado.ipynb)

Este notebook ejecuta el pipeline del desafío: **Extracción → Transformación → EDA → Model-ready → Baseline**.


In [None]:
!pip -q install imbalanced-learn --upgrade
import os, io, json, ast, warnings
warnings.filterwarnings("ignore")
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler
from pandas import json_normalize
print("Librerías cargadas.")

In [None]:
# Extraction settings.
# USE_API picks the data source used by the extraction cell: when True the
# JSON is fetched from API_URL, otherwise it is read from LOCAL_JSON_PATH.
USE_API = False
API_URL = "https://<tu_api>/endpoint"  # placeholder URL; set a real endpoint before enabling USE_API
LOCAL_JSON_PATH = "/content/TelecomX_Data.json"

def robust_load_json_dataframe_from_str(raw: str) -> pd.DataFrame:
    """Build a DataFrame from JSON text, trying several layouts in turn.

    Order of attempts:

    1. Parse the whole text with ``json.loads``. A list becomes the rows
       directly; a dict is searched for a list stored under one of the
       wrapper keys ``data``, ``items``, ``results``, ``records``, ``rows``
       (first match wins); a dict without such a key becomes a single row.
    2. Parse the text as JSON Lines with ``pd.read_json(..., lines=True)``.
    3. Hand the text to ``pd.read_json`` as-is; any error raised by this
       last attempt propagates to the caller.

    Failures in attempts 1 and 2 are intentionally swallowed so that the
    next attempt can run.
    """
    text = raw.strip()
    wrapper_keys = ("data", "items", "results", "records", "rows")

    # Attempt 1: regular JSON document.
    try:
        payload = json.loads(text)
        if isinstance(payload, list):
            return pd.DataFrame(payload)
        if isinstance(payload, dict):
            for name in wrapper_keys:
                if isinstance(payload.get(name), list):
                    return pd.DataFrame(payload[name])
            return pd.DataFrame([payload])
    except Exception:
        pass

    # Attempt 2: one JSON object per line.
    try:
        frame = pd.read_json(io.StringIO(text), lines=True)
    except Exception:
        frame = None
    if isinstance(frame, pd.DataFrame):
        return frame

    # Attempt 3: let pandas decide (and raise if it cannot).
    return pd.read_json(io.StringIO(text))

def robust_load_json_dataframe(path: str) -> pd.DataFrame:
    """Read a JSON file from disk and load it into a DataFrame.

    Args:
        path: Location of the JSON (or JSON Lines) file.

    Returns:
        The DataFrame produced by ``robust_load_json_dataframe_from_str``
        for the file's text.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops a
    # leading byte-order mark. Without this the BOM character stays in the
    # text (str.strip() does not remove it) and json.loads rejects it.
    with open(path, "r", encoding="utf-8-sig") as f:
        raw = f.read()
    return robust_load_json_dataframe_from_str(raw)

def parse_maybe_dict(x):
    """Return *x* as a dict when it is one, or is text that encodes one.

    Args:
        x: A cell value. Dicts are returned unchanged. Strings that start
            with ``{`` and end with ``}`` (after stripping whitespace) are
            parsed first as JSON and then as a Python literal.

    Returns:
        The dict, or ``None`` when *x* is missing, is not a dict or string,
        does not look like a braced object, cannot be parsed, or parses to
        something other than a dict (e.g. the set literal ``"{1, 2}"``).
    """
    if isinstance(x, dict):
        return x
    # Only text can encode a dict. Rejecting everything else up front also
    # covers NaN/None and avoids calling pd.isna on list-like cells, whose
    # array result cannot be used as a single truth value.
    if not isinstance(x, str):
        return None
    s = x.strip()
    if not (s.startswith("{") and s.endswith("}")):
        return None
    try:
        parsed = json.loads(s)
    except (ValueError, RecursionError):
        parsed = None
    if isinstance(parsed, dict):
        return parsed
    # Fallback for Python-repr style dicts (single quotes, True/None, ...).
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None
    # literal_eval also accepts set literals; only a real dict is useful.
    return parsed if isinstance(parsed, dict) else None

def flatten_telecomx(df: pd.DataFrame) -> pd.DataFrame:
    """Expand the nested dict columns of *df* into flat columns.

    Columns whose name (case-insensitively) is ``customer``, ``phone``,
    ``internet`` or ``account`` are treated as nested. Each cell is passed
    through ``parse_maybe_dict`` and the resulting dicts are flattened with
    ``json_normalize``. New columns are named ``"<column>__<key>"``; the
    original nested columns are dropped. Rows whose cell could not be
    parsed get missing values in the expanded columns.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame with the same rows and index as *df*.
    """
    nested_names = ("customer", "phone", "internet", "account")
    # Column labels are not guaranteed to be strings; skip the ones that
    # are not instead of failing on ``.lower()``.
    nested_candidates = [
        c for c in df.columns if isinstance(c, str) and c.lower() in nested_names
    ]
    flat_parts = []
    for c in nested_candidates:
        parsed = df[c].apply(parse_maybe_dict)
        # json_normalize may return a fresh 0..n-1 index (behaviour differs
        # across pandas versions), so the original row labels cannot be
        # relied on. Rows are therefore put back by position: each
        # flattened row is labelled with the position it came from, the
        # frame is expanded to every position, and only then are the
        # original labels restored. This also works with duplicate or
        # non-integer index labels.
        has_dict = parsed.notna().to_numpy()
        part = json_normalize(parsed[has_dict].tolist())
        part.index = np.flatnonzero(has_dict)
        part = part.reindex(pd.RangeIndex(len(parsed)))
        part.index = parsed.index
        part.columns = [f"{c}__{col}" for col in part.columns]
        flat_parts.append(part)
    out = df.drop(columns=nested_candidates, errors="ignore").copy()
    if flat_parts:
        extra = pd.concat(flat_parts, axis=1)
        out = pd.concat([out, extra], axis=1)
    return out

def standardize_churn(s: pd.Series) -> pd.Series:
    """Normalise a churn column to the labels ``"Yes"`` / ``"No"``.

    The kind of data is decided from the non-missing values:

    * booleans: ``True`` -> ``"Yes"``, ``False`` -> ``"No"``;
    * numbers: ``1`` -> ``"Yes"``, ``0`` -> ``"No"``;
    * anything else is treated as text, compared case-insensitively after
      stripping whitespace: ``"yes"`` -> ``"Yes"``, ``"no"`` -> ``"No"``.

    Every other value (including missing ones) becomes NaN.

    Args:
        s: The raw churn column.

    Returns:
        A Series with the same index holding ``"Yes"``, ``"No"`` or NaN.
    """
    # Inspect the values with missing entries removed, so that an object
    # column such as [True, None, False] is still recognised as boolean
    # instead of falling through to the text branch and turning into NaN.
    observed = s.dropna().infer_objects()
    # The pandas dtype helpers are used because np.issubdtype raises on
    # pandas extension dtypes (string, categorical, nullable types).
    if pd.api.types.is_bool_dtype(observed):
        return s.map({True: "Yes", False: "No"})
    if pd.api.types.is_numeric_dtype(observed):
        return s.map({1: "Yes", 0: "No"})
    normalized = s.astype(str).str.strip().str.lower()
    # Mapping through a dict sends every unlisted value ("", "nan",
    # "none", other text) to NaN, so no separate clean-up step is needed.
    return normalized.map({"yes": "Yes", "no": "No"})


In [None]:
# Extraction: load the raw records from the source selected by USE_API.
if not USE_API:
    # Local file path.
    df = robust_load_json_dataframe(LOCAL_JSON_PATH)
else:
    # Remote endpoint; requests is imported only when it is actually needed.
    import requests

    r = requests.get(API_URL, timeout=30)
    r.raise_for_status()  # fail loudly on HTTP error statuses
    df = robust_load_json_dataframe_from_str(r.text)

print("Shape inicial:", df.shape)
df.head(3)

In [None]:
# Transformation and quick EDA: flatten the nested columns, normalise the
# target labels and coerce known charge/tenure columns to numbers.
df_flat = flatten_telecomx(df)

if "Churn" in df_flat.columns:
    df_flat["Churn"] = standardize_churn(df_flat["Churn"])

# Column names that should be numeric if they are present; values that
# cannot be converted become NaN.
numeric_candidates = (
    "account__MonthlyCharges",
    "account__TotalCharges",
    "account__tenure",
    "MonthlyCharges",
    "TotalCharges",
    "tenure",
    "Tenure",
)
for cand in numeric_candidates:
    if cand not in df_flat.columns:
        continue
    df_flat[cand] = pd.to_numeric(df_flat[cand], errors="coerce")

print("Shape aplanado:", df_flat.shape)
df_flat.head(3)

In [None]:
# Tasa global y por categorías
def churn_rate_by(df, cat_col):
    """Compute the share of ``"Yes"`` churn labels per category of *cat_col*.

    Only rows with a ``Churn`` value of ``"Yes"`` or ``"No"`` and a
    non-missing *cat_col* are used.

    Returns:
        A DataFrame with columns ``[cat_col, "churn_rate"]`` sorted by
        ``churn_rate`` in descending order, or ``None`` when no row
        qualifies.
    """
    usable = df["Churn"].isin(["Yes", "No"]) & df[cat_col].notna()
    if not usable.any():
        return None

    def _share_of_yes(labels):
        # Mean of a boolean mask == fraction of True values.
        return (labels == "Yes").mean()

    subset = df.loc[usable, [cat_col, "Churn"]].copy()
    per_category = subset.groupby(cat_col)["Churn"].apply(_share_of_yes)
    table = per_category.rename("churn_rate").reset_index()
    return table.sort_values("churn_rate", ascending=False)

def plot_bar(rate_df, title, xcol):
    """Draw a bar chart of ``churn_rate`` against the categories in *xcol*.

    Does nothing when *rate_df* is ``None`` or empty. Otherwise a new
    7x4 figure is created and shown; the function returns ``None``.
    """
    nothing_to_plot = rate_df is None or rate_df.empty
    if nothing_to_plot:
        return

    plt.figure(figsize=(7, 4))
    # Categories are cast to str so they are drawn as discrete labels.
    categories = rate_df[xcol].astype(str)
    heights = rate_df["churn_rate"].values
    plt.bar(categories, heights)
    plt.xticks(rotation=45, ha="right")
    plt.ylabel("Tasa de churn")
    plt.title(title)
    plt.show()

# Rows whose churn label is known; an all-False mask when the column is absent.
if "Churn" in df_flat.columns:
    known = df_flat["Churn"].isin(["Yes", "No"])
else:
    known = pd.Series(False, index=df_flat.index)

# Overall churn rate, as a percentage of the rows with a known label.
if known.any():
    print("Tasa global de churn:", round(100 * (df_flat.loc[known, "Churn"]=="Yes").mean(), 2), "%")

# Churn rate per category for each flattened column that is present.
for col, label in (
    ("account__Contract", "Contract"),
    ("account__PaymentMethod", "PaymentMethod"),
    ("internet__InternetService", "InternetService"),
):
    if col not in df_flat.columns:
        continue
    rate = churn_rate_by(df_flat, col)
    plot_bar(rate, f"Tasa de churn por {label}", col)

In [None]:
# Model-ready data and baseline classifier.
assert "Churn" in df_flat.columns, "No se encontró 'Churn' tras aplanar. Revisa el JSON."

# Keep only rows with a known label and encode it as 1 (Yes) / 0 (No).
df_model = df_flat.dropna(subset=["Churn"]).copy()
df_model["Churn"] = df_model["Churn"].map({"Yes":1, "No":0})

# Target and features; the identifier column is dropped when present.
y = df_model["Churn"]
X = df_model.drop(columns=["Churn", "customerID"], errors="ignore")

# Numeric columns go through median imputation + scaling; everything else
# is imputed with the most frequent value and one-hot encoded.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
pre = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Baseline: logistic regression with class weighting, preprocessing
# fitted inside the pipeline on the training split only.
clf = Pipeline([
    ("pre", pre),
    ("model", LogisticRegression(max_iter=1000, class_weight="balanced")),
])
clf.fit(X_train, y_train)

# Positive-class probability and hard labels at a 0.5 threshold.
proba = clf.predict_proba(X_test)[:,1]
pred  = (proba >= 0.5).astype(int)

print("ROC-AUC:", roc_auc_score(y_test, proba))
print("PR-AUC:",  average_precision_score(y_test, proba))
print(classification_report(y_test, pred, digits=3))

In [None]:
# Balanced variant with RandomOverSampler (no data leakage): the sampler is
# a pipeline step, so it is fitted together with the model on X_train only.
imb_clf = ImbPipeline([
    ("pre", pre),
    ("ros", RandomOverSampler(random_state=42)),
    ("model", LogisticRegression(max_iter=1000)),
])
imb_clf.fit(X_train, y_train)

# Positive-class probability and hard labels at a 0.5 threshold.
proba2 = imb_clf.predict_proba(X_test)[:,1]
pred2  = (proba2 >= 0.5).astype(int)

print("ROC-AUC (ROS):", roc_auc_score(y_test, proba2))
print("PR-AUC  (ROS):", average_precision_score(y_test, proba2))
print(classification_report(y_test, pred2, digits=3))