# Telecom X — Preprocesamiento y Modelado de Churn

Notebook listo para ejecutar: carga el CSV limpio desde `data/TelecomX_cleaned.csv`. Este notebook realiza encoding, balanceo opcional (SMOTE), escalado, entrenamiento y evaluación de Regresión Logística y Random Forest, además de análisis de importancia de variables.


## 0. Instalar dependencias (si hace falta)
Descomenta y ejecuta la celda si necesitas instalar librerías:

```python
# !pip install scikit-learn imbalanced-learn matplotlib pandas numpy joblib
```

In [None]:

# Core libraries used throughout the notebook (standard library first, then third-party).
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Quick confirmation that the base imports succeeded.
print("Python and basic libs available")


## 1. Cargar datos
Coloca `TelecomX_cleaned.csv` en la carpeta `data/` del proyecto.

In [None]:

# Candidate locations for the cleaned dataset, checked in priority order.
possible = [
    "data/TelecomX_cleaned.csv",
    "/mnt/data/telecomx_churn_project/data/TelecomX_cleaned.csv",
    "TelecomX_cleaned.csv"
]
# The first path that exists wins; None when the file is in none of the locations.
csv_path = next((candidate for candidate in possible if os.path.exists(candidate)), None)
if csv_path is None:
    raise FileNotFoundError("No se encontró data/TelecomX_cleaned.csv. Por favor sube el CSV limpio y re-ejecuta.")

# Load the dataset and show its dimensions plus the first rows.
df = pd.read_csv(csv_path)
print("Shape:", df.shape)
df.head()


## 2. Eliminar columnas irrelevantes (IDs)

In [None]:

# Remove identifier columns under any of the known spellings that are present in df.
id_columns_present = [name for name in ('customerID', 'CustomerID', 'id', 'ID') if name in df.columns]
for id_col in id_columns_present:
    df.drop(columns=[id_col], inplace=True)
    print("Dropped", id_col)
print("Shape after drop:", df.shape)


## 3. Preparar variable target y revisar nulos

In [None]:

# Names the target column may have (English/Spanish variants).
target_candidates = ['Churn','churn','Evasion','Evasión','Cancelacion','Cancelación','target']
target_col = next((c for c in df.columns if c in target_candidates), None)
if target_col is None:
    # Fallback heuristic: the first column that has exactly two distinct values.
    target_col = next((c for c in df.columns if df[c].nunique() == 2), None)
if target_col is None:
    raise ValueError("No se pudo detectar la columna target (Churn). Indica manualmente el nombre en target_col.")

print("Target detected:", target_col)

# Accepted spellings for each class. '0.0'/'1.0' cover a target that was read as float,
# which astype(str) renders with a decimal part.
label_map = {
    'no': 0, 'n': 0, 'false': 0, '0': 0, '0.0': 0,
    'si': 1, 'sí': 1, 'yes': 1, 'y': 1, 'true': 1, '1': 1, '1.0': 1,
}
normalized_target = df[target_col].astype(str).str.strip().str.lower()
encoded_target = normalized_target.map(label_map)

# Any label outside the map becomes NaN; fail with the offending values instead of the
# opaque integer-casting error that astype(int) would raise on NaN.
unmapped_mask = encoded_target.isna()
if unmapped_mask.any():
    bad_values = sorted(normalized_target[unmapped_mask].unique())
    raise ValueError(
        f"La columna target '{target_col}' contiene valores no reconocidos: {bad_values}. "
        "Corrige o elimina esas filas antes de continuar."
    )
df[target_col] = encoded_target.astype(int)

print(df[target_col].value_counts())
print("\nNulos por columna:")
print(df.isna().sum().sort_values(ascending=False).head(20))


## 4. One-hot encoding para categóricas (solo con pocas categorías)

In [None]:

# Text/categorical feature columns (the target is excluded).
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
cat_cols = [c for c in cat_cols if c != target_col]
print("Categorical columns:", cat_cols)

# Only low-cardinality columns are one-hot encoded, to avoid an explosion of dummies.
ohe_cols = [c for c in cat_cols if df[c].nunique() < 30]
print("Columns to OHE:", ohe_cols)

# Columns with 30+ categories are not encoded. Left untouched they would stay as text
# in df_ohe, so handle them explicitly: keep the ones that are really numbers stored
# as text (converted to numeric) and drop the rest.
df_enc = df.copy()
dropped_cols = []
for col in cat_cols:
    if col in ohe_cols:
        continue
    as_number = pd.to_numeric(df_enc[col].astype(object), errors='coerce')
    # Conversion is accepted only if it did not turn any existing value into NaN.
    if as_number.notna().sum() == df_enc[col].notna().sum():
        df_enc[col] = as_number
    else:
        dropped_cols.append(col)
if dropped_cols:
    print("Dropped non-numeric high-cardinality columns:", dropped_cols)
    df_enc = df_enc.drop(columns=dropped_cols)

# drop_first=True removes one dummy per feature to avoid perfectly collinear columns.
df_ohe = pd.get_dummies(df_enc, columns=ohe_cols, drop_first=True)
print("Shape after OHE:", df_ohe.shape)


## 5. Chequeo de desbalance

In [None]:

import matplotlib.pyplot as plt

# Class counts and proportions of the encoded (0/1) target.
y = df_ohe[target_col]
print("Counts:\n", y.value_counts())
print("Proportions:\n", y.value_counts(normalize=True))

# value_counts() orders by frequency, not by class, so index the counts explicitly
# as [0, 1] to keep each bar under the right label (and to get a zero-height bar
# instead of a shape error when one class is absent).
class_counts = y.value_counts().reindex([0, 1], fill_value=0)

plt.figure(figsize=(4,3))
plt.bar(['No','Yes'], class_counts.values)
plt.title('Distribución de Churn')
plt.show()


## 6. Opcional: SMOTE para balancear clases

In [None]:

# Switch: set to False to keep the original class distribution.
apply_smote = True

# Unbalanced features/target; replaced below only when SMOTE runs successfully.
# NOTE(review): when SMOTE succeeds, X_res/y_res cover the whole dataset and include
# synthetic rows, so any evaluation subset drawn from them contains synthetic rows too.
X_res, y_res = df_ohe.drop(columns=[target_col]), df_ohe[target_col]

if apply_smote:
    try:
        from imblearn.over_sampling import SMOTE
        X = df_ohe.drop(columns=[target_col])
        sm = SMOTE(random_state=42)
        resampled = sm.fit_resample(X, df_ohe[target_col])
        print('After SMOTE:', resampled[1].value_counts())
    except Exception as e:
        # Best-effort: report the problem and continue with the unbalanced data.
        print('imblearn no disponible o error en SMOTE:', e)
    else:
        X_res, y_res = resampled


## 7. Normalización para modelos sensibles (StandardScaler)

In [None]:

from sklearn.preprocessing import StandardScaler

# Independent copy of the features with a fresh 0..n-1 index.
X_res_df = X_res.copy().reset_index(drop=True)

# Standardize every feature column using statistics from the full feature table.
scaler = StandardScaler()
X_scaled_all = scaler.fit_transform(X_res_df)
print('Scaled shape:', X_scaled_all.shape)


## 8. Matriz de correlación (numéricas) y correlación con churn

In [None]:

import numpy as np

# Features and target side by side (target index reset so rows line up by position).
num_df = pd.concat([X_res_df, y_res.reset_index(drop=True)], axis=1)
corr = num_df.corr()

# Features ranked by absolute correlation with the target.
print('Top correlations with target:')
target_abs_corr = corr[target_col].abs()
print(target_abs_corr.sort_values(ascending=False).head(15))

# Heatmap of the full correlation matrix, drawn with the object-oriented API.
fig, ax = plt.subplots(figsize=(10,8))
heatmap = ax.imshow(corr.values, aspect='auto')
fig.colorbar(heatmap, ax=ax)
ax.set_xticks(list(range(len(corr.columns))))
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(list(range(len(corr.index))))
ax.set_yticklabels(corr.index)
ax.set_title('Matriz de correlación (numérica)')
fig.tight_layout()
plt.show()


## 9. Boxplots: Tenure y MonthlyCharges (si existen)

In [None]:

# Columns whose names suggest tenure / monthly-charge information (substring match).
tenure_candidates = [c for c in X_res_df.columns if 'tenure' in c.lower() or 'time' in c.lower() or 'months' in c.lower()]
monthly_candidates = [c for c in X_res_df.columns if 'monthly' in c.lower() or 'charge' in c.lower() or 'costo' in c.lower() or 'factur' in c.lower()]
print('Tenure candidates', tenure_candidates)
print('Monthly candidates', monthly_candidates)
import matplotlib.pyplot as plt

# X_res_df has a fresh 0..n-1 index; reset the target the same way so the boolean
# masks below select rows by position rather than by whatever index y_res carries.
y_aligned = y_res.reset_index(drop=True)

# At most two columns from each group, one figure per column.
for col in tenure_candidates[:2] + monthly_candidates[:2]:
    plt.figure(figsize=(6,4))
    plt.boxplot([X_res_df.loc[y_aligned==0, col], X_res_df.loc[y_aligned==1, col]])
    # Boxes are drawn at x=1 and x=2; label them via xticks instead of boxplot's
    # `labels=` keyword, which is deprecated in recent Matplotlib releases.
    plt.xticks([1, 2], ['No','Yes'])
    plt.title(f"{col} vs Churn")
    plt.show()


## 10. División entrenamiento / prueba (70/30) y escalado para train/test

In [None]:

from sklearn.model_selection import train_test_split

# Split the ORIGINAL (non-resampled) rows first, so the test set contains only real
# observations. Oversampling before the split puts synthetic points derived from
# test rows into training and synthetic rows into the test set, which inflates metrics.
X_all = df_ohe.drop(columns=[target_col]).reset_index(drop=True)
y_all = df_ohe[target_col].reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.3, random_state=42, stratify=y_all
)

# Balance the training portion only, honouring the notebook's apply_smote switch.
# Best-effort: if imblearn is missing or SMOTE rejects the data, train unbalanced.
if apply_smote:
    try:
        from imblearn.over_sampling import SMOTE
        X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)
        print('Train after SMOTE:', y_train.value_counts())
    except (ImportError, ValueError) as e:
        print('imblearn no disponible o error en SMOTE:', e)

# Fit the scaler on the training split only, then apply it to both splits.
scaler2 = StandardScaler().fit(X_train)
X_train_scaled = scaler2.transform(X_train)
X_test_scaled = scaler2.transform(X_test)
print('Train/Test shapes:', X_train.shape, X_test.shape)


## 11. Modelado: Logistic Regression (scaled) y Random Forest (unscaled)

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score


def _report(title, model, features):
    """Print the classification report and ROC AUC of a fitted model against y_test.

    Returns the predicted labels and the second column of predict_proba.
    """
    predicted = model.predict(features)
    positive_prob = model.predict_proba(features)[:,1]
    print(title)
    print(classification_report(y_test, predicted))
    print('ROC AUC:', roc_auc_score(y_test, positive_prob))
    return predicted, positive_prob


# Logistic Regression is fitted and evaluated on the standardized features.
log = LogisticRegression(max_iter=1000, random_state=42)
log.fit(X_train_scaled, y_train)
y_pred_log, y_prob_log = _report('Logistic Regression Report:', log, X_test_scaled)

# Random Forest is fitted and evaluated on the unscaled features.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf, y_prob_rf = _report('\nRandom Forest Report:', rf, X_test)


## 12. Importancia de variables y coeficientes

In [None]:

import pandas as pd
import matplotlib.pyplot as plt

# Take feature names from the table the models were fitted on.
feat_names = X_train.columns

# Random Forest: importances sorted from largest to smallest.
rf_imp = pd.Series(rf.feature_importances_, index=feat_names).sort_values(ascending=False)
plt.figure(figsize=(10,5))
rf_imp.head(15).plot(kind='bar')
plt.title('RF - Top 15 feature importances')
plt.show()

# Logistic Regression: coefficients of the single fitted row (binary target).
log_coef = pd.Series(log.coef_[0], index=feat_names).sort_values(ascending=False)
# Rank by magnitude so strongly negative coefficients are not left out of the
# top 15; the bars keep their sign, so the direction of each effect stays visible.
top_coef = log_coef.sort_values(key=lambda s: s.abs(), ascending=False).head(15)
plt.figure(figsize=(10,5))
top_coef.plot(kind='bar')
plt.title('Logistic - Top 15 coefficients')
plt.show()


## 13. Guardar modelos y preprocesado
Se guardan `models/logistic_model.joblib`, `models/random_forest_model.joblib` y `data/TelecomX_preprocessed.csv` cuando se ejecuta.

In [None]:

import joblib

# Persist both fitted models.
os.makedirs('models', exist_ok=True)
joblib.dump(log, 'models/logistic_model.joblib')
joblib.dump(rf, 'models/random_forest_model.joblib')
# The logistic model was fitted on features transformed by scaler2, so the scaler is
# saved as well; without it the saved model cannot be applied to new raw data.
joblib.dump(scaler2, 'models/scaler.joblib')
print('Models saved in models/ folder')

# The input CSV may have been found outside data/, so make sure the folder exists
# before writing the preprocessed table (features plus target) into it.
os.makedirs('data', exist_ok=True)
X_res_df.assign(**{target_col: y_res.values}).to_csv('data/TelecomX_preprocessed.csv', index=False)
print('Saved data/TelecomX_preprocessed.csv')


## 14. Informe — Conclusiones y Recomendaciones
Rellena este apartado con los insights obtenidos tras ejecutar el notebook: métricas por modelo, variables más importantes, y recomendaciones de negocio (segmentación, ofertas, mejora de métodos de pago, campañas de retención, etc.).