In [40]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, ConfusionMatrixDisplay, classification_report
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [41]:
# 1. Load the data from a URL (JSON)
url = 'https://raw.githubusercontent.com/ingridcristh/challenge2-data-science-LATAM/main/TelecomX_Data.json'
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of trying to parse an error page as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()
df = response.json()  # parsed JSON payload (not a DataFrame, despite the name)
# Flatten nested objects into dotted column names (e.g. "account.Charges.Total").
datos = pd.json_normalize(df)

In [42]:
# Columns that are cast to the pandas "category" dtype further down.
datos_columnas_categoricas = [
    "Churn",
    "customer.Partner",
    "customer.Dependents",
    "phone.PhoneService",
    "account.PaperlessBilling",
    "customer.gender",
    "phone.MultipleLines",
    "internet.InternetService",
    "internet.OnlineSecurity",
    "internet.OnlineBackup",
    "internet.DeviceProtection",
    "internet.TechSupport",
    "internet.StreamingTV",
    "internet.StreamingMovies",
    "account.Contract",
    "account.PaymentMethod",
]
# Columns that hold floating-point values.
datos_columnas_flotantes = [
    "account.Charges.Total",
]

In [43]:
# Store the listed categorical columns with the pandas "category" dtype.
datos[datos_columnas_categoricas] = datos[datos_columnas_categoricas].astype("category")
# Blank totals are read as 0 before the float cast. The regex matches any
# empty or whitespace-only string (not just a single space), so "" or "  "
# no longer makes the cast raise; every other value is left untouched.
datos["account.Charges.Total"] = (
    datos["account.Charges.Total"]
    .replace(r"^\s*$", "0", regex=True)
    .astype("float64")
)


In [44]:
# Derived column: the total charge divided by 30.
# NOTE(review): the name suggests a daily amount, but the input is the total
# charge, not a monthly one — confirm this is the intended source column.
datos["Cuentas.diarias"] = datos["account.Charges.Total"].div(30)

In [48]:
# Treat empty-string Churn labels as missing. The column is converted to plain
# objects first so the step does not rely on `replace` acting on categorical
# data, then cast back to "category" (now without an "" level).
churn_sin_vacios = datos["Churn"].astype("object")
churn_sin_vacios = churn_sin_vacios.mask(churn_sin_vacios == "")
datos["Churn"] = churn_sin_vacios.astype("category")

# account.Charges.Total was already cast to float64 in an earlier cell, so it
# holds no " " strings at this point and needs no replacement here.

# Drop the rows where Churn or Charges.Total is NaN
datos = datos.dropna(subset=['Churn', 'account.Charges.Total'])

In [49]:
# 2. Split into features (X) and target (y)
# Vectorised comparison: 1 where Churn is "Yes", 0 otherwise, as a plain
# integer Series whatever dtype Churn is stored with.
y = (datos["Churn"] == "Yes").astype(int)
# Every remaining object/category column of X is one-hot encoded later, so a
# per-row identifier would turn into one dummy column per customer.
# NOTE(review): assumes the identifier column is named "customerID" — confirm
# against the source JSON. errors="ignore" keeps this a no-op when it is absent.
X = datos.drop(columns=["Churn"]).drop(columns=["customerID"], errors="ignore")

In [51]:
# 3. Encoding of the categorical variables
categoricas = X.select_dtypes(include=["category", "object"]).columns
one_hot = make_column_transformer(
    # Two-level columns collapse to a single 0/1 indicator column.
    (OneHotEncoder(drop="if_binary"), categoricas),
    remainder="passthrough",  # remaining columns are passed through unchanged
    sparse_threshold=0        # always return a dense array
)

X_encoded = one_hot.fit_transform(X)
# Keep X's row labels: rows were dropped upstream, so a fresh 0..n-1 index
# would no longer line up with `y` in any label-aligned operation.
X_encoded = pd.DataFrame(
    X_encoded,
    columns=one_hot.get_feature_names_out(X.columns),
    index=X.index,
)


In [53]:
# 4. Normalisation (for KNN)
# NOTE(review): the scaler is fitted on every row of X_encoded, i.e. before
# the train/test split performed further down.
scaler = MinMaxScaler().fit(X_encoded)
X_normalizado = scaler.transform(X_encoded)


In [None]:
# 5. Churn proportion
# Relative frequency of each target class.
proporcion_churn = y.value_counts(normalize=True)
print("Proporción de churn:", proporcion_churn, sep="\n")

In [None]:
# 6. Correlation analysis
# Pairwise correlation between every encoded feature, drawn as a heatmap.
correlaciones = X_encoded.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlaciones, cmap="coolwarm", ax=ax)
ax.set_title("Matriz de Correlación")
plt.show()


In [None]:
# 7. Targeted analysis: tenure and account.Charges.Total
# One box plot per (column, title) pair, each split by the churn target.
graficos_por_churn = (
    ("customer.tenure", "Tenure vs Churn"),
    ("account.Charges.Total", "Gasto total vs Churn"),
)
for columna, titulo in graficos_por_churn:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=y, y=datos[columna])
    plt.title(titulo)
    plt.show()


In [None]:
# 8. Train/test split
# A single stratified split yields both the raw and the scaled feature sets,
# so they share rows by construction rather than through a repeated
# random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.3, stratify=y, random_state=5
)

# Fit the min-max scaler on the training rows only and reuse those statistics
# for the test rows; fitting on the full data set would leak the test set's
# value ranges into the features KNN is trained on.
escalador_entrenamiento = MinMaxScaler()
X_train_norm = escalador_entrenamiento.fit_transform(X_train)
X_test_norm = escalador_entrenamiento.transform(X_test)


In [None]:
# 9. Models
## Decision tree: trained and evaluated on the unscaled features
arbol = DecisionTreeClassifier(max_depth=5, random_state=5).fit(X_train, y_train)
y_pred_arbol = arbol.predict(X_test)

## KNN: trained and evaluated on the min-max scaled features
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_norm, y_train)
y_pred_knn = knn.predict(X_test_norm)


In [None]:
# 10. Evaluation
def evaluar_modelo(nombre, y_true, y_pred):
    """Print the evaluation summary of one classifier and plot its confusion matrix.

    Args:
        nombre: Label shown in the report header.
        y_true: True target values.
        y_pred: Predicted target values.

    Prints accuracy, precision, recall and F1 (four decimals each), shows the
    confusion matrix as a plot, then prints the classification report.
    Returns nothing.
    """
    metricas = (
        ("Exactitud", accuracy_score),
        ("Precisión", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )

    print(f"\nResultados para {nombre}:")
    for etiqueta, calcular in metricas:
        print(f"{etiqueta}: {calcular(y_true, y_pred):.4f}")

    print("\nMatriz de confusión:")
    matriz = confusion_matrix(y_true, y_pred)
    ConfusionMatrixDisplay(matriz).plot()
    plt.show()

    print("\nReporte completo:")
    print(classification_report(y_true, y_pred))

# Report both classifiers against the same held-out targets.
for nombre_modelo, predicciones in (
    ("Árbol de Decisión", y_pred_arbol),
    ("KNN", y_pred_knn),
):
    evaluar_modelo(nombre_modelo, y_test, predicciones)