In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report, roc_auc_score
from google.colab import drive

In [25]:
# Mount Google Drive inside the Colab filesystem; the dataset is read from this mount point.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Load the cleaned dataset from the mounted Drive folder and preview its first rows.
csv_path = (
    "/content/drive/MyDrive/machine learning/actividades ml/"
    "Proyecto 2 – Parte I (Core)/dataset1_clean.csv"
)
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [27]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        7043 non-null   int64  
 1   customerid        7043 non-null   object 
 2   gender            7043 non-null   object 
 3   seniorcitizen     7043 non-null   int64  
 4   partner           7043 non-null   object 
 5   dependents        7043 non-null   object 
 6   tenure            7043 non-null   int64  
 7   phoneservice      7043 non-null   object 
 8   multiplelines     7043 non-null   object 
 9   internetservice   7043 non-null   object 
 10  onlinesecurity    7043 non-null   object 
 11  onlinebackup      7043 non-null   object 
 12  deviceprotection  7043 non-null   object 
 13  techsupport       7043 non-null   object 
 14  streamingtv       7043 non-null   object 
 15  streamingmovies   7043 non-null   object 
 16  contract          7043 non-null   object 


# Preprocesamiento de Datos


In [28]:
# Encode the target "churn" as binary: 1 for 'Yes', 0 for anything else.
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would compare the already-encoded 0/1 values against 'Yes' and silently
# overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'Yes').astype(int)

In [29]:
# Feature selection: predictors are every column except the target and the
# two identifier-like columns; the target is the "churn" column.
non_feature_cols = ['churn', 'Unnamed: 0', 'customerid']
X = df.drop(columns=non_feature_cols)
y = df['churn']

In [30]:
# Group the predictor columns by dtype so each group can be preprocessed separately.
num_features = list(X.select_dtypes(include='number').columns)
cat_features = list(X.select_dtypes(include='object').columns)


In [31]:
# Preprocessing sub-pipelines, one per column type.

# Numeric columns: impute missing values with the median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' avoids an error on categories not seen during fit.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [32]:
# ColumnTransformer: route each column group through its matching sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ]
)

In [33]:
# Hold out 20% of the rows as a test set, stratified on the target,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

**Modelo 1: Regresión Logística**

In [34]:
# Full logistic-regression pipeline: shared preprocessing followed by the classifier.
pipeline_log = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", LogisticRegression(max_iter=100)),
    ]
)

In [35]:
# Tune the classifier's C parameter with 5-fold cross-validation, scored by accuracy.
params_log = {"classifier__C": [0.01, 0.1, 1, 10, 100]}

grid_log = GridSearchCV(
    estimator=pipeline_log,
    param_grid=params_log,
    cv=5,
    scoring="accuracy",
)
grid_log.fit(X_train, y_train)

In [37]:
# Results: best hyperparameters and their mean cross-validated accuracy.
log_best_params = grid_log.best_params_
log_cv_accuracy = grid_log.best_score_
print("Logistic Regression - Mejores parámetros:", log_best_params)
print(f"Logistic Regression - Accuracy(optimizado): {log_cv_accuracy}")

Logistic Regression - Mejores parámetros: {'classifier__C': 10}
Logistic Regression - Accuracy(optimizado): 0.8047542872272857


In [38]:
# Keep a handle on the winning pipeline from the grid search.
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the whole of X_train with the best parameters. Calling .fit() on it
# again would only repeat that work.
best_log = grid_log.best_estimator_
best_log

In [52]:
# Evaluate the tuned logistic-regression pipeline on the held-out test set.
y_pred = grid_log.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# The score is interpolated inside the f-string. Previously `{score}` was passed
# as a separate argument, which built a one-element set and printed it with braces.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

[[925 110]
 [165 209]]
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.80      0.80      1409

Accuracy: {0.8048261178140526}


**Modelo 2: KNN Classifier**

In [39]:
# KNN pipeline: shared preprocessing followed by a k-nearest-neighbours classifier.
# n_neighbors=5 is only the starting value; the grid search overrides it.
pipeline_knn = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", KNeighborsClassifier(n_neighbors=5)),
    ]
)

In [40]:
# Tune the number of neighbours (3 to 20 inclusive) with 5-fold CV, scored by accuracy.
# NOTE(review): the recorded run picked n_neighbors=20, the upper edge of this
# range, so a wider range may be worth trying.
knn_params = {"classifier__n_neighbors": list(range(3, 21))}

grid_knn = GridSearchCV(
    estimator=pipeline_knn,
    param_grid=knn_params,
    cv=5,
    scoring="accuracy",
)
grid_knn.fit(X_train, y_train)

In [41]:
# Results: best hyperparameters and their mean cross-validated accuracy.
knn_best_params = grid_knn.best_params_
knn_cv_accuracy = grid_knn.best_score_
print("KNN - Mejores parámetros:", knn_best_params)
print(f"KNN - Accuracy(optimizado): {knn_cv_accuracy}")

KNN - Mejores parámetros: {'classifier__n_neighbors': 20}
KNN - Accuracy(optimizado): 0.7965902339003407


In [51]:
# Evaluate the tuned KNN pipeline on the held-out test set.
y_pred = grid_knn.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# The score is interpolated inside the f-string. Previously `{score}` was passed
# as a separate argument, which built a one-element set and printed it with braces.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

[[901 134]
 [169 205]]
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      1035
           1       0.60      0.55      0.58       374

    accuracy                           0.78      1409
   macro avg       0.72      0.71      0.72      1409
weighted avg       0.78      0.78      0.78      1409

Accuracy: {0.7849538679914834}


**Modelo 3: Decision Tree Classifier**

In [42]:
# Decision-tree pipeline: shared preprocessing followed by a DecisionTreeClassifier.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at each split; without a seed, ties between equally good
# splits can resolve differently on each run, so the fitted tree and the
# reported metrics would not be reproducible.
pipeline_tree = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", DecisionTreeClassifier(random_state=42))
])

In [43]:
# Tune the tree's depth, split/leaf size limits and split criterion
# with 5-fold cross-validation, scored by accuracy.
param_grid = {
    "classifier__max_depth": [3, 5, 10, None],
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__min_samples_leaf": [1, 2, 4],
    "classifier__criterion": ["gini", "entropy"],
}

grid_tree = GridSearchCV(
    estimator=pipeline_tree,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)
grid_tree.fit(X_train, y_train)

In [44]:
# Results: best hyperparameters and their mean cross-validated accuracy.
tree_best_params = grid_tree.best_params_
tree_cv_accuracy = grid_tree.best_score_
print("Mejores parámetros:", tree_best_params)
print("Mejor accuracy:", tree_cv_accuracy)

Mejores parámetros: {'classifier__criterion': 'entropy', 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}
Mejor accuracy: 0.7919725894837045


In [50]:
# Evaluate the tuned decision-tree pipeline on the held-out test set.
y_pred = grid_tree.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# The score is interpolated inside the f-string. Previously `{score}` was passed
# as a separate argument, which built a one-element set and printed it with braces.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

[[912 123]
 [162 212]]
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1035
           1       0.63      0.57      0.60       374

    accuracy                           0.80      1409
   macro avg       0.74      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409

Accuracy: {0.7977288857345636}


# Conclusiones

| Modelo                  | Accuracy | Precision (clase 1) | Recall (clase 1) | F1-score (clase 1) |
| ----------------------- | -------- | ------------------- | ---------------- | ------------------ |
| **Logistic Regression** | 0.8048   | 0.66                | 0.56             | 0.60               |
| **KNN Classifier**      | 0.7849   | 0.60                | 0.55             | 0.58               |
| **Decision Tree**       | 0.7977   | 0.63                | 0.57             | 0.60               |


 1. **Accuracy:**

Regresión Logística logra la mejor exactitud general (accuracy) con 80.48%, ligeramente superior a la del Árbol de Decisión (79.77%) y a la de KNN (78.49%).

2. **Clase Minoritaria (1): Recall y F1**

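F1-score para clase 1 (balance entre precision y recall):
* Logistic Regression y Decision Tree empatan en 0.60.
* KNN tiene un rendimiento menor con 0.58.

Recall para clase 1: los tres modelos son muy similares (Decision Tree 0.57, Logistic Regression 0.56, KNN 0.55); es decir, ninguno detecta mucho más de la mitad de los clientes que abandonan.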

**Clase Mayoritaria (0)**
* Todos los modelos predicen bastante bien la clase 0.
* Precision y recall de 0.84 o más en todos los casos.

**Recomendaciones**

Usar Logistic Regression o Decision Tree, ya que ambos ofrecen un buen balance entre exactitud general (accuracy) y desempeño en la clase minoritaria.

Para mayor interpretabilidad, Regresión Logística es mejor (coeficientes claros).
Para una mejor flexibilidad y manejo de relaciones **no lineales**, Decision Tree es más recomendable, ya que puede capturar patrones más complejos.

En ningún caso sería recomendable usar KNN Classifier, ya que tiene el rendimiento más bajo.