In [1]:
# Lab | Cross Validation

#For this lab, we will build a model on customer churn binary classification problem. You will be using `files_for_lab/Customer-Churn.csv` file.



### Instructions

#1. Apply SMOTE for upsampling the data

    #- Use logistic regression to fit the model and compute the accuracy of the model.
    #- Use decision tree classifier to fit the model and compute the accuracy of the model.
    #- Compare the accuracies of the two models.


#2. Apply TomekLinks for downsampling

    #- It is important to remember that it does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.
    #- Use logistic regression to fit the model and compute the accuracy of the model.
    #- Use decision tree classifier to fit the model and compute the accuracy of the model.
    #- Compare the accuracies of the two models.
    #- You can also apply this algorithm one more time and check the how the imbalance in the two classes changed from the last time.

In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
data_cross = pd.read_csv('Customer-Churn.txt')
data_cross.head(8)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes
5,Female,0,No,No,8,Yes,No,No,Yes,No,Yes,Yes,Month-to-month,99.65,820.5,Yes
6,Male,0,No,Yes,22,Yes,No,Yes,No,No,Yes,No,Month-to-month,89.1,1949.4,No
7,Female,0,No,No,10,No,Yes,No,No,No,No,No,Month-to-month,29.75,301.9,No


In [4]:
data_cross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [5]:
def identify_column_types(df):
    """
    Identifica y devuelve las columnas numéricas y categóricas de un DataFrame.
    
    Parámetros:
    df (pd.DataFrame): El DataFrame a analizar.
    
    Retorna:
    dict: Un diccionario con dos claves 'numerical' y 'categorical', cada una conteniendo una lista de nombres de columnas.
    """
    numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    
    return {
        'numerical': numerical_cols,
        'categorical': categorical_cols
    }

# Ejemplo de uso
# Asume que churnData es tu DataFrame
column_types = identify_column_types(data_cross)
print("Columnas numéricas:", column_types['numerical'])
print("Columnas categóricas:", column_types['categorical'])

Columnas numéricas: ['SeniorCitizen', 'tenure', 'MonthlyCharges']
Columnas categóricas: ['gender', 'Partner', 'Dependents', 'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'TotalCharges', 'Churn']


In [6]:
data_cross.columns = data_cross.columns.str.lower().str.replace(' ', '_')
data_cross.tail()

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,monthlycharges,totalcharges,churn
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.8,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.2,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.6,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.4,306.6,Yes
7042,Male,0,No,No,66,Yes,Yes,No,Yes,Yes,Yes,Yes,Two year,105.65,6844.5,No


In [7]:
# Convertir la columna 'TotalCharges' a tipo numérico
data_cross['totalcharges'] = pd.to_numeric(data_cross['totalcharges'], errors='coerce')
# Verificar si hay valores NaN
print(data_cross['totalcharges'].isna().sum())

11


In [8]:
data_cross.totalcharges.value_counts()

totalcharges
20.20      11
19.75       9
20.05       8
19.90       8
19.65       8
           ..
6849.40     1
692.35      1
130.15      1
3211.90     1
6844.50     1
Name: count, Length: 6530, dtype: int64

In [9]:
# Filtrar las filas que tienen valores nulos en la columna 'totalcharges'
null_rows = data_cross[data_cross['totalcharges'].isnull()]
null_rows

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,monthlycharges,totalcharges,churn
488,Female,0,Yes,Yes,0,No,Yes,No,Yes,Yes,Yes,No,Two year,52.55,,No
753,Male,0,No,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.25,,No
936,Female,0,Yes,Yes,0,Yes,Yes,Yes,Yes,No,Yes,Yes,Two year,80.85,,No
1082,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.75,,No
1340,Female,0,Yes,Yes,0,No,Yes,Yes,Yes,Yes,Yes,No,Two year,56.05,,No
3331,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,19.85,,No
3826,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.35,,No
4380,Female,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.0,,No
5218,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,19.7,,No
6670,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,Yes,Yes,No,Two year,73.35,,No


In [10]:
# Identificar las filas donde 'totalcharges' es NaN
null_totalcharges = data_cross['totalcharges'].isnull()

# Multiplicar los valores de 'monthlycharges' por 24 en las filas donde 'totalcharges' es NaN
data_cross.loc[null_totalcharges & (data_cross.index != 5218), 'monthlycharges'] *= 24

# Multiplicar por 12 en la fila 5218
data_cross.loc[5218, 'monthlycharges'] *= 12

# Rellenar los valores NaN en 'totalcharges' con los nuevos valores de 'monthlycharges'
data_cross['totalcharges'].fillna(data_cross['monthlycharges'], inplace=True)

# Verificar que ya no hay valores NaN en 'totalcharges'
print(data_cross['totalcharges'].isnull().sum())

0


In [11]:
#Como en el lab anterior decidi remplazar los 11 nulos de la columna totalcharges por la multiplicacion de monthlycharges con la columna contract que indica de cuanrto es el contrato firmado, me parecio el metedo que mas se puede acercar a la realidad.

In [12]:
#1. Apply SMOTE for upsampling the data

    #- Use logistic regression to fit the model and compute the accuracy of the model.
    #- Use decision tree classifier to fit the model and compute the accuracy of the model.
    #- Compare the accuracies of the two models.

In [13]:
X = data_cross[['tenure', 'seniorcitizen', 'monthlycharges', 'totalcharges']]
y = data_cross['churn']

In [14]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(X, y)

In [29]:
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

churn
No     5174
Yes    5174
Name: count, dtype: int64

In [31]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=42)

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Inicializar y entrenar el modelo
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Predecir y evaluar el modelo
y_pred_log_reg = log_reg.predict(X_test)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print(f'Logistic Regression Accuracy: {accuracy_log_reg:.4f}')

Logistic Regression Accuracy: 0.7362


In [35]:
from sklearn.tree import DecisionTreeClassifier

# Inicializar y entrenar el modelol
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

# Predecir y evaluar el modelo
y_pred_tree_clf = tree_clf.predict(X_test)
accuracy_tree_clf = accuracy_score(y_test, y_pred_tree_clf)
print(f'Decision Tree Classifier Accuracy: {accuracy_tree_clf:.4f}')

Decision Tree Classifier Accuracy: 0.7366


In [41]:
modelo1 = LogisticRegression()
modelo2 = DecisionTreeClassifier()

from sklearn.model_selection import cross_val_score

model_pipeline = [modelo1, modelo2]
model_names = ['Logistic Regression', 'Decision Tree Classifier']
scores = {}
i=0

for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Logistic Regression': 0.7362, 'Decision Tree Classifier': 0.7366}
model_pipeline = [modelo1, modelo2]
model_names = ['Logistic Regression', 'Decision Tree Classifier']
scores = {}
i=0

for model in model_pipeline:
    var_score = np.var(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_names[i]] = var_score
    i = i+1
print(scores)

{'Logistic Regression': 0.7259441798437798, 'Decision Tree Classifier': 0.7314627548104401}
{'Logistic Regression': 0.0002560800694294398, 'Decision Tree Classifier': 0.00012518502971889414}


In [44]:
#Logistic Regression: Precisión de 0.7362
#Decision Tree Classifier: Precisión de 0.7366

#Conclusiones:
#Similitud en el rendimiento: Ambos modelos presentan una precisión muy similar, con una diferencia mínima. 
#Esto sugiere que, en este caso, ambos algoritmos son igualmente efectivos para predecir la variable de interés después de aplicar SMOTE.

#En resumen, ambos modelos son válidos, pero la regresión logística podría ser preferida si se busca una mayor interpretabilidad.
#Mientras que el árbol de decisión podría ser útil si se necesita explorar relaciones más complejas entre las variables.

In [46]:
#2. Apply TomekLinks for downsampling

    #- It is important to remember that it does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.
    #- Use logistic regression to fit the model and compute the accuracy of the model.
    #- Use decision tree classifier to fit the model and compute the accuracy of the model.
    #- Compare the accuracies of the two models.
    #- You can also apply this algorithm one more time and check the how the imbalance in the two classes changed from the last time.

In [48]:
X = data_cross[['tenure', 'seniorcitizen', 'monthlycharges', 'totalcharges']]
y = data_cross['churn']

In [50]:
from imblearn.under_sampling import TomekLinks

# Aplicar TomekLinks para el submuestreo
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(X, y)

In [52]:
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

churn
No     5174
Yes    5174
Name: count, dtype: int64

In [54]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [56]:
# Inicializar y entrenar el modelo
log_reg2 = LogisticRegression(max_iter=1000, random_state=42)
log_reg2.fit(X_train, y_train)

# Predecir y evaluar el modelo
y_pred_log_reg = log_reg2.predict(X_test)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print(f'Logistic Regression Accuracy: {accuracy_log_reg:.4f}')

Logistic Regression Accuracy: 0.7744


In [58]:
# Inicializar y entrenar el modelol
tree_clf2 = DecisionTreeClassifier(random_state=42)
tree_clf2.fit(X_train, y_train)

# Predecir y evaluar el modelo
y_pred_tree_clf = tree_clf2.predict(X_test)
accuracy_tree_clf = accuracy_score(y_test, y_pred_tree_clf)
print(f'Decision Tree Classifier Accuracy: {accuracy_tree_clf:.4f}')

Decision Tree Classifier Accuracy: 0.7452


In [60]:
modelo3 = LogisticRegression()
modelo4 = DecisionTreeClassifier()

model_pipeline = [modelo3, modelo4]
model_names = ['Logistic Regression', 'Decision Tree Classifier']
scores = {}
i=0

for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Logistic Regression': 0.7744, 'Decision Tree Classifier': 0.7452}
model_pipeline = [modelo3, modelo4]
model_names = ['Logistic Regression', 'Decision Tree Classifier']
scores = {}
i=0

for model in model_pipeline:
    var_score = np.var(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_names[i]] = var_score
    i = i+1
print(scores)

{'Logistic Regression': 0.7994971839336001, 'Decision Tree Classifier': 0.7544308581591819}
{'Logistic Regression': 0.0001688286320195139, 'Decision Tree Classifier': 0.00021498972562920008}


In [62]:
#Basándonos en las precisiones reportadas, observamos que el modelo de regresión logística tuvo una precisión ligeramente superior (77.44%) en comparación con el modelo de árbol de decisión (74.52%). 
#Esto sugiere que, para este conjunto de datos y después de aplicar TomekLinks para el submuestreo, el modelo de regresión logística podría ser más adecuado para predecir la variable objetivo. 
#Sin embargo, la diferencia en precisión entre los dos modelos no es significativa, por lo que otros factores como la interpretabilidad del modelo y el costo computacional podrían influir en la elección del modelo final.