# Predicción de Churn con Regresión Logística

In [1]:
# Librerías

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# datos
data=pd.read_csv("https://raw.githubusercontent.com/Albertuff/Machine-Learning/master/datos/Telco-Customer-Churn.csv")
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [4]:
data.shape

(7043, 21)

In [5]:
del data["customerID"]


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [7]:
# Totalcharges es de tipo numerica, sin embargo están como tipo objeto. Vamos a convertir la variable a numerica
data.TotalCharges=pd.to_numeric(data.TotalCharges,errors="coerce")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [8]:
# La variable target debe ser de tipo numerica, la transformamos:
data.Churn=(data.Churn=="Yes").astype(int)      # Si la condicion es verdadera el tipo de dato es 1, si es falsa entonces devuelve un cero
data.SeniorCitizen=(data.SeniorCitizen==1).astype(bool)
data=data.fillna(0)     

In [9]:
# Vamos a poner orden en los nombres de los atributos y en las respuestas de los atributos
# Eliminamos las altas-bajas y concatenamos con _
data.columns=data.columns.str.lower().str.replace(" ","_")
data.columns

Index(['gender', 'seniorcitizen', 'partner', 'dependents', 'tenure',
       'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod',
       'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [10]:
data.head()

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,Female,False,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,False,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,False,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,False,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,False,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [11]:
continuas=data[["tenure","monthlycharges","totalcharges"]]

In [12]:
# Preparamos al conjunto de datos en entrenamiento y prueba
train,test=train_test_split(data,train_size=0.2,random_state=1234)

# Definimos el target
y_train=train.churn

In [13]:
# Proporcion de abandono

churn_ratio=data.churn.mean()
churn_ratio.round(3)

0.265

¿Cómo incorporamos la información adicional que tenemos en la tarea predictiva

Atributos de importancia
¿Que atributos tiene un cliente que cancela su servicio?

In [14]:
# Influye el genero en la cancelación del servicio?

ratio_churn_mujeres=data[data.gender=="Female"].churn.mean()
print("Proporcion de mujeres que cancelan: ",ratio_churn_mujeres)
ratio_churn_hombres=data[data.gender=="Male"].churn.mean()
print("Proporcion de hombres que cancelan: ",ratio_churn_hombres)


# No se ven diferencias entre hombres y mujeres

Proporcion de mujeres que cancelan:  0.26920871559633025
Proporcion de hombres que cancelan:  0.2616033755274262


In [15]:
ratio_churn_partner_y=data[data.partner=="Yes"].churn.mean()
print(" Proporcion de clientes que viven con alguien mas y cancelan: ",ratio_churn_partner_y)
ratio_churn_partner_n=data[data.partner=="No"].churn.mean()
print("Proporcion de clientes que no viven con alguien mas y que cancelan: ",ratio_churn_partner_n)

 Proporcion de clientes que viven con alguien mas y cancelan:  0.1966490299823633
Proporcion de clientes que no viven con alguien mas y que cancelan:  0.32957978577313923


In [16]:
# Otra forma de identificar los atributos de interes es a través del riesgo.
# Riesgo de abandono de mujeres: es comparar la proporcion de abandono en mujeres con la proporcion global.
# Riesgo de abandono en ("atributo") = proporcion de ("atributo") que cancelan dividida por la proporción de abandono en la base de datos.
data[data.gender=="Female"].churn.mean()/data.churn.mean()

# El grupo de mujeres no es un grupo de riesgo 

1.014466016021912

In [17]:
data[data.partner=="No"].churn.mean()/data.churn.mean()
# Las personas que no viven con alguien mas si es un grupo de riesgo

1.241963847619165

In [18]:
categoricas=data[["gender","seniorcitizen","partner","dependents","phoneservice","internetservice","onlinesecurity","onlinebackup","deviceprotection","techsupport","streamingmovies","contract","paperlessbilling","paymentmethod"]]
categoricas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   gender            7043 non-null   object
 1   seniorcitizen     7043 non-null   bool  
 2   partner           7043 non-null   object
 3   dependents        7043 non-null   object
 4   phoneservice      7043 non-null   object
 5   internetservice   7043 non-null   object
 6   onlinesecurity    7043 non-null   object
 7   onlinebackup      7043 non-null   object
 8   deviceprotection  7043 non-null   object
 9   techsupport       7043 non-null   object
 10  streamingmovies   7043 non-null   object
 11  contract          7043 non-null   object
 12  paperlessbilling  7043 non-null   object
 13  paymentmethod     7043 non-null   object
dtypes: bool(1), object(13)
memory usage: 722.3+ KB


In [19]:
# Como obtenemos todos los riesgos para los atributos en el dataframe?

from IPython.display import display     # Desplegar los dataframes

#Proporcion de abandono
ratio=data.churn.mean()

for atributos in categoricas:
    df_atributos=data.groupby(by=atributos).churn.agg(["mean"])
    df_atributos["Riesgo"]=df_atributos["mean"]/ratio
    display(df_atributos)



Unnamed: 0_level_0,mean,Riesgo
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.269209,1.014466
Male,0.261603,0.985807


Unnamed: 0_level_0,mean,Riesgo
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.236062,0.889557
True,0.416813,1.570686


Unnamed: 0_level_0,mean,Riesgo
partner,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.32958,1.241964
Yes,0.196649,0.741038


Unnamed: 0_level_0,mean,Riesgo
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.312791,1.1787
Yes,0.154502,0.582215


Unnamed: 0_level_0,mean,Riesgo
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.249267,0.939319
Yes,0.267096,1.006506


Unnamed: 0_level_0,mean,Riesgo
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1
DSL,0.189591,0.714441
Fiber optic,0.418928,1.578656
No,0.07405,0.279044


Unnamed: 0_level_0,mean,Riesgo
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.417667,1.573906
No internet service,0.07405,0.279044
Yes,0.146112,0.550597


Unnamed: 0_level_0,mean,Riesgo
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.399288,1.504645
No internet service,0.07405,0.279044
Yes,0.215315,0.811377


Unnamed: 0_level_0,mean,Riesgo
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.391276,1.474456
No internet service,0.07405,0.279044
Yes,0.225021,0.847951


Unnamed: 0_level_0,mean,Riesgo
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.416355,1.56896
No internet service,0.07405,0.279044
Yes,0.151663,0.571517


Unnamed: 0_level_0,mean,Riesgo
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.336804,1.269188
No internet service,0.07405,0.279044
Yes,0.299414,1.128291


Unnamed: 0_level_0,mean,Riesgo
contract,Unnamed: 1_level_1,Unnamed: 2_level_1
Month-to-month,0.427097,1.60944
One year,0.112695,0.424672
Two year,0.028319,0.106714


Unnamed: 0_level_0,mean,Riesgo
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.163301,0.615371
Yes,0.335651,1.264842


Unnamed: 0_level_0,mean,Riesgo
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1
Bank transfer (automatic),0.167098,0.629681
Credit card (automatic),0.152431,0.57441
Electronic check,0.452854,1.706502
Mailed check,0.191067,0.720003


#### Este análisis nos sirve para identificar los atributos que mejor discriminan a los clientes que abandonan y los que no abandonan
#### Ahora la pregunta es: ¿Que atributos son mas importantes? 
#### ¿Cuáles son mas informativos, que pueda extraer mas información de clientes que cancelan?

#### La siguiente métrica nos ayuda a responder la pregunta anterior, y se llama información mutua (mutual information, MI). Esta métrica mide que tanta información podemos obtener de una variable (churn) cuando conocemos el valor de la otra variable (ejemplo: tipo de contrato).

In [20]:
from sklearn.metrics import mutual_info_score

# Que tanta información puedo extraer de churn a partir de partner
mutual_info_score(data.partner,data.churn)

0.011453657253317984

In [21]:
# Que atributos son mas informativos?
# Definimos una función
def MI(atributo):
    return  mutual_info_score(atributo,data.churn)

df_info_mutua=data[categoricas].apply(MI)
df_info_mutua

ValueError: Boolean array expected for the condition, not object

In [None]:
# Importante: score de MI siki es oara atributos categoricos, y para las continuas?
# Para las continuas empleamos la correlacion


In [28]:
from sklearn.linear_model import LogisticRegression

In [33]:
from sklearn.feature_extraction import DictVectorizer


test_dict=test[categoricas+continuas].to_dict(orient="records")
dv=DictVectorizer(sparse=False)
dv.fit(test_dict)
X_test=dv.transform(test_dict)
X_test                                  # Conjunto de prueba

# Una vez mas tenemos los datos en una matris, hacemos las predicciones 
y_predict=model.predict(X_test)
y_predict
print(classification_report(y_test,y_predict))

NameError: name 'model' is not defined

In [29]:
# Para hacer la validación necesitamos pasar todos los datos a matriz
data_dict=test[categoricas+continuas].to_dict(orient="records")
dv=DictVectorizer(sparse=False)
dv.fit(data.dict)
X=dv.transform(data_dict)
X            
Y=data.churn

NameError: name 'DictVectorizer' is not defined

In [34]:
# El modelo que construimos tiene una exactitud del 80%, una sensibilidad del 52% y una especificidad del 89%
# Ojo: Estos valores dependen del conjunto de prueba y entrenamiento que elejimos. Para resolver este problema vamos a hacer una validación cruzada para tener una métrica más estable

from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

# Definimos el tipo de validación cruzada 
cv=KFold(n_splits=10,random_state=1234,shuffle=True)

scores=cross_val_score(modelo,X,Y,scoring="accuracy",n_jobs=-1)
scores

# Una métrica más estable
print(f"Exactitud = {scores.mean():.3f}  +/-  {scores.std():.3f}")

NameError: name 'modelo' is not defined

In [30]:
# Vamos a ver la interpretación de los coeficientes
# Seleccionar solo por ahora algunos de los atributos

small=data[["contract","tenure","totalcharges"]]
data_dict=train[small].to_dict(orient="records")
dv=DictVectorizer(sparse=False)
dv.fit(data.dict)
X_small_train=dv.transform(data_dict)
X_small_train
y_train

# Definir el modelo reducido
modelo_small=LogisticRegression(solver="liblinear")

# Entrenamos el modelo reducido
modelo_small.fit(X_small_train,y_train)

print("Término independiente de la regresión logistíca", modelo_small.intercept_[0])

ValueError: Boolean array expected for the condition, not int64

TypeError: list indices must be integers or slices, not tuple