# Importamos Librerías

In [None]:
# Datos
import numpy as np
import pandas as pd

# Visualizacion
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from random import random

# ML
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import xgboost as xgb

# Code to read csv file into Colaboratory:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and build a PyDrive client from the
# application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): the name `drive` is rebound by `from google.colab import drive`
# in the data-loading cell, so this PyDrive client is no longer reachable
# under that name afterwards -- confirm whether it is still needed.
drive = GoogleDrive(gauth)

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides library diagnostics (for example
# solver convergence or deprecation warnings) -- consider narrowing the filter.
warnings.filterwarnings("ignore")

# Importamos los Datos

In [None]:
# Mount Google Drive at /content/drive so the dataset can be read by file path.
# This import rebinds the name `drive` to the google.colab module.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the raw churn dataset from the mounted Google Drive folder.
churn_csv_path = "/content/drive/MyDrive/NuclioDSC/TFM/churn.csv"
churn = pd.read_csv(churn_csv_path)

# Analizamos los Datos

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# Clean the data: the customer identifier carries no predictive signal.
churn = churn.drop(columns='customerID')

# Encode the binary service/target columns as 1/0.
# Only the literal 'Yes' maps to 1; every other value (e.g. 'No',
# 'No internet service') maps to 0.
yes_no_columns = ['Partner', 'Dependents', 'PhoneService', 'OnlineSecurity',
                  'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                  'StreamingMovies', 'PaperlessBilling', 'Churn']
for column in yes_no_columns:
    churn[column] = churn[column].eq('Yes').astype('int64')

# TotalCharges is read as text because some rows hold a blank instead of a
# number. Treat any empty / whitespace-only entry as 0 and parse the rest as
# float; genuinely malformed values still raise so they are not hidden.
total_charges_text = churn['TotalCharges'].astype(str).str.strip().replace('', '0')
churn['TotalCharges'] = total_charges_text.astype(float)

# One-hot encode the multi-level categorical columns, dropping the first level
# of each to avoid perfectly collinear dummies. `dtype=int` keeps the dummies
# as 0/1 integers; without it pandas >= 2.0 produces boolean columns.
churn = pd.get_dummies(
    churn,
    columns=['gender', 'MultipleLines', 'InternetService', 'Contract', 'PaymentMethod'],
    drop_first=True,
    dtype=int,
)

churn

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,gender_Male,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,34,1,1,0,1,0,0,...,1,0,0,0,0,1,0,0,0,1
2,0,0,0,2,1,1,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0,0,0,45,0,1,0,1,1,0,...,1,1,0,0,0,1,0,0,0,0
4,0,0,0,2,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,24,1,1,0,1,1,1,...,1,0,1,0,0,1,0,0,0,1
7039,0,1,1,72,1,0,1,1,0,1,...,0,0,1,1,0,1,0,1,0,0
7040,0,1,1,11,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
7041,1,1,0,4,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,1


# Implementamos el Modelo

In [None]:
# Logistic regression is used as the churn classifier.

# Prepare the data: hold out 30% for testing and stratify on the target so the
# churn rate is the same in both partitions. The seed keeps the split
# reproducible.
features = churn.drop(columns='Churn')
target = churn['Churn']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, stratify=target, random_state=101)

# Train the model. The features are not scaled, so the default budget of 100
# solver iterations is likely to stop before convergence (and the resulting
# warning is silenced globally in this notebook); allow more iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

LogisticRegression()

# Aplicamos la Métrica de Evaluación

## Analizamos otras métricas de evaluación

In [None]:
# Bring in the helper that summarises precision, recall and F1 per class.
from sklearn.metrics import classification_report

# Predict the class of every test observation, then print the per-class report.
predictions = model.predict(X_test)
report_text = classification_report(y_test, predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1552
           1       0.64      0.53      0.58       561

    accuracy                           0.80      2113
   macro avg       0.74      0.71      0.72      2113
weighted avg       0.79      0.80      0.79      2113



Hay que observar que la capacidad de predecir el target (la clase 1 -fila del 1) es baja. La idea es analizar si el modelo genera algún tipo de beneficio para las predicciones del negocio. Para esto calcularemos la probabilidad de predicción del modelo.

## Calculamos la probabilidad de predicción

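La probabilidad de predicción es la probabilidad, estimada por el modelo, de que la variable dependiente (el target) tome un valor específico (aquí, Churn = 1) dados los valores de las variables independientes de una observación. Además permite medir el impacto de una variable independiente: basta observar cómo cambia esa probabilidad cuando dicha variable cambia y las demás se mantienen fijas.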

> Such predicted probabilities permit a characterization of the magnitude of the impact of any independent variable, Xi, on P(Y=1∣X) through the calculation of the change in the predicted probability that Y equals 1 that results when Xi is increased from one value to another while the other independent variables are fixed at specified values. Ver https://www.sciencedirect.com/topics/mathematics/predicted-probability

In [None]:
# Compute the predicted probability of the positive class (churn = 1).
# Only the columns the model was trained on are passed to the model, so this
# cell can be re-run even after 'Prob', 'Churn' or 'Decile' have been added to
# X_test (otherwise predict_proba fails on the extra columns).
feature_columns = X_train.columns
X_test['Prob'] = model.predict_proba(X_test[feature_columns])[:, 1]

# Sort from most to least likely to churn and attach the true label.
# The assignment aligns on the index, so sorting first is safe.
X_test = X_test.sort_values(by='Prob', ascending=False)
X_test['Churn'] = y_test

# Build a small frame to inspect the target next to its predicted probability.
df1 = X_test[['Churn', 'Prob']].copy()
df1

Unnamed: 0,Churn,Prob
5989,1,0.873756
1976,1,0.871818
2208,1,0.867070
1410,1,0.861796
301,1,0.857102
...,...,...
6528,0,0.001537
6614,0,0.001333
4732,0,0.001323
109,0,0.001266


In [None]:
# Split the test set into ten equal-frequency bins of predicted probability.
# Labels run from 10 down to 1, so the lowest-probability bin is decile 10 and
# the highest-probability bin is decile 1.
decile_labels = list(range(10, 0, -1))
X_test['Decile'] = pd.qcut(X_test['Prob'], 10, labels=decile_labels)

# Frame for inspecting the label, the probability and the assigned decile.
columns_to_show = ['Churn', 'Prob', 'Decile']
df2 = X_test[columns_to_show].copy()
df2

Unnamed: 0,Churn,Prob,Decile
5989,1,0.873756,1
1976,1,0.871818,1
2208,1,0.867070,1
1410,1,0.861796,1
301,1,0.857102,1
...,...,...,...
6528,0,0.001537,10
6614,0,0.001333,10
4732,0,0.001323,10
109,0,0.001266,10


In [None]:
# Number of responses: how many true positives (Churn == 1) fall in each decile.
res = (
    pd.crosstab(X_test['Decile'], X_test['Churn'])[1]
    .reset_index()
    .rename(columns={1: 'Number of Responses'})
)

# Number of cases: how many observations fall in each decile.
# Naming the index and the value column explicitly gives the same
# ['Decile', 'Number of Cases'] layout on every pandas version; relying on the
# default 'index' column name breaks on pandas >= 2.0, where reset_index()
# yields 'Decile' / 'count' instead.
cases = (
    X_test['Decile']
    .value_counts(sort=False)
    .rename_axis('Decile')
    .reset_index(name='Number of Cases')
)

# 'Decile' is an ordered categorical whose categories run 10, 9, ..., 1, so a
# descending sort puts decile 1 (highest predicted probability) first.
lg = (
    pd.merge(cases, res, on='Decile')
    .sort_values(by='Decile', ascending=False)
    .reset_index(drop=True)
)

lg

Unnamed: 0,Decile,Number of Cases,Number of Responses
0,1,212,154
1,2,211,129
2,3,211,89
3,4,211,64
4,5,211,41
5,6,212,30
6,7,211,26
7,8,211,16
8,9,211,5
9,10,212,7


En el paso anterior obtuvimos el **número de casos** (número de datos en cada decil) y el **número de respuestas** (el número de datos positivos en cada decil).

Con esta información es posible calcular el Gain Number

## Calculamos el Gain Number

El Gain Number es la razón entre el acumulado del número de respuestas (los datos positivos) hasta cada decil y el total de observaciones positivas en los datos.

En otras palabras, es el porcentaje de targets cubiertas en un decil en específico. Por ejemplo, el 80% de targets estuvieron en el top 20% de los datos. En este caso podemos decir que enviando un e-mail al 20% de los clientes nos permitirá alcanzar al 80% de los clientes que son más propensos a comprar el producto. Véase https://www.listendata.com/2014/08/excel-template-gain-and-lift-charts.html

In [None]:
# Running total of positives from the top decile downwards.
total_responses = lg['Number of Responses'].sum()
lg['Cumulative Responses'] = lg['Number of Responses'].cumsum()

# Share of all positives that falls in each decile, as a percentage.
lg['% of Events'] = np.round((lg['Number of Responses'] / total_responses) * 100, 2)

# Gain: cumulative share of positives captured up to each decile.
# Computed from the cumulative counts rather than by summing the rounded
# per-decile percentages, so rounding error does not accumulate and the last
# decile is exactly 100.
lg['Gain'] = np.round((lg['Cumulative Responses'] / total_responses) * 100, 2)

lg

Unnamed: 0,Decile,Number of Cases,Number of Responses,Cumulative Responses,% of Events,Gain
0,1,212,154,154,27.45,27.45
1,2,211,129,283,22.99,50.44
2,3,211,89,372,15.86,66.3
3,4,211,64,436,11.41,77.71
4,5,211,41,477,7.31,85.02
5,6,212,30,507,5.35,90.37
6,7,211,26,533,4.63,95.0
7,8,211,16,549,2.85,97.85
8,9,211,5,554,0.89,98.74
9,10,212,7,561,1.25,99.99


## Calculamos el Lift

In [None]:
# Lift: cumulative gain divided by the share of the population selected so far.
# Each decile is taken as 10% of the data, so decile k corresponds to k * 10 %.
lg['Decile'] = lg['Decile'].astype('int')
population_pct = lg['Decile'] * 10
lg['lift'] = np.round(lg['Gain'] / population_pct, 2)

lg

Unnamed: 0,Decile,Number of Cases,Number of Responses,Cumulative Responses,% of Events,Gain,lift
0,1,212,154,154,27.45,27.45,2.74
1,2,211,129,283,22.99,50.44,2.52
2,3,211,89,372,15.86,66.3,2.21
3,4,211,64,436,11.41,77.71,1.94
4,5,211,41,477,7.31,85.02,1.7
5,6,212,30,507,5.35,90.37,1.51
6,7,211,26,533,4.63,95.0,1.36
7,8,211,16,549,2.85,97.85,1.22
8,9,211,5,554,0.89,98.74,1.1
9,10,212,7,561,1.25,99.99,1.0


Podemos interpretar el lift como la razón entre el Gain acumulado hasta un decil (el porcentaje de positivos capturados usando el modelo) y el porcentaje de positivos que se esperaría capturar seleccionando al azar esa misma proporción de datos (sin el modelo).

> The lift could be interpreted as the gain ratio percentage to the random percentage at a given decile level. Tomado de: https://towardsdatascience.com/model-benefit-evaluation-with-lift-and-gain-analysis-4b69f9288ab3

Para el caso del dataset esto se puede interpretar como que, al seleccionar el 20% de los datos basándonos en el modelo, encontramos 2.52 veces más positivos (reales) que seleccionando la misma cantidad de datos al azar sin el modelo.