# Ejemplo de Regresión Logística

Los datos fueron descargados de: https://www.kaggle.com/blastchar/telco-customer-churn

In [30]:
# Importar las librerías necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from IPython.display import Image
import pydotplus # Si no lo tienen instalado: conda install -c conda-forge pydotplus


### Importamos los datos

In [31]:
data = pd.read_csv('./../../dataset/telcoChurn.csv')

In [32]:
data.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [52]:
#Imputamos los nulos que figuran como "vacios"
data['TotalCharges'] = data['TotalCharges'].replace(' ',-1).astype(float)

### Convertirmos las variables categóricas a Dummies

In [53]:
# Seleccionamos las variables categóricas
cat_vars = ['gender', 'Partner', 'Dependents', 'PhoneService','MultipleLines', 'InternetService',
           'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
           'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
           'PaymentMethod']

In [54]:
# Iteramos sobre cada variable creando su dummie         
for var in cat_vars:
    cat_list='var'+'_'+var
    cat_list = pd.get_dummies(data[var], prefix=var)
    data1=data.join(cat_list)
    data=data1

KeyError: 'gender'

In [36]:
# Descartamos las variables originales
data = data.drop(cat_vars, axis = 1)

In [37]:
# El target también los convertimos en una variable numérica dummie
data['target'] = np.where(data.Churn == 'Yes',1,0)

In [38]:
# Eliminamos la variable Target y el ID de cliente que no arroja información (realmente no tiene información?)
data = data.drop(['Churn', 'customerID'], axis = 1)

In [39]:
data.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,target
0,0,1,29.85,29.85,1,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,0
1,0,34,56.95,1889.5,0,1,1,0,1,0,...,0,1,0,1,0,0,0,0,1,0
2,0,2,53.85,108.15,0,1,1,0,1,0,...,1,0,0,0,1,0,0,0,1,1
3,0,45,42.3,1840.75,0,1,1,0,1,0,...,0,1,0,1,0,1,0,0,0,0
4,0,2,70.7,151.65,1,0,1,0,1,0,...,1,0,0,0,1,0,0,1,0,1


In [40]:
# Separamos la base en las columnas Independientes y la Dependiente (X e Y)
X, y = data.drop(data.columns[-1], axis=1), data.iloc[:,-1]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [42]:
print("Tamaño de Base:", data.shape)
print("Tamaño de Muestra de Entrenamiento:", X_train.shape)
print("Tamaño de Muestra de Testeo", X_test.shape)
print("Tamaño del Target de Entrenamiento:", y_train.shape)
print("Tamaño del Target de Testeo", y_test.shape)

Tamaño de Base: (7043, 46)
Tamaño de Muestra de Entrenamiento: (4930, 45)
Tamaño de Muestra de Testeo (2113, 45)
Tamaño del Target de Entrenamiento: (4930,)
Tamaño del Target de Testeo (2113,)


In [43]:
# Definimos los parámetros de nuestor modelo
logreg = LogisticRegression(solver = 'lbfgs', penalty='l2') # Otros Solvers: ‘liblinear’, ‘newton-cg’, ‘sag’, ‘saga’, ‘lbfgs’. Penalty:  ‘l1’ Lasso, ‘l2’ Ridge, ‘elasticnet’ or ‘none’

In [44]:
# Hacemos un Fit del modelo en nuestros datos
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [45]:
# Nos guardamos la predicción que hizo el modelo
y_pred = logreg.predict(X_test)

In [46]:
pd.crosstab(np.asarray(y_test), y_pred)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1393,146
1,260,314


In [47]:
# Calculo el Accuracy 
# ¿Qué porcentaje de predicciones fue correcta? (1389 + 328) / 2113 =
metrics.accuracy_score(np.asarray(y_test), y_pred)

0.8078561287269286

In [48]:
# Calculo el Error Medio Absoluto 
# ¿Qué porcentaje de predicciones fue incorrecta? (246 + 150) / 2113 =
metrics.mean_absolute_error(np.asarray(y_test), y_pred)

0.19214387127307148

In [49]:
# Calculo el Recall 
# ¿Qué porcentaje de casos positivos fueron capturados? 328 / (246 + 328) =
metrics.recall_score(np.asarray(y_test), y_pred)

0.5470383275261324

In [50]:
# Calculo de la Precisión
# ¿Qué porcentaje de predicciones positivos fueron correctas? 328 / (150 + 328) =
metrics.precision_score(np.asarray(y_test), y_pred)

0.6826086956521739

In [51]:
logreg.coef_

array([[ 0.1939444 , -0.06027303,  0.00221946,  0.0003013 , -0.02885728,
        -0.09561492, -0.04644741, -0.07802479,  0.05492177, -0.17939397,
         0.04236901, -0.16684121, -0.23617706,  0.04236901,  0.06933585,
        -0.29694688,  0.26748846, -0.09501378,  0.25633464, -0.09501378,
        -0.28579306,  0.09932161, -0.09501378, -0.12878004,  0.03006735,
        -0.09501378, -0.05952577,  0.22935023, -0.09501378, -0.25880865,
        -0.03722334, -0.09501378,  0.00776492, -0.11881868, -0.09501378,
         0.08936025,  0.27235243, -0.13665959, -0.26016504, -0.26773022,
         0.14325802, -0.08361837, -0.12568629,  0.28494228, -0.20010981]])