In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from IPython.display import display

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [4]:
df = pd.read_parquet('../data/data_prep.parquet')

In [5]:
df_train = pd.read_parquet('../data/df_train.parquet')
df_val = pd.read_parquet('../data/df_val.parquet')
df_test = pd.read_parquet('../data/df_test.parquet')
df_full_train = pd.read_parquet('../data/df_full_train.parquet')

In [6]:
y_train = np.loadtxt('../data/y_train')
y_val = np.loadtxt('../data/y_val')
y_test = np.loadtxt('../data/y_test')

X_train = np.loadtxt('../data/X_train')
X_val = np.loadtxt('../data/X_val')

## Logistic Regression

In [8]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
model.intercept_[0]

-0.10905631244452796

In [13]:
model.coef_[0].round(3)

array([ 0.474, -0.175, -0.407, -0.029, -0.078,  0.063, -0.089, -0.082,
       -0.034, -0.073, -0.335,  0.317, -0.089,  0.004, -0.258,  0.141,
        0.009,  0.063, -0.089, -0.081,  0.266, -0.089, -0.285, -0.232,
        0.124, -0.166,  0.058, -0.087, -0.032,  0.071, -0.059,  0.141,
       -0.249,  0.216, -0.12 , -0.089,  0.102, -0.071, -0.089,  0.052,
        0.214, -0.089, -0.232, -0.07 ,  0.   ])

In [18]:
model.predict(X_train) # Hard Predictions

array([0., 1., 1., ..., 1., 0., 1.])

In [20]:
model.predict_proba(X_train) # Soft Predictions

array([[0.90447347, 0.09552653],
       [0.32042681, 0.67957319],
       [0.36609034, 0.63390966],
       ...,
       [0.46895613, 0.53104387],
       [0.95746256, 0.04253744],
       [0.30102664, 0.69897336]])

In [21]:
model.predict_proba(X_train)[:, 1] # Soft Predictions

array([0.09552653, 0.67957319, 0.63390966, ..., 0.53104387, 0.04253744,
       0.69897336])

In [22]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.00898196, 0.20435623, 0.21216057, ..., 0.13625449, 0.79988621,
       0.83752419])

In [23]:
churn_decision = (y_pred >= 0.5)

In [24]:
# Gente que probablemente (0.5 de threshold) va a churn

df_val[churn_decision].customerid

3       8433-wxgna
8       3440-jpscl
11      2637-fkfsy
12      7228-omtpn
19      6711-fldfb
           ...    
1397    5976-jcjrh
1398    2034-cgrhz
1399    5276-kqwhg
1407    6521-yytyi
1408    3049-solay
Name: customerid, Length: 312, dtype: object

## Accuracy of the model

In [25]:
churn_decision.astype('int')

array([0, 0, 0, ..., 0, 1, 1])

In [26]:
y_val

array([0., 0., 0., ..., 0., 1., 1.])

In [27]:
(y_val == churn_decision)

array([ True,  True,  True, ...,  True,  True,  True])

In [28]:
(y_val == churn_decision).mean()

0.8026969481902059

In [30]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = churn_decision.astype(int)
df_pred['actual'] = y_val
df_pred['correct_predictions'] = df_pred.prediction == df_pred.actual
df_pred

Unnamed: 0,probability,prediction,actual,correct_predictions
0,0.008982,0,0.0,True
1,0.204356,0,0.0,True
2,0.212161,0,0.0,True
3,0.543253,1,1.0,True
4,0.213772,0,0.0,True
...,...,...,...,...
1404,0.313342,0,0.0,True
1405,0.039313,0,1.0,False
1406,0.136254,0,0.0,True
1407,0.799886,1,1.0,True


In [31]:
df_pred.correct_predictions.mean()

0.8026969481902059

## Model Interpretation

In [33]:
numerical_variables = ['tenure', 'monthlycharges', 'totalcharges']
categorical_variables = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [34]:
train_dicts = df_train[categorical_variables + numerical_variables].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [35]:
# Take a look at the coefficients

dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=month-to-month': 0.474,
 'contract=one_year': -0.175,
 'contract=two_year': -0.407,
 'dependents=no': -0.029,
 'dependents=yes': -0.078,
 'deviceprotection=no': 0.063,
 'deviceprotection=no_internet_service': -0.089,
 'deviceprotection=yes': -0.082,
 'gender=female': -0.034,
 'gender=male': -0.073,
 'internetservice=dsl': -0.335,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.089,
 'monthlycharges': 0.004,
 'multiplelines=no': -0.258,
 'multiplelines=no_phone_service': 0.141,
 'multiplelines=yes': 0.009,
 'onlinebackup=no': 0.063,
 'onlinebackup=no_internet_service': -0.089,
 'onlinebackup=yes': -0.081,
 'onlinesecurity=no': 0.266,
 'onlinesecurity=no_internet_service': -0.089,
 'onlinesecurity=yes': -0.285,
 'paperlessbilling=no': -0.232,
 'paperlessbilling=yes': 0.124,
 'partner=no': -0.166,
 'partner=yes': 0.058,
 'paymentmethod=bank_transfer_(automatic)': -0.087,
 'paymentmethod=credit_card_(automatic)': -0.032,
 'paymentmethod=electronic_check': 0.071,

Create a smaller model

In [36]:
small = ['contract', 'tenure', 'monthlycharges']

df_train[small].iloc[:10].to_dict(orient='records')

[{'contract': 'two_year', 'tenure': 72, 'monthlycharges': 115.5},
 {'contract': 'month-to-month', 'tenure': 10, 'monthlycharges': 95.25},
 {'contract': 'month-to-month', 'tenure': 5, 'monthlycharges': 75.55},
 {'contract': 'month-to-month', 'tenure': 5, 'monthlycharges': 80.85},
 {'contract': 'two_year', 'tenure': 18, 'monthlycharges': 20.1},
 {'contract': 'month-to-month', 'tenure': 4, 'monthlycharges': 30.5},
 {'contract': 'month-to-month', 'tenure': 1, 'monthlycharges': 75.1},
 {'contract': 'month-to-month', 'tenure': 1, 'monthlycharges': 70.3},
 {'contract': 'two_year', 'tenure': 72, 'monthlycharges': 19.75},
 {'contract': 'month-to-month', 'tenure': 6, 'monthlycharges': 109.9}]

In [37]:
dicts_train_small = df_train[small].to_dict(orient='records')
dicts_val_small = df_val[small].to_dict(orient='records')

In [38]:
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

In [39]:
dv_small.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'monthlycharges', 'tenure'], dtype=object)

In [40]:
X_train_small = dv_small.transform(dicts_train_small)

In [41]:
model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)

In [50]:
w0 = model_small.intercept_[0]
w0.round(3)

-2.478

In [48]:
w = model_small.coef_[0]
w.round(3)

array([ 0.971, -0.024, -0.948,  0.027, -0.036])

In [49]:
dict(zip(dv_small.get_feature_names_out(), w.round(3)))

{'contract=month-to-month': 0.971,
 'contract=one_year': -0.024,
 'contract=two_year': -0.948,
 'monthlycharges': 0.027,
 'tenure': -0.036}

In [51]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [64]:
# Example

w0 + 1*0.97 + 50*0.027 + 5*(-0.036)

-0.33795759577928713

In [65]:
sigmoid(_) # _ toma el valor de la salida inmediatamente anterior 

0.41630568661975526

In [66]:
# Example

sigmoid(w0 + 1*0.97 + 60*0.027 + 1*(-0.036))

0.5190014456993783

## Using the model

First, train a new model

In [69]:
dicts_full_train = df_full_train[categorical_variables + numerical_variables].to_dict(orient='records')

In [70]:
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)

In [71]:
y_full_train = df_full_train.churn.values
y_full_train

array([0, 1, 0, ..., 1, 1, 0])

In [72]:
model = LogisticRegression()
model.fit(X_full_train, y_full_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [73]:
dicts_test = df_test[categorical_variables + numerical_variables].to_dict(orient='records')
X_test = dv.transform(dicts_test)

In [74]:
y_pred = model.predict_proba(X_test)[:, 1]

In [75]:
churn_decision = (y_pred >= 0.5)


In [76]:
(churn_decision == y_test).mean()

0.8133427963094393

Use the model

In [78]:
customer = dicts_test[10]
customer

{'gender': 'male',
 'seniorcitizen': 1,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'no',
 'onlinebackup': 'yes',
 'deviceprotection': 'no',
 'techsupport': 'no',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'mailed_check',
 'tenure': 32,
 'monthlycharges': 93.95,
 'totalcharges': 2861.45}

In [87]:
X_customer = dv.transform([customer])

In [82]:
X_customer.shape

(1, 45)

In [84]:
model.predict_proba(X_customer)[0,1]

0.4974002060398377

In [85]:
y_test[10]

0.0

In [86]:
customer_2 = dicts_test[-1]
customer_2

{'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'yes',
 'onlinebackup': 'no',
 'deviceprotection': 'yes',
 'techsupport': 'no',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'tenure': 17,
 'monthlycharges': 104.2,
 'totalcharges': 1743.5}

In [88]:
X_customer_2 = dv.transform([customer_2])

In [89]:
X_customer_2.shape

(1, 45)

In [90]:
model.predict_proba(X_customer_2)[0,1]

0.6606520440579291

In [91]:
y_test[-1]

1.0