## Churn Prediction with Scikit-learn package

### Import packages and data

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from IPython.display import display

In [13]:
df= pd.read_csv('data.csv')

### Data Cleaning

In [14]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [17]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [21]:
df.columns = df.columns.str.lower().str.replace(' ','_')

In [26]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [30]:
df.totalcharges = pd.to_numeric(df.totalcharges,errors = 'coerce')

In [32]:
numericals = ['tenure', 'monthlycharges', 'totalcharges']
categorical = ['gender', 'partner', 'dependents',
       'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

In [33]:
for c in categorical:
    df[c] = df[c].str.lower().str.replace(' ','_')

In [35]:
df.isnull().sum()

customerid           0
gender               0
seniorcitizen        0
partner              0
dependents           0
tenure               0
phoneservice         0
multiplelines        0
internetservice      0
onlinesecurity       0
onlinebackup         0
deviceprotection     0
techsupport          0
streamingtv          0
streamingmovies      0
contract             0
paperlessbilling     0
paymentmethod        0
monthlycharges       0
totalcharges        11
churn                0
dtype: int64

In [37]:
df.totalcharges = df.totalcharges.fillna(0)

In [41]:
df.churn = (df.churn == 'Yes').astype(int)
df.churn.head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

### Exploratory Data Analysis

In [42]:
numericals = ['tenure', 'monthlycharges', 'totalcharges']
categorical = ['gender','seniorcitizen', 'partner', 'dependents',
       'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

In [43]:
df.churn.mean() ### this gives the percentage of people who have switched networks

np.float64(0.2653698707936959)

In [44]:
df[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

### Feature Importance Identification

#### Churn Rate and Risk Ratio

In [50]:
for c in categorical:
    print(c)
    df_group = df.groupby(c).churn.agg(['count','mean'])
    df_group['churn rate'] = df_group['mean'] - df.churn.mean()
    df_group['risk ratio'] = df_group['mean'] / df.churn.mean()
    display(df_group)
    print()

gender


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,3488,0.269209,0.003839,1.014466
male,3555,0.261603,-0.003766,0.985807



seniorcitizen


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5901,0.236062,-0.029308,0.889557
1,1142,0.416813,0.151443,1.570686



partner


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,3641,0.32958,0.06421,1.241964
yes,3402,0.196649,-0.068721,0.741038



dependents


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,4933,0.312791,0.047422,1.1787
yes,2110,0.154502,-0.110868,0.582215



phoneservice


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,682,0.249267,-0.016103,0.939319
yes,6361,0.267096,0.001726,1.006506



multiplelines


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,3390,0.250442,-0.014927,0.943749
no_phone_service,682,0.249267,-0.016103,0.939319
yes,2971,0.286099,0.020729,1.078114



internetservice


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,2421,0.189591,-0.075779,0.714441
fiber_optic,3096,0.418928,0.153558,1.578656
no,1526,0.07405,-0.19132,0.279044



onlinesecurity


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,3498,0.417667,0.152297,1.573906
no_internet_service,1526,0.07405,-0.19132,0.279044
yes,2019,0.146112,-0.119258,0.550597



onlinebackup


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,3088,0.399288,0.133918,1.504645
no_internet_service,1526,0.07405,-0.19132,0.279044
yes,2429,0.215315,-0.050055,0.811377



deviceprotection


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,3095,0.391276,0.125906,1.474456
no_internet_service,1526,0.07405,-0.19132,0.279044
yes,2422,0.225021,-0.040349,0.847951



techsupport


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,3473,0.416355,0.150985,1.56896
no_internet_service,1526,0.07405,-0.19132,0.279044
yes,2044,0.151663,-0.113706,0.571517



streamingtv


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,2810,0.335231,0.069861,1.263261
no_internet_service,1526,0.07405,-0.19132,0.279044
yes,2707,0.300702,0.035332,1.133143



streamingmovies


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,2785,0.336804,0.071434,1.269188
no_internet_service,1526,0.07405,-0.19132,0.279044
yes,2732,0.299414,0.034044,1.128291



contract


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,3875,0.427097,0.161727,1.60944
one_year,1473,0.112695,-0.152675,0.424672
two_year,1695,0.028319,-0.237051,0.106714



paperlessbilling


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,2872,0.163301,-0.102069,0.615371
yes,4171,0.335651,0.070281,1.264842



paymentmethod


Unnamed: 0_level_0,count,mean,churn rate,risk ratio
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),1544,0.167098,-0.098271,0.629681
credit_card_(automatic),1522,0.152431,-0.112939,0.57441
electronic_check,2365,0.452854,0.187484,1.706502
mailed_check,1612,0.191067,-0.074303,0.720003





#### Mutual Information

In [51]:
def mutual_info(column):
    return mutual_info_score(column,df.churn)
    

In [53]:
m_i = df[categorical].apply(mutual_info)
m_i.sort_values(ascending = False)

contract            0.098453
onlinesecurity      0.064677
techsupport         0.063021
internetservice     0.055574
onlinebackup        0.046792
paymentmethod       0.044519
deviceprotection    0.043917
streamingmovies     0.032001
streamingtv         0.031908
paperlessbilling    0.019194
dependents          0.014467
partner             0.011454
seniorcitizen       0.010577
multiplelines       0.000801
phoneservice        0.000072
gender              0.000037
dtype: float64

#### Correlation

In [55]:
df[numericals].corrwith(df.churn)

tenure           -0.352229
monthlycharges    0.193356
totalcharges     -0.198324
dtype: float64

### Split data into Train, Validation and Test sets

In [56]:
df_train_full,df_test = train_test_split(df,test_size = 0.2, random_state = 120)
df_train,df_val = train_test_split(df_train_full,test_size = 0.25, random_state =120) # test_size is 0.25 to get 20% of the full data

In [57]:
len(df_train),len(df_val),len(df_test)

(4225, 1409, 1409)

In [59]:
y_train = df_train.churn
y_val = df_val.churn
y_test = df_test.churn
del df_train['churn']
del df_val['churn']
del df_test['churn']

In [67]:
x_train = df_train.to_dict(orient = 'records')
x_val = df_val.to_dict(orient = 'records')
x_test = df_test.to_dict(orient = 'records')

### Encode Categorical Values

In [70]:
dv= DictVectorizer(sparse = False)

In [72]:
x_train = dv.fit_transform(x_train)
x_val = dv.transform(x_val)

### Logistic Regression

In [85]:
lr = LogisticRegression(random_state = 120)

In [86]:
lr.fit(x_train,y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [87]:
lr.intercept_[0]

np.float64(-0.12558670863212615)

In [92]:
lr.coef_[0].round(2)

array([ 0.55, -0.24, -0.43, ..., -0.21, -0.07,  0.  ], shape=(4270,))

In [94]:
train_pred = lr.predict_proba(x_train)[:,1]
churn_decision = (train_pred >= 0.5)
(y_train == churn_decision).mean()


np.float64(0.805207100591716)

In [95]:
val_pred = lr.predict_proba(x_val)[:,1]
churn_decision = (val_pred >= 0.5)
(y_val == churn_decision).mean()

np.float64(0.808374733853797)

##### The accuracy across training and validation is almost similar meaning the model generalizes well to unseen data. Now the model is ready for testing!!!

### Testing for generalizability in test data

In [97]:
y_full_train = df_train_full.churn
del df_train_full['churn']
x_full_train = df_train_full.to_dict(orient = 'records')
x_full_train = dv.fit_transform(x_full_train)
x_test = dv.transform(x_test)

In [98]:
model = LogisticRegression(random_state = 120)
model.fit(x_full_train,y_full_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [99]:
y_pred = model.predict_proba(x_test)[:,1]
y_pred = (y_pred >= 0.5)
accuracy = (y_test == y_pred).mean()
accuracy

np.float64(0.8034066713981547)

### The model is ready for deployment, since the model performs with the same accuracy range in the unseen test set!!!