## Import libraries

In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

## Loading and Overviewing of Dataset

In [2]:
data = pd.read_csv("dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv")

## 
df = data.copy()

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.tail().T

Unnamed: 0,7038,7039,7040,7041,7042
customerID,6840-RESVB,2234-XADUH,4801-JZAZL,8361-LTMKD,3186-AJIEK
gender,Male,Female,Female,Male,Male
SeniorCitizen,0,0,0,1,0
Partner,Yes,Yes,Yes,Yes,No
Dependents,Yes,Yes,Yes,No,No
tenure,24,72,11,4,66
PhoneService,Yes,Yes,No,Yes,Yes
MultipleLines,Yes,Yes,No phone service,Yes,No
InternetService,DSL,Fiber optic,DSL,Fiber optic,Fiber optic
OnlineSecurity,Yes,No,Yes,No,Yes


In [5]:
print(f'total number of rows: {df.shape[0]} => total number of columns: {df.shape[1]}')

total number of rows: 7043 => total number of columns: 21


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [8]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [9]:
np.unique(df['TotalCharges'])

array([' ', '100.2', '100.25', ..., '999.45', '999.8', '999.9'],
      dtype=object)

## Data Preprocessing - Step 1
- Normalize the column names
- Change the column type for TotalChanges

In [10]:
df.columns = df.columns.str.lower()
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [11]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce') 

In [12]:
df.isnull().sum()

customerid           0
gender               0
seniorcitizen        0
partner              0
dependents           0
tenure               0
phoneservice         0
multiplelines        0
internetservice      0
onlinesecurity       0
onlinebackup         0
deviceprotection     0
techsupport          0
streamingtv          0
streamingmovies      0
contract             0
paperlessbilling     0
paymentmethod        0
monthlycharges       0
totalcharges        11
churn                0
dtype: int64

In [13]:
df.totalcharges = df.totalcharges.fillna(0)

In [14]:
df.churn.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: churn, dtype: object

In [15]:
df.churn = (df.churn == 'Yes').astype(int)

In [16]:
df.churn.head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

## Exploratory Data Analysis

In [17]:
## target variable 
df.churn.value_counts()

churn
0    5174
1    1869
Name: count, dtype: int64

In [18]:
numeric_cols = df.select_dtypes(exclude=[object])

corr_matrix = numeric_cols.corr()

corr_matrix['churn']

seniorcitizen     0.150889
tenure           -0.352229
monthlycharges    0.193356
totalcharges     -0.198324
churn             1.000000
Name: churn, dtype: float64

## Data Proprocessing - Step 2

## Build a Validation Framework


In [19]:
## 
df_train_full , df_test = train_test_split(df, test_size=0.2, random_state=11) 
df_train, df_valid = train_test_split(df_train_full, test_size=0.25, random_state=11)


print(f'Training dataset: {len(df_train)}')
print(f'Validation dataset: {len(df_valid)}')
print(f'Test dataset: {len(df_test)}')

Training dataset: 4225
Validation dataset: 1409
Test dataset: 1409


In [20]:
y_train = df_train['churn'].values
y_valid = df_valid['churn'].values
y_test = df_test['churn'].values

In [21]:
del df_train['churn']
del df_valid['churn']
del df_test['churn']

## Feature Engineering 
- Dividing our data into numerical and categorical
- perform the one-hot encoding

In [22]:
numerical_features = ['tenure', 'monthlycharges', 'totalcharges', 'seniorcitizen']

categorical_features = ['gender', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 
                        'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
                        'contract', 'paperlessbilling', 'paymentmethod'] 


In [23]:
## convert the dataframe to into dict
train_dict = df_train[categorical_features + numerical_features].to_dict(orient='records')

valid_dict = df_valid[categorical_features + numerical_features].to_dict(orient='records')

In [24]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [25]:
X_train = dv.transform(train_dict)

X_valid = dv.transform(valid_dict)

## Training The Model

In [26]:
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

In [27]:
y_valid_pred = model.predict_proba(X_valid)

In [28]:
y_valid_pred

array([[0.90329729, 0.09670271],
       [0.80132077, 0.19867923],
       [0.87623834, 0.12376166],
       ...,
       [0.93054901, 0.06945099],
       [0.83259853, 0.16740147],
       [0.96890582, 0.03109418]])

 The predictions of the model: a two-column matrix. The first column contains 
the probability that the target is zero (the client won’t churn). The second column contains 
the opposite probability (the target is one, and the client will churn).

In [29]:
y_valid_pred = model.predict_proba(X_valid)[:, 1]

 This output (probabilities) is often called soft predictions. These tell us the probability of churning as a number between zero and one. It’s up to us to decide how to
interpret this number and how to use it.

 To make the actual decision about whether to send a promotional letter to our customers, using the probability alone is not enough. We need hard predictions — binary
values of True (churn, so send the mail) or False (not churn, so don’t send the mail).
 To get the binary predictions, we take the probabilities and cut them above a certain threshold.

In [30]:
churn = y_valid_pred >= 0.5

In [31]:
(y_valid == churn).mean()

0.8090844570617459

In [38]:
acc_score = accuracy_score(y_valid, churn)
print(f'Validation Accuracy Score: {round(acc_score * 100, 1)}%')

Validation Accuracy Score: 80.9%


## Saving The Model

In [39]:
import pickle

In [43]:
## specifyging where to save the file
with open('churn-model.bin', 'wb') as f_out:
    ## save the model
    pickle.dump((dv,model), f_out)

## Loading The Model 

In [44]:
with open('churn-model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [45]:
## a sample customer
customer = {
 'customerid': '8879-zkjof',
 'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'no',
 'dependents': 'no',
 'tenure': 41,
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'no',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'one_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'bank_transfer_(automatic)',
 'monthlycharges': 79.85,
 'totalcharges': 3320.75,
}

In [48]:
def predict_single(df, dv, model):
    X = dv.transform([customer])
    y_pred = model.predict_proba(X)[:,1]
    return y_pred[0]

In [49]:
prediction = predict_single(customer, dv, model)

In [50]:
print(f'{prediction}')

0.195617523610102


In [51]:
if prediction >= 0.5:
    print('verdict: Churn')
else:
    print('verdict: Not Churn')

verdict: Not Churn
