In [2]:
#import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
#load the Telco customer-churn CSV that the rest of the notebook works on
#allow up to 50 columns to be displayed before pandas truncates the view
pd.options.display.max_columns = 50
#read data
data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
#column names first, then the (rows, columns) shape
print(data.columns, data.shape, sep="\n")

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
(7043, 21)


In [4]:
#per-column dtypes, followed by a preview of the first five rows
print(data.dtypes)
data.head()

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [20]:
#creating the data for training purpose
def createTrainingData(data, cols=None):
    """Return a copy of ``data`` with categorical columns encoded as integer codes.

    For each column in ``cols`` the distinct values are sorted and every value
    is replaced by its position in that sorted order (e.g. "No" -> 0,
    "Yes" -> 1). ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``cols``.
    cols : list of str, optional
        Columns to encode. Defaults to the categorical columns of the Telco
        churn data set (including the ``Churn`` target).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the selected columns replaced by their codes.
        Missing values are excluded from the code table and stay NaN.

    Raises
    ------
    KeyError
        If a requested column is not present in ``data``.
    """
    if cols is None:
        cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                'PhoneService', 'MultipleLines', 'InternetService',
                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
                'PaymentMethod', 'Churn']
    modelData = data.copy()
    for col in cols:
        # NaN cannot be ordered against strings, so it is left out of the code table.
        categories = sorted(modelData[col].dropna().unique())
        codes = {value: code for code, value in enumerate(categories)}
        # Map every value in a single pass. Replacing one category at a time
        # can re-replace a freshly written code when the column is numeric
        # (e.g. values {-1, 0}: -1 -> 0, then every 0 -> 1).
        modelData[col] = modelData[col].map(codes)
    return modelData
            
#encode the categorical columns of the raw frame, then preview the first ten rows
modelData = createTrainingData(data)
modelData.head(n=10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,5575-GNVDE,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0
2,3668-QPYBK,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,7795-CFOCW,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0
4,9237-HQITU,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1
5,9305-CDSKC,0,0,0,0,8,1,2,1,0,0,2,0,2,2,0,1,2,99.65,820.5,1
6,1452-KIOVK,1,0,0,1,22,1,2,1,0,2,0,0,2,0,0,1,1,89.1,1949.4,0
7,6713-OKOMC,0,0,0,0,10,0,1,0,2,0,0,0,0,0,0,0,3,29.75,301.9,0
8,7892-POOKP,0,0,1,0,28,1,2,1,0,0,2,2,2,2,0,1,2,104.8,3046.05,1
9,6388-TABGU,1,0,0,1,62,1,0,0,2,2,0,0,0,0,1,0,0,56.15,3487.95,0


In [21]:
#converting the data into numeric form
def convertToNumeric(data):
    """Return a copy of ``data`` in which every column has been passed through
    ``pd.to_numeric``.

    Entries that cannot be parsed as numbers become NaN (``errors="coerce"``).
    The input frame is left unchanged.
    """
    numeric = data.copy()
    for name in numeric.columns:
        column = numeric[name]
        numeric[name] = pd.to_numeric(column, errors="coerce")
    return numeric
# Drop the identifier by name instead of with a boolean mask built from
# `data.columns`: that mask has one entry per column of `data`, so it no
# longer lines up with `modelData` once this cell has removed "customerID",
# and re-running the cell fails. errors="ignore" keeps the cell re-runnable.
modelData = convertToNumeric(modelData.drop(columns="customerID", errors="ignore"))
print(modelData.dtypes)

gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaperlessBilling      int64
PaymentMethod         int64
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object


In [26]:
#Creating model
# Turn infinities into NaN so they are imputed together with the other missing
# values. (The previous statement used `==` instead of an assignment, so it
# only evaluated a comparison and changed nothing.)
modelData = modelData.replace([np.inf, -np.inf], np.nan)
# Fill every missing value with the mean of its column.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split performed later in the notebook.
modelData = modelData.fillna(modelData.mean())
# Features are every column except the target; the target is "Churn".
x_orig = modelData.loc[:,modelData.columns!="Churn"]
y_orig = modelData["Churn"]

# Convert both to plain NumPy arrays.
x_orig,y_orig = np.asarray(x_orig),np.asarray(y_orig)

In [27]:
#hold out 20% of the rows for testing; the remaining 80% are used for training

from sklearn.model_selection import train_test_split

#random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    x_orig,
    y_orig,
    test_size=0.2,
    random_state=72,
)
print("Trainig data size: ", X_train.shape)
print("Test data size: ", X_test.shape)

Trainig data size:  (5634, 19)
Test data size:  (1409, 19)


In [32]:
from sklearn.linear_model import LogisticRegression
#fit a logistic regression with default settings on the training split
lr = LogisticRegression()
lr.fit(X_train, y_train)
#score() on the held-out split gives the mean accuracy
accuracy = lr.score(X_test, y_test)
print("Accuracy = {:.2f}%".format(accuracy * 100))

Accuracy = 79.99%
