## Customer Churn Prediction - Report

In [1]:
# All imports
import os
import zipfile

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

 ### 1. Business understanding

Customer churn is one of the most important business metrics, used by many companies to monitor their health. We need to keep the customer attrition rate under control, because the cost of winning a new customer is much higher than that of retaining an existing one. To clarify the terminology: churn is a synonym for losing a customer. In this report we will analyse the historical data to recognize patterns and correlations that are not clearly visible.

This is a supervised learning problem of the binary classification type: the data is labeled, and the outcome is either "churned" as of the date the data was collected, or "not churned". Therefore, we will build a model based on the given factors.

 ### 2. Data mining 

In [2]:
# Fetch the Telco churn dataset archive with the Kaggle CLI.
# NOTE(review): assumes the `kaggle` CLI is installed and authenticated — confirm in the environment setup.
!kaggle datasets download -d blastchar/telco-customer-churn

telco-customer-churn.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# Open the downloaded archive read-only and show which files it contains
zip_ref = zipfile.ZipFile('telco-customer-churn.zip', mode='r')
zip_ref.namelist()

['WA_Fn-UseC_-Telco-Customer-Churn.csv']

In [4]:
# Extract every member into the working directory, then close the archive
# so its file handle is not leaked for the rest of the kernel session.
# The archive is closed afterwards: re-run the cell that opens `zip_ref`
# before running this cell a second time.
with zip_ref:
    zip_ref.extractall()

In [5]:
# Print the list of files in the working directory.
# os.getcwdb() replaces the hard-coded absolute user path, so the notebook
# runs on any machine; it returns bytes, so the listing stays a list of
# bytes names as before.
os.listdir(os.getcwdb())

[b'.ipynb_checkpoints',
 b'Telco Customer Churn Prediction.ipynb',
 b'telco-customer-churn.zip',
 b'WA_Fn-UseC_-Telco-Customer-Churn.csv']

In [6]:
# Load the extracted CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [7]:
# Preview the first five rows to sanity-check the load
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [8]:
# (rows, columns) of the raw frame
data.shape

(7043, 21)

In [9]:
# Per-column dtypes as inferred by read_csv
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

### 3. Data cleaning

In [10]:
# Structural summary of the raw frame: size, column names,
# total number of null cells, and distinct-value count per column.
print(f"Rows     :  {data.shape[0]}")
print(f"Columns  :  {data.shape[1]}")
print(f"\nFeatures :  {data.columns.tolist()}")
print(f"\nMissing values :  {data.isnull().sum().values.sum()}")
print(f"\nUnique values :\n {data.nunique()}")

Rows     :  7043
Columns  :  21

Features :  ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Missing values :  0

Unique values :
 customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64


In [11]:
# Blank cells are stored as a single-space string (' '), which isnull()
# does not count as missing. Convert them to NaN across the whole frame in
# one vectorised call so they show up as missing values.
data = data.replace(' ', np.nan)

In [12]:
# Count null cells per column
data.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [13]:
# Only 11 of the 7043 rows (~0.15%) lack TotalCharges, so dropping them
# is simpler than imputing.
data = data.loc[data['TotalCharges'].notnull()]

In [14]:
# Confirm the row count after removing rows with missing TotalCharges
data.shape

(7032, 21)

In [15]:
# Re-check dtypes; TotalCharges is still object at this point
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [16]:
# Renumber rows 0..n-1 after the row filter. drop=True discards the old
# index directly instead of materialising it as an extra column and then
# selecting it away.
data = data.reset_index(drop=True)

In [33]:
# For every object-typed column, show its name and distinct values,
# followed by two blank lines as a separator.
object_columns = [name for name in data.columns if data[name].dtype == object]
for name in object_columns:
    print(name, data[name].unique(), '\n', sep='\n')

customerID
['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']


gender
['Female' 'Male']


Partner
['Yes' 'No']


Dependents
['No' 'Yes']


PhoneService
['No' 'Yes']


MultipleLines
['No phone service' 'No' 'Yes']


InternetService
['DSL' 'Fiber optic' 'No']


OnlineSecurity
['No' 'Yes' 'No internet service']


OnlineBackup
['Yes' 'No' 'No internet service']


DeviceProtection
['No' 'Yes' 'No internet service']


TechSupport
['No' 'Yes' 'No internet service']


StreamingTV
['No' 'Yes' 'No internet service']


StreamingMovies
['No' 'Yes' 'No internet service']


Contract
['Month-to-month' 'One year' 'Two year']


PaperlessBilling
['Yes' 'No']


PaymentMethod
['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']


Churn
['No' 'Yes']




In [17]:
# TotalCharges was read as text; cast the column to float
data['TotalCharges'] = data['TotalCharges'].astype(float)

In [18]:
# Summary statistics for the numeric columns
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7032.0,7032.0,7032.0,7032.0
mean,0.1624,32.421786,64.798208,2283.300441
std,0.368844,24.54526,30.085974,2266.771362
min,0.0,1.0,18.25,18.8
25%,0.0,9.0,35.5875,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.8625,3794.7375
max,1.0,72.0,118.75,8684.8


In [19]:
# Verify that TotalCharges is now float64
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

### 4. Data exploration

### 5. Feature engineering

### 6. Predictive modeling

### 7. Data visualization

### 8. Productionization