### Importing Libraries & Packages

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # For creating plots
import matplotlib.ticker as mtick # For specifying the axes tick format 
import matplotlib.pyplot as plt

# Apply seaborn's 'white' style to every plot drawn after this point.
sns.set(style = 'white')

# Reading the file

In [None]:
# Load the customer-churn CSV; the relative path means the file must sit in the
# notebook's working directory.
telecomdata = pd.read_csv('Telco-Customer-Churn.csv')

Inspecting values

In [None]:
# Preview the first rows of the raw data.
telecomdata.head()

In [None]:
# List the column names as an array.
telecomdata.columns.values

In [None]:
# (number of rows, number of columns) of the raw data.
telecomdata.shape

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the data.
telecomdata.describe()

In [None]:
# Per-column dtype and non-null count.
telecomdata.info()

Checking the column data types before looking for missing values

In [None]:
# Data type of each column.
telecomdata.dtypes

Converting Total Charges to a numerical data type.

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
telecomdata['TotalCharges'] = pd.to_numeric(telecomdata['TotalCharges'], errors='coerce')
# Count the missing values in each column.
telecomdata.isna().sum()

Removing missing values

In [None]:
# Drop every row that contains a missing value. This is done in place so that
# all later cells reading `telecomdata` see the cleaned frame.
telecomdata.dropna(inplace = True)

# Modelling frame: all columns except the first one (positional slice;
# presumably the first column is the customer identifier - confirm).
# Take an explicit copy so that later edits to df1 neither write through to
# telecomdata nor trigger chained-assignment warnings.
df1 = telecomdata.iloc[:, 1:].copy()


In [None]:
# Convert the target variable into a binary numeric variable (Yes -> 1, No -> 0).
# The recoded column is assigned back to df1 rather than calling
# replace(..., inplace=True) on df1['Churn']: that chained in-place call works on
# a temporary Series and is not guaranteed to update df1.
# Values that are not 'Yes'/'No' pass through unchanged, so re-running this cell
# on already-encoded data is harmless, and any other label fails loudly in astype(int).
churn_codes = {'Yes': 1, 'No': 0}
df1 = df1.assign(
    Churn=df1['Churn'].map(lambda label: churn_codes.get(label, label)).astype(int)
)

# Convert the categorical variables into dummy variables.
df_dummy = pd.get_dummies(df1)
df_dummy.head()

In [None]:
# Correlation of every dummy-encoded column with "Churn", largest value first.
churn_corr = df_dummy.corr()['Churn'].sort_values(ascending=False)
plt.figure(figsize=(15, 8))
churn_corr.plot(kind='bar')

*Absence of online security and tech support, and month-to-month contracts, seem to be positively correlated with churn, while tenure and two-year contracts seem to be negatively correlated with churn.

*Services such as streaming TV, online security, online backup, tech support, etc. without an internet connection are negatively related to churn.

### Data Exploration


Look at the distribution of individual variables and then slice and dice the data for any interesting trends.


### A) Demographics 



*Understand the gender, age range, partner and dependent status of the customers.

Note- Gender Distribution - About half of the customers in our data set are female while the other half are male.

In [None]:
# Share of customers per gender, in percent.
colors = ['#4D3425','#E4512B']
gender_pct = telecomdata['gender'].value_counts() * 100.0 / len(telecomdata)

ax = gender_pct.plot(kind='bar', rot=0, color=colors)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_xlabel('Gender')
ax.set_ylabel('% Customers')
ax.set_title('Gender Distribution')

# Label each bar with its own height, which already is the percentage.
# (Dividing by the summed bar widths, as was done before, only gave the right
# number because the default widths happen to add up to 1.)
for bar in ax.patches:
    pct = bar.get_height()
    # x offset nudges the text into the bar; y offset places it just below the top.
    ax.text(bar.get_x() + .15, pct - 3.5,
            '{:.1f}%'.format(pct),
            fontsize=12,
            color='white',
            weight='bold')

 Senior Citizens

*Only 16% of the customers are senior citizens, so most of the customers in the data are younger people.


In [None]:
# Share of senior and non-senior customers, in percent.
senior_pct = telecomdata['SeniorCitizen'].value_counts() * 100.0 / len(telecomdata)

# Name each slice after its own index value instead of passing a fixed
# ['No', 'Yes'] list, which would be wrong if the counts came back in the other
# order. Assumes SeniorCitizen is coded 0/1 (confirm); any other value simply
# keeps its original label.
senior_pct = senior_pct.rename(index={0: 'No', 1: 'Yes'})

ax = senior_pct.plot.pie(autopct='%.1f%%', figsize=(5, 5), fontsize=12)
ax.set_ylabel('Senior Citizens', fontsize=12)
ax.set_title('% of Senior Citizens', fontsize=12)

Partner and dependent status 

*About 50% of the customers have a partner, while only about 30% of all customers have dependents.


In [None]:
# Long format: one row per customer and per status variable (Dependents / Partner).
# Local names are used here so that df1, the modelling frame, is not overwritten.
status_long = pd.melt(telecomdata, id_vars=['customerID'], value_vars=['Dependents','Partner'])
status_counts = status_long.groupby(['variable','value']).count().unstack()
# Turn the counts into percentages of all customers.
status_pct = status_counts * 100 / len(telecomdata)

colors = ['#4D3425','#E4512B']
ax = status_pct.loc[:, 'customerID'].plot.bar(stacked=True, color=colors,
                                              figsize=(8, 6), rot=0,
                                              width=0.2)

ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_ylabel('% Customers', size=14)
ax.set_xlabel('')
ax.set_title('% Customers with dependents and partners', size=14)
ax.legend(loc='center', prop={'size': 14})

# Write each segment's percentage inside the segment.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate('{:.0f}%'.format(height),
                (bar.get_x() + .25 * bar.get_width(), bar.get_y() + .4 * height),
                color='white',
                weight='bold',
                size=14)

Check the % of customers, who have partners and also have dependents.



*Among customers who have partners, only about half also have dependents, while the other half do not.

*Among customers who don't have partners, a majority (80%) of them do not have dependents.



In [None]:
colors = ['#4D3425','#E4512B']
# Customer counts for every (Partner, Dependents) combination.
partner_dependent = telecomdata.groupby(['Partner','Dependents']).size().unstack()

# Row-normalise so that each Partner group adds up to 100%.
partner_dependent_pct = partner_dependent.mul(100.0).div(partner_dependent.sum(axis=1), axis=0)

ax = partner_dependent_pct.plot(kind='bar',
                                width=0.2,
                                stacked=True,
                                rot=0,
                                figsize=(8, 6),
                                color=colors)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend(loc='center', prop={'size': 14}, title='Dependents')
ax.set_ylabel('% Customers', size=14)
ax.set_title('% Customers with/without dependents based on whether they have a partner', size=14)
ax.xaxis.label.set_size(14)

# Add the data labels on the stacked bar chart. Only the values that are needed
# are read from each patch, so no notebook-level x / y names are overwritten.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate('{:.0f}%'.format(height),
                (bar.get_x() + .25 * bar.get_width(), bar.get_y() + .4 * height),
                color='white',
                weight='bold',
                size=14)

*We also looked at the differences between the % of customers with/without dependents and partners by gender.

*There is no difference in their distribution by gender, and there is no difference in senior citizen status by gender either.

### B) Customer Account Information: tenure, contract


1. Tenure-

The histogram below shows that a lot of customers have been with the telecom company for just a month,
while many others have been there for about 72 months.
However, different customers have different contracts.
Hence, it could be easier or harder for customers to stay with or leave the telecom company depending on the contract they are on.



In [None]:
# Histogram of customer tenure, drawn with matplotlib directly instead of the
# deprecated sns.distplot (which was only used for its histogram here).
fig, ax = plt.subplots()
ax.hist(telecomdata['tenure'],
        bins=int(180/5),      # same bin count as before (36)
        color='darkblue',
        edgecolor='black',
        alpha=0.4)            # keeps the semi-transparent bars distplot drew by default
ax.set_ylabel('# of Customers')
ax.set_xlabel('Tenure (months)')
ax.set_title('# of Customers by their tenure')

2. Contracts: 
    
To understand the above graph, look at the customers by different contracts.

In [None]:
# Number of customers on each contract type, as a bar chart.
contract_counts = telecomdata['Contract'].value_counts()
ax = contract_counts.plot.bar(rot=0, width=0.3)
ax.set_ylabel('# of Customers')
ax.set_title('# of Customers by Contract Type')

Observations- Most of the customers are on a month-to-month contract,
while roughly equal numbers of customers are on the 1-year and 2-year contracts.

Below is the tenure of customers based on their contract type.



In [None]:
# One tenure histogram per contract type, side by side with a shared y axis.
# The three panels are drawn by a single loop (instead of three copy-pasted
# sns.distplot calls, which is a deprecated function) so that they are
# guaranteed to be styled consistently.
contract_panels = [
    # (value in the Contract column, panel title, bar colour)
    ('Month-to-month', 'Month to Month Contract', 'turquoise'),
    ('One year', 'One Year Contract', 'steelblue'),
    ('Two year', 'Two Year Contract', 'darkblue'),
]

fig, axes = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(20, 6))

for ax, (contract, title, color) in zip(axes, contract_panels):
    tenure = telecomdata.loc[telecomdata['Contract'] == contract, 'tenure']
    ax.hist(tenure,
            bins=int(180/5),      # same bin count as before (36)
            color=color,
            edgecolor='black',
            alpha=0.4)            # keeps the semi-transparent bars distplot drew by default
    ax.set_xlabel('Tenure (months)')
    ax.set_title(title)

# The y axis is shared, so it is labelled once, on the left-most panel.
axes[0].set_ylabel('# of Customers')

Observations- Most of the month-to-month contracts last for 1-2 months,
whereas the 2-year contracts tend to last for about 70 months.
This shows that customers with longer contracts are more loyal to the company and tend to stay longer.
This is consistent with the earlier chart of correlations with churn.



### C) Distribution of various services used by customers

In [None]:
# Column names again, as a reference for choosing the service columns.
telecomdata.columns.values


In [None]:
# Service columns to chart, one bar chart of value counts per service.
services = [
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))

# The 3x3 grid is filled column by column: services 0-2 go down the first
# column, 3-5 down the second and 6-8 down the third.
for i, item in enumerate(services):
    col, row = divmod(i, 3)
    ax = telecomdata[item].value_counts().plot(kind='bar', ax=axes[row, col], rot=0)
    ax.set_title(item)

### D) Relation between monthly & total charges


The assumption is that the total charges increase as the monthly bill for a customer increases.



In [None]:
# Scatter plot of total charges against monthly charges.
charges = telecomdata[['MonthlyCharges', 'TotalCharges']]
charges.plot.scatter(x='MonthlyCharges', y='TotalCharges')

### E) Predictor variable (Churn) with other variables as found in correlation plot


Churn rate in our data


In [None]:
colors = ['#4D3425','#E4512B']
ax = (telecomdata['Churn'].value_counts()*100.0 /len(telecomdata)).plot(kind='bar',
                                                                           stacked = True,
                                                                          rot = 0,
                                                                          color = colors,
                                                                         figsize = (8,6))
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_ylabel('% Customers',size = 14)
ax.set_xlabel('Churn',size = 14)
ax.set_title('Churn Rate', size = 14)

# create a list to collect the plt.patches data
totals = []

# find the values and append to list
for i in ax.patches:
    totals.append(i.get_width())
# set individual bar lables using above list
total = sum(totals)

for i in ax.patches:
    # get_width pulls left or right; get_y pushes up or down
    ax.text(i.get_x()+.15, i.get_height()-4.0, \
            str(round((i.get_height()/total), 1))+'%',
            fontsize=12,
            color='white',
           weight = 'bold',
           size = 14)

Observations-
Based on the given data, 74% of the customers do not churn.
Clearly, the data is skewed, as a large majority of the customers do not churn.
This needs to be kept in mind for our modelling, as skewness could lead to a lot of false negatives.



To check the churn rate by tenure, seniority, contract type, monthly charges and total charges

i.) Churn by Tenure: 
    
    As seen below, customers who don't churn, tend to stay for longer tenure

In [None]:
# Box plot of tenure for each Churn value.
sns.boxplot(x='Churn', y='tenure', data=telecomdata)


ii.) Churn by Contract Type: 
    
    Customers who have a month to month contract have a very high churn rate.



In [None]:
colors = ['#4D3425','#E4512B']
# Customer counts for every (Contract, Churn) combination.
contracts_churn = telecomdata.groupby(['Contract','Churn']).size().unstack()

# Row-normalise so that each contract type adds up to 100%.
contracts_churn_pct = contracts_churn.mul(100.0).div(contracts_churn.sum(axis=1), axis=0)

ax = contracts_churn_pct.plot(kind='bar',
                              width=0.3,
                              stacked=True,
                              rot=0,
                              figsize=(10, 6),
                              color=colors)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend(loc='best', prop={'size': 14}, title='Churn')
ax.set_ylabel('% Customers', size=14)
ax.set_title('Churn by Contract Type', size=14)

# Add the data labels on the stacked bar chart. Only the values that are needed
# are read from each patch, so no notebook-level x / y names are overwritten.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate('{:.0f}%'.format(height),
                (bar.get_x() + .25 * bar.get_width(), bar.get_y() + .4 * height),
                color='white',
                weight='bold',
                size=14)


iii.) Churn by Seniority: 
    
    Senior citizens have double the churn rate of the younger population.



In [None]:
colors = ['#4D3425','#E4512B']
# Customer counts for every (SeniorCitizen, Churn) combination.
senior_churn = telecomdata.groupby(['SeniorCitizen','Churn']).size().unstack()

# Row-normalise so that each seniority group adds up to 100%.
senior_churn_pct = senior_churn.mul(100.0).div(senior_churn.sum(axis=1), axis=0)

ax = senior_churn_pct.plot(kind='bar',
                           width=0.2,
                           stacked=True,
                           rot=0,
                           figsize=(8, 6),
                           color=colors)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend(loc='center', prop={'size': 14}, title='Churn')
ax.set_ylabel('% Customers')
ax.set_title('Churn by Seniority Level', size=14)

# Add the data labels on the stacked bar chart. Only the values that are needed
# are read from each patch, so no notebook-level x / y names are overwritten.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate('{:.0f}%'.format(height),
                (bar.get_x() + .25 * bar.get_width(), bar.get_y() + .4 * height),
                color='white',
                weight='bold',
                size=14)

iv.) Churn by Monthly Charges: 
    
    Higher % of customers churn when there are high monthly charges.

In [None]:
# Overlay the monthly-charge densities of customers with Churn == 'No' (red)
# and Churn == 'Yes' (blue) on the same axes.
stayed = telecomdata["Churn"] == 'No'
churned = telecomdata["Churn"] == 'Yes'

ax = sns.kdeplot(telecomdata.loc[stayed, 'MonthlyCharges'], color="Red", shade=True)
ax = sns.kdeplot(telecomdata.loc[churned, 'MonthlyCharges'], ax=ax, color="Blue", shade=True)

# Legend entries follow the drawing order above.
ax.legend(["Not Churn","Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Monthly Charges')
ax.set_title('Distribution of monthly charges by churn')

v.) Churn by Total Charges: 
    
    There is higher churn when there are lower total charges.

In [None]:
# Overlay the total-charge densities of customers with Churn == 'No' (red)
# and Churn == 'Yes' (blue) on the same axes.
stayed = telecomdata["Churn"] == 'No'
churned = telecomdata["Churn"] == 'Yes'

ax = sns.kdeplot(telecomdata.loc[stayed, 'TotalCharges'], color="Red", shade=True)
ax = sns.kdeplot(telecomdata.loc[churned, 'TotalCharges'], ax=ax, color="Blue", shade=True)

# Legend entries follow the drawing order above.
ax.legend(["Not Churn","Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Total Charges')
ax.set_title('Distribution of total charges by churn')

### To develop predictive models and compare them.
Using-- Logistic Regression, Random Forest

1. Logistic Regression

In [None]:
# We will use the data frame where we had created dummy variables
y = df_dummy['Churn'].values
X = df_dummy.drop(columns = ['Churn'])

# Scaling all the variables to a range of 0 to 1
from sklearn.preprocessing import MinMaxScaler
features = X.columns.values
scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))
X.columns = features

In [None]:
# Note: the variables are scaled to the 0-1 range before the logistic regression is fitted.
# This helps improve the accuracy from 79.7% to 80.7%.
# The importance of the variables is also aligned with what is seen in the
# Random Forest algorithm and in the EDA done above.

In [None]:
# Create Train & Test Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [None]:
# Run logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
result = model.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Predict churn for the held-out test rows.
predictiontest = model.predict(X_test)
# Print prediction accuracy. The score must be computed from `predictiontest`,
# the variable assigned above (the previous spelling `prediction_test` was
# never defined).
print(metrics.accuracy_score(y_test, predictiontest))

In [None]:
# Getting weights of all variables
weights = pd.Series(model.coef_[0],
                 index=X.columns.values)
print (weights.sort_values(ascending = False)[:10].plot(kind='bar'))

In [None]:
# Bar chart of the ten smallest (most negative) coefficients. The plot call is
# left as the cell's last expression; wrapping it in print() only printed the
# Axes object.
weights.sort_values(ascending=False).tail(10).plot(kind='bar')

Observations

* Some variables have a negative relation to our predicted variable (Churn), while some have a positive relation.
* A negative relation means that the likelihood of churn decreases as that variable increases.

* As seen in the EDA, having a two-year contract reduces the chances of churn.
* A two-year contract, along with tenure, has a negative relation with churn as predicted by the logistic regression.

* Having DSL internet service reduces the probability of churn.
* Fibre optic internet service, total charges, monthly contracts and seniority can lead to higher churn rates.
* Customers with the faster fibre optic service are more likely to churn.

2. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fresh 80/20 split for the random forest; this replaces the X_train / X_test /
# y_train / y_test created for the logistic regression.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# max_features="sqrt" is what "auto" meant for a classifier; the "auto" alias
# has been deprecated and removed from scikit-learn, so the value is spelled out.
model_rf = RandomForestClassifier(n_estimators=1000, oob_score=True, n_jobs=-1,
                                  random_state=50, max_features="sqrt",
                                  max_leaf_nodes=30)
model_rf.fit(X_train, y_train)

# Make predictions
predictiontest = model_rf.predict(X_test)
print (metrics.accuracy_score(y_test, predictiontest))

In [None]:
# Rank the features by the forest's importance scores and chart the ten largest
# as horizontal bars.
importances = model_rf.feature_importances_
weights = pd.Series(data=importances, index=X.columns.values)
weights.sort_values().tail(10).plot.barh()

Observations:

* According to the random forest algorithm, monthly contract, tenure and total charges are the most important variables for predicting churn.
* The results from the random forest are consistent with those of the logistic regression and with what was expected from the EDA.