In [None]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Load the Telco customer-churn CSV into a DataFrame and print a column/dtype summary
csv_path = r"../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
df.info()

In [None]:
# Convert 'TotalCharges' from object (string) type to float and fill missing values with the column mean.
from sklearn.impute import SimpleImputer

total_charges = df['TotalCharges']
if not pd.api.types.is_numeric_dtype(total_charges):
    # Blank / whitespace-only entries become NaN; every other entry must parse as a number
    # (pd.to_numeric raises on anything else, so bad data is not silently imputed).
    # The dtype guard keeps this cell re-runnable: `.str` is only used while the column is still text.
    total_charges = pd.to_numeric(total_charges.str.strip().replace('', np.nan))

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform returns an (n, 1) array; flatten it back to one value per row before assigning.
df['TotalCharges'] = imputer.fit_transform(
    total_charges.to_numpy(dtype=float).reshape(-1, 1)
).ravel()

### Exploratory Data Analysis

In [None]:
def autolabel(rects, total, ax=None):
    """Attach a text label above each bar in *rects*, displaying its percentage.

    Parameters
    ----------
    rects : iterable of bar patches
        Objects exposing ``get_height()``, ``get_x()`` and ``get_width()``
        (e.g. ``ax.patches`` of a bar/count plot).
    total : number
        Denominator of the percentage; each label is ``100 * height / total``.
    ax : axes-like object, optional
        Axes to annotate. When omitted, each bar is annotated on the axes it
        belongs to (``rect.axes``), so the function no longer depends on a
        global variable named ``ax``.

    Raises
    ------
    ValueError
        If *total* is zero (the percentage would be undefined).
    """
    if total == 0:
        raise ValueError("total must be non-zero to compute percentages")
    for rect in rects:
        height = rect.get_height()
        # Empty bars (NaN or zero height) have nothing to label.
        if not np.isfinite(height) or height == 0:
            continue
        target = ax if ax is not None else rect.axes
        percentage = '{:.1f}%'.format(100 * height / total)
        target.annotate(percentage,
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 0.5),
                        textcoords="offset points",
                        ha='center', va='bottom')
# Churn: class balance of the target variable
# Shared palettes reused by the later plotting cells
colors1 = ['#C03028', '#78C850']  # Fighting, Grass
colors2 = ['#6890F0', '#F08030', '#F8D030', '#A8A878']  # Water, Fire, Electric, Normal

# Number of customers; denominator for every percentage label drawn by autolabel
total = df.shape[0]

ax = sns.countplot(x='Churn', hue='Churn', data=df, palette=colors1)
ax.set_ylabel('Number of Occurrences')
autolabel(ax.patches, total)

26.5% of the customers left the company.

In [None]:
# Gender: churn counts per gender, plus the gender mix inside each churn class
f, axes = plt.subplots(1, 3, figsize=(15, 10))

# Left panel: countplot of gender split by churn
ax = sns.countplot(x='gender', hue='Churn', data=df, ax=axes[0], palette=colors1)
ax.set_xlabel("Gender")
ax.set_ylabel("Number of Occurrences")
autolabel(ax.patches, total)

# Middle/right panels: one pie per Churn column of the gender x Churn count table
group = df.groupby(['gender', 'Churn']).size().unstack()
pie_specs = [
    (axes[1], 0, 'NO CHURN', colors1[0]),
    (axes[2], 1, 'CHURN', colors1[1]),
]
for pie_ax, churn_pos, heading, heading_color in pie_specs:
    pie_ax.pie(group.iloc[:, churn_pos], labels=group.index, autopct='%1.1f%%', colors=colors2)
    pie_ax.legend(title='Gender', fontsize='x-small')
    pie_ax.set_title(heading, color=heading_color, weight='bold')

Churn rates for men and women are almost equal, so gender does not appear to be an important factor.

In [None]:
# OnlineSecurity-OnlineBackup-DeviceProtection-TechSupport-StreamingTV-StreamingMovies
# Count how many of the six online services each customer has and plot the counts by churn.
IVs = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# 'No internet service' is coded as 2 per column, so such a row sums to 6 * 2 = 12,
# which cannot collide with the 0..6 range of real service counts.
service_codes = {'No internet service': 2, 'No': 0, 'Yes': 1}

fig = plt.figure(figsize=(15, 10))
OnlineServices = df[IVs].replace(service_codes)
OnlineServicesSum = OnlineServices.sum(axis=1)

ax = sns.countplot(x=OnlineServicesSum, hue='Churn', data=df, palette=colors1)
plt.xlabel('Number of OnlineServices')
plt.ylabel("Number of Occurrences")
# Eight tick positions: counts 0..6 followed by the no-internet group
tick_labels = (0, 1, 2, 3, 4, 5, 6, 'No internet service')
plt.xticks(np.arange(len(tick_labels)), tick_labels)
autolabel(ax.patches, total)

In [None]:
# Distribution of the online-service count within each churn class (one pie per class).
# The crosstab gets its own name so OnlineServicesSum stays the per-customer Series
# and this cell can be re-run safely.
services_by_churn = (
    pd.crosstab(OnlineServicesSum, df.Churn)
    # 12 is the sum produced by customers without internet service (6 services coded as 2 each)
    .rename(index={12: 'No Internet service'})
)

f, axes = plt.subplots(1, 2, figsize=(15, 10))
colors3 = ['#6890F0', '#F08030', '#F8D030', '#98D8D8', '#A8A878', '#C03028', '#78C850', '#F85888']

pie_specs = [
    (axes[0], 'No', 'NO CHURN', colors1[0]),
    (axes[1], 'Yes', 'CHURN', colors1[1]),
]
for pie_ax, churn_value, heading, heading_color in pie_specs:
    pie_ax.pie(services_by_churn[churn_value], labels=services_by_churn.index,
               autopct='%1.1f%%', colors=colors3)
    # The slices are service counts, not genders, so the legend is titled accordingly.
    pie_ax.legend(title='Number of OnlineServices', fontsize='x-small')
    pie_ax.set_title(heading, color=heading_color, weight='bold')

There are 6 Online Services. Customers with 0 OnlineServices are denoted with '0'. The churning percentages are larger for customers with 0,1 and 2 OnlineServices. Also, customers with No internet service tend to stay in the company.

In [None]:
# One figure per online-service column in IVs: a churn-split countplot plus two pies
for service in IVs:
    f, axes = plt.subplots(1, 3, figsize=(10, 5))

    # Countplot
    ax = sns.countplot(x=service, hue='Churn', data=df, ax=axes[0], palette=colors1)
    ax.set_xlabel(service)
    ax.set_ylabel("Number of Occurrences")
    autolabel(ax.patches, total)

    # Pie charts: one per Churn column of the service x Churn count table
    group = df.groupby([service, 'Churn']).size().unstack()
    pie_specs = [
        (axes[1], 0, 'NO CHURN', colors1[0]),
        (axes[2], 1, 'CHURN', colors1[1]),
    ]
    for pie_ax, churn_pos, heading, heading_color in pie_specs:
        pie_ax.pie(group.iloc[:, churn_pos], labels=group.index, autopct='%1.1f%%', colors=colors2)
        pie_ax.legend(title=service, fontsize='xx-small', title_fontsize=6)
        pie_ax.set_title(heading, color=heading_color, weight='bold')

Customers without OnlineSecurity, OnlineBackup, DeviceProtection, or TechSupport are more likely to leave the company.
Among churning customers, those with and without StreamingMovies and StreamingTV account for similar percentages.

In [None]:
# SeniorCitizen-Partner-Dependents
# One figure per demographic column: a churn-split countplot plus two pies
IVs = ['SeniorCitizen', 'Partner', 'Dependents']
for feature in IVs:
    f, axes = plt.subplots(1, 3, figsize=(10, 5))

    # Countplot
    ax = sns.countplot(x=feature, hue='Churn', data=df, ax=axes[0], palette=colors1)
    ax.set_xlabel(feature)
    ax.set_ylabel("Number of Occurrences")
    autolabel(ax.patches, total)

    # Pie charts: one per Churn column of the feature x Churn count table
    group = df.groupby([feature, 'Churn']).size().unstack()
    pie_specs = [
        (axes[1], 0, 'NO CHURN', colors1[0]),
        (axes[2], 1, 'CHURN', colors1[1]),
    ]
    for pie_ax, churn_pos, heading, heading_color in pie_specs:
        pie_ax.pie(group.iloc[:, churn_pos], labels=group.index, autopct='%1.1f%%', colors=colors2)
        pie_ax.legend(title=feature, fontsize='x-small')
        pie_ax.set_title(heading, color=heading_color, weight='bold')

75.4% of the churning customers aren't senior citizens.
64.2% of the churning customers don't have partners.
82.6% of the churning customers don't have dependents.

In [None]:
# Contract-PaperlessBilling-PaymentMethod
# One figure per billing-related column: a churn-split countplot plus two pies
IVs = ['Contract', 'PaperlessBilling', 'PaymentMethod']
for feature in IVs:
    f, axes = plt.subplots(1, 3, figsize=(18, 10))

    # Countplot. The bar order is fixed explicitly (order of first appearance in the data)
    # so that any tick labels set below are guaranteed to line up with their bars.
    categories = list(df[feature].dropna().unique())
    ax = sns.countplot(x=feature, hue='Churn', data=df, ax=axes[0],
                       palette=colors1, order=categories)
    ax.set(xlabel=feature, ylabel="Number of Occurrences")
    if feature == 'PaymentMethod':
        # Shorten the long payment-method names. The labels are derived from the plotted
        # categories rather than hard-coded, so they cannot drift out of sync with the bars.
        short_labels = [name.replace(' (automatic)', '') for name in categories]
        ax.set_xticks(range(len(short_labels)))
        ax.set_xticklabels(short_labels, fontsize=7)
    autolabel(ax.patches, total)

    # Pie charts: one per Churn column of the feature x Churn count table
    group = df.groupby([feature, 'Churn']).size().unstack()
    pie_specs = [
        (axes[1], 0, 'NO CHURN', colors1[0]),
        (axes[2], 1, 'CHURN', colors1[1]),
    ]
    for pie_ax, churn_pos, heading, heading_color in pie_specs:
        pie_ax.pie(group.iloc[:, churn_pos], labels=group.index, autopct='%1.1f%%', colors=colors2)
        pie_ax.legend(title=feature, fontsize='x-small')
        pie_ax.set_title(heading, color=heading_color, weight='bold')

88.6% of the churning customers have a Month to Month Contract.
74.9% of the churning customers have Paperless Billing.
57.3% of the churning customers have Electronic check as a payment method.

In [None]:
# PhoneService-MultipleLines-InternetService
# One figure per connectivity column: a churn-split countplot plus two pies
IVs = ['PhoneService', 'MultipleLines', 'InternetService']
for feature in IVs:
    f, axes = plt.subplots(1, 3, figsize=(10, 5))

    # Countplot
    ax = sns.countplot(x=feature, hue='Churn', data=df, ax=axes[0], palette=colors1)
    ax.set_xlabel(feature)
    ax.set_ylabel("Number of Occurrences")
    autolabel(ax.patches, total)

    # Pie charts: one per Churn column of the feature x Churn count table
    group = df.groupby([feature, 'Churn']).size().unstack()
    pie_specs = [
        (axes[1], 0, 'NO CHURN', colors1[0]),
        (axes[2], 1, 'CHURN', colors1[1]),
    ]
    for pie_ax, churn_pos, heading, heading_color in pie_specs:
        pie_ax.pie(group.iloc[:, churn_pos], labels=group.index, autopct='%1.1f%%', colors=colors2)
        pie_ax.legend(title=feature, fontsize='x-small')
        pie_ax.set_title(heading, color=heading_color, weight='bold')

9/10 customers have a phone service.
Customers with and without MultipleLines have similar rates of churning and not churning.
Customers with Fiber Optic Internet Service tend to leave the company.

In [None]:
# Tenure: stacked bars of customer counts per tenure value, split by churn
fig = plt.figure(figsize=(15, 10))

# Rows: tenure values; columns: Churn values; missing combinations count as 0
T = df.groupby(['tenure', 'Churn']).size().unstack().fillna(0)
churned = T['Yes']
stayed = T['No']

# Churned customers form the bottom layer; retained customers are stacked on top
plt.bar(T.index, churned, color='#78C850')
plt.bar(T.index, stayed, color='#C03028', bottom=churned)
plt.legend(['Yes', 'No'], loc='upper right')
plt.xlabel('Tenure in months')
plt.ylabel("Number of Occurrences")

As tenure increases, customers tend to stay with the company. On the other hand, new customers with short tenures tend to leave the company.

In [None]:
# MonthlyCharges: stacked bars of customer counts per 1-unit charge bin, split by churn
plt.figure(figsize=(15, 10))

bin_width = 1
# Extend the range by one bin width so the last edge is at or above the maximum;
# otherwise the highest charges fall outside every bin and are dropped from the plot.
bins = np.arange(0, df['MonthlyCharges'].max() + bin_width, bin_width)
df['bins'] = pd.cut(df['MonthlyCharges'], bins)

# observed=False keeps empty bins as rows, so the table has exactly one row per bin
# and the index can be replaced by the bins' left edges below.
M = df.groupby(['bins', 'Churn'], observed=False).size().unstack().fillna(0)
bins = bins[:-1]  # left edge of each bin
M.index = bins.astype(int)

# Churned customers form the bottom layer; retained customers are stacked on top
plt.bar(M.index, M['Yes'], color='#78C850')
plt.bar(M.index, M['No'], color='#C03028', bottom=M['Yes'])
plt.legend(['Yes', 'No'], loc='upper right')
plt.ylabel("Number of Occurrences")
plt.xlabel('MonthlyCharges')

Customers with Monthly Charges between 70 and 110 tend to leave the company.

In [None]:
# TotalCharges: stacked bars of customer counts per 120-unit charge bin, split by churn
plt.figure(figsize=(15, 10))

bin_width = 120
# Extend the range by one bin width so the last edge is at or above the maximum;
# otherwise the highest charges fall outside every bin and are dropped from the plot.
bins = np.arange(0, df['TotalCharges'].max() + bin_width, bin_width)
df['bins'] = pd.cut(df['TotalCharges'], bins)

# observed=False keeps empty bins as rows, so the table has exactly one row per bin
# and the index can be replaced by the bin number below.
C = df.groupby(['bins', 'Churn'], observed=False).size().unstack().fillna(0)
bins = bins[:-1]  # left edge of each bin
C.index = bins.astype(int) / bin_width  # bin number: 0, 1, 2, ...

# Churned customers form the bottom layer; retained customers are stacked on top
plt.bar(C.index, C['Yes'], color='#78C850')
plt.bar(C.index, C['No'], color='#C03028', bottom=C['Yes'])
plt.legend(['Yes', 'No'], loc='upper right')
plt.ylabel("Number of Occurrences")
# The x axis shows the bin number, i.e. TotalCharges divided by the bin width (120, not 100)
plt.xlabel('TotalCharges / 120 (bin number)')

The number of churning customers decreases as the Total Charges increase.
Many new customers with small Total Charges left the company. 

### Data Preprocessing

In [None]:
# Split the dataset into categorical (binary, nominal) and numeric feature groups.
# Leaving out the features ['gender','Partner','StreamingMovies','StreamingTV'].
binary_cols = ['SeniorCitizen', 'Dependents', 'PhoneService', 'PaperlessBilling']
nominal_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                'DeviceProtection', 'TechSupport', 'Contract', 'PaymentMethod']
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

df_Cat_Bin = df[binary_cols]
df_Cat_Nom = df[nominal_cols]
df_Num = df[numeric_cols]

# Categorical output (target)
y = df['Churn']

In [None]:
# LABEL ENCODING - ONE HOT ENCODING
from sklearn.preprocessing import LabelEncoder

# Binary categorical features: label-encode each column independently
# (a fresh fit per column, because apply calls fit_transform once per column).
df_Cat_Bin_Ld = df_Cat_Bin.apply(LabelEncoder().fit_transform)

# Nominal categorical features: one-hot encode
df_Cat_Nom_OHEd = pd.get_dummies(df_Cat_Nom)

# Remove one dummy column from each one-hot encoded feature
dropped_dummies = [
    'InternetService_No',
    'OnlineSecurity_No internet service',
    'OnlineBackup_No internet service',
    'DeviceProtection_No internet service',
    'TechSupport_No internet service',
    'Contract_One year',
    'MultipleLines_No phone service',
    'PaymentMethod_Credit card (automatic)',
]
df_Cat_Nom_OHEd = df_Cat_Nom_OHEd.drop(columns=dropped_dummies)

# All categorical features
df_Cat = pd.concat([df_Cat_Bin_Ld, df_Cat_Nom_OHEd], axis=1)

# Categorical output encoding.
# map() gives a numeric result directly instead of relying on replace() silently
# downcasting an object column; unexpected labels are reported rather than passed through.
y_Ld = y.map({'No': 0, 'Yes': 1})
if y_Ld.isna().any():
    raise ValueError("Churn contains values other than 'No'/'Yes'")
y_Ld = y_Ld.astype(int)

In [None]:
# Correlation matrix of all encoded categorical and numeric features, shown as a heatmap
all_features = pd.concat([df_Cat, df_Num], axis=1)
corr = all_features.corr()

plt.figure(figsize=(10, 10), dpi=80)
feature_labels = corr.columns
sns.heatmap(corr, xticklabels=feature_labels, yticklabels=feature_labels)

In [None]:
# For the numeric IVs, calculate the variance inflation factor (VIF) and save it in a dataframe
from statsmodels.stats.outliers_influence import variance_inflation_factor

V = df_Num
# variance_inflation_factor uses the design matrix exactly as given, so an intercept
# column is prepended here; without it the auxiliary regressions have no constant term
# and the reported VIFs are distorted.
design = np.column_stack([np.ones(len(V)), V.values])

vif = pd.DataFrame()
# Column 0 of `design` is the intercept, so feature k sits at index k + 1.
vif["VIF Factor"] = [variance_inflation_factor(design, k + 1) for k in range(V.shape[1])]
vif["features"] = V.columns
print(vif)

# TotalCharges and tenure are strongly correlated. Remove TotalCharges.
# errors='ignore' keeps the cell re-runnable after the column has already been dropped.
df_Num = df_Num.drop(columns=['TotalCharges'], errors='ignore')

In [None]:
# All the selected IVs: numeric features first, then the encoded categorical ones
selected_frames = [df_Num, df_Cat]
X = pd.concat(selected_frames, axis=1)
# Keep the feature names; X is later replaced by a plain array during scaling
columns = X.columns

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split

# Fit the scaler on the training rows only, so test-set statistics do not leak into
# the features the models are trained on.
# NOTE: test_size / random_state must match the train/test split performed on X and y_Ld
# later in the notebook; with equal settings and an equal number of rows the shuffle
# selects the same rows, so these indices are exactly the future training rows.
X_values = np.asarray(X, dtype=float)
train_rows, _ = train_test_split(np.arange(len(X_values)), test_size=0.2, random_state=0)

scaler = StandardScaler().fit(X_values[train_rows])
X = scaler.transform(X_values)

In [None]:
# Split features and encoded target into training (80%) and test (20%) sets
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y_Ld, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Candidate classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Used GridSearchCV for parameter tuning
# Names[k] is the display name of the estimator stored in CF[k]
Names = ['Logistic Regression','SVM linear','Naive Bayes','kNN','Decision Tree','Random Forest']
CF = [
    LogisticRegression(solver='newton-cg'),
    # probability=True so that predict_proba is available for the ROC / CAP cells
    SVC(kernel='linear', random_state=0, probability=True),
    GaussianNB(),
    KNeighborsClassifier(n_neighbors=15, metric='minkowski', p=2),
    DecisionTreeClassifier(criterion='gini', random_state=0),
    RandomForestClassifier(n_estimators=100, max_leaf_nodes=35, min_samples_leaf=4,
                           criterion='gini', max_depth=7, random_state=0),
]

In [None]:
# Classification Metrics
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score

Classifiers = ['Logistic Regression','SVM linear','Naive Bayes','k-NN','Decision Tree','Random Forest']
Cols = ['Accuracy','Recall','Precision','f1 score','AUC ROC score']

# Collect one row of metrics per classifier and build the table in one go.
# This avoids chained, positional writes such as Scores.Accuracy[i] = ..., which pandas
# deprecates and which may modify a temporary copy instead of the table itself.
score_rows = []
for classifier in CF:
    classifier.fit(X_train, y_train)

    # Probability of the positive class, used for the ROC AUC
    c_probs = classifier.predict_proba(X_test)[:, 1]
    y_pred = classifier.predict(X_test)

    score_rows.append([
        accuracy_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        roc_auc_score(y_test, c_probs),
    ])

# Row order follows CF, so Classifiers[k] labels the scores of CF[k]
Scores = pd.DataFrame(score_rows, index=Classifiers, columns=Cols).astype('float')
print(Scores)

In [None]:
# Feature Importances (of the Random Forest model)
# Select the forest explicitly by name instead of relying on whichever estimator the
# variable `classifier` was last bound to in a previous cell.
forest = CF[Names.index('Random Forest')]
FImportances = pd.DataFrame(data=forest.feature_importances_, index=columns, columns=['Importance'])

bar_positions = range(FImportances.shape[0])
plt.barh(bar_positions, FImportances['Importance'], color='#78C850')
plt.yticks(bar_positions, FImportances.index)
plt.title('Feature Importances')
plt.show()

In [None]:
# ROC curves: one subplot per model, each compared against a constant (chance) predictor
from sklearn.metrics import roc_curve, roc_auc_score, auc

fig = plt.figure(figsize=(15,10))
fig.subplots_adjust(hspace=0.3, wspace=0.3)

for panel, (model_name, classifier) in enumerate(zip(Names, CF), start=1):
    plt.subplot(2, 3, panel)
    classifier.fit(X_train, y_train)

    # Baseline scores (all zeros) and the model's probability for the positive outcome
    r_probs = [0] * len(y_test)
    c_probs = classifier.predict_proba(X_test)[:, 1]

    # AUROC of the baseline and of the model
    r_auc = roc_auc_score(y_test, r_probs)
    c_auc = roc_auc_score(y_test, c_probs)

    # ROC curve points
    r_fpr, r_tpr, _ = roc_curve(y_test, r_probs)
    c_fpr, c_tpr, _ = roc_curve(y_test, c_probs)

    baseline_label = 'Random Prediction (AUROC = %0.3f)' % r_auc
    model_label = '%s (AUROC = %0.3f)' % (model_name, c_auc)
    plt.plot(r_fpr, r_tpr, linestyle='--', c='r', label=baseline_label)
    plt.plot(c_fpr, c_tpr, marker='.', c='b', label=model_label)

    plt.title('ROC Plot')
    plt.xlabel('False Positive Rate - 1 - Specificity')
    plt.ylabel('True Positive Rate - Sensitivity')
    plt.legend(fontsize='small')

In [None]:
# Cap Curve: Cumulative Accuracy Profile of every model, one subplot each
from sklearn.metrics import auc

fig = plt.figure(figsize=(15,10))
fig.subplots_adjust(hspace=0.3, wspace=0.3)

# Quantities that do not depend on the model are computed once.
# (n_obs is used instead of `total` so the customer count used by the EDA cells is not overwritten.)
y_true = np.asarray(y_test)
n_obs = len(y_true)
class_1_count = np.sum(y_true)
x_values = np.arange(0, n_obs + 1)

# Area under the random model
a = auc([0, n_obs], [0, class_1_count])
# Area between the perfect and the random model
aP = auc([0, class_1_count, n_obs], [0, class_1_count, class_1_count]) - a

for i in range(len(CF)):
    plt.subplot(2, 3, i+1)

    plt.plot([0, n_obs], [0, class_1_count], c = 'r', linestyle = '--', label = 'Random Model')
    plt.plot([0, class_1_count, n_obs],
             [0, class_1_count, class_1_count],
             c = 'grey', linewidth = 2, label = 'Perfect Model')

    classifier = CF[i]
    classifier.fit(X_train, y_train)

    # Keep probabilities for the positive outcome only
    c_probs = classifier.predict_proba(X_test)[:, 1]

    # Rank observations by predicted probability only. A stable sort keeps tied
    # probabilities in their test-set order; sorting (probability, label) pairs would
    # instead place positives first within ties and overstate the model's AR.
    ranking = np.argsort(-c_probs, kind='stable')
    y_values = np.append([0], np.cumsum(y_true[ranking]))

    # Area between the trained and the random model, relative to the perfect model
    aR = auc(x_values, y_values) - a
    AR = aR / aP

    plt.plot(x_values, y_values, c = 'b', label = '%s (AR = %0.3f)' % (Names[i],AR), linewidth = 4)

    # Plot information
    plt.xlabel('Total observations')
    plt.ylabel('Class 1 observations')
    plt.title('Cumulative Accuracy Profile')
    plt.legend(fontsize='small')