# Customer Churn Analysis
Churn analysis is analyzing historical customer data to make predicitons on customers' churn. In this analysis we can study the customer life time value.

In [224]:
import numpy as np
import pandas as pd
import seaborn as sns
import pandasgui as pdgui
import panel as pn
import hvplot.pandas
import matplotlib.pyplot as plt
from pandasgui import show
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, log_loss, accuracy_score
 
# importing machine learning models for prediction
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

In [89]:
df = pd.read_excel('./data/Telco_customer_churn.xlsx')
df.head(3)

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved


In [90]:
print('columns', df.columns)
print('\n\nshape of the data frame:\n', df.shape)

columns Index(['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code',
       'Lat Long', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen',
       'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
       'Multiple Lines', 'Internet Service', 'Online Security',
       'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
       'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
       'Monthly Charges', 'Total Charges', 'Churn Label', 'Churn Value',
       'Churn Score', 'CLTV', 'Churn Reason'],
      dtype='object')


shape of the data frame:
 (7043, 33)


In [None]:
df=df.convert_dtypes()

In [136]:
for col in df.columns:
    if len(df[col].unique()) ==1:
        print(col)
        print(df[col].unique())
        df.drop(col, axis=1, inplace=True)

Count
<IntegerArray>
[1]
Length: 1, dtype: Int64
Country
<StringArray>
['United States']
Length: 1, dtype: string
State
<StringArray>
['California']
Length: 1, dtype: string


In [164]:
for col in df.columns:
    if df[col].isna().sum() != 0:
        print('The percentage of nas in ', col,' column:\n', round(df[col].isna().sum()/len(df)*100,2),'%')

The percentage of nas in  Churn Reason  column:
 73.46 %


In [167]:
df['Churn Reason'] = df['Churn Reason'].fillna('Not Mentioned')
print(df['Churn Reason'].unique())

<StringArray>
[             'Competitor made better offer',
                                     'Moved',
             'Competitor had better devices',
 'Competitor offered higher download speeds',
              'Competitor offered more data',
                            'Price too high',
                   'Product dissatisfaction',
                   'Service dissatisfaction',
           'Lack of self-service on Website',
                       'Network reliability',
                 'Limited range of services',
  'Lack of affordable download/upload speed',
                     'Long distance charges',
                        'Extra data charges',
                                "Don't know",
          'Poor expertise of online support',
           'Poor expertise of phone support',
              'Attitude of service provider',
                'Attitude of support person',
                                  'Deceased',
                             'Not Mentioned']
Length: 21, dtype: s

In [139]:
print('shape of dataframe:\n\n', df.shape, '\n\n')
df.info()

shape of dataframe:

 (7043, 30) 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         7043 non-null   string 
 1   City               7043 non-null   string 
 2   Zip Code           7043 non-null   Int64  
 3   Lat Long           7043 non-null   string 
 4   Latitude           7043 non-null   Float64
 5   Longitude          7043 non-null   Float64
 6   Gender             7043 non-null   string 
 7   Senior Citizen     7043 non-null   string 
 8   Partner            7043 non-null   string 
 9   Dependents         7043 non-null   string 
 10  Tenure Months      7043 non-null   Int64  
 11  Phone Service      7043 non-null   string 
 12  Multiple Lines     7043 non-null   string 
 13  Internet Service   7043 non-null   string 
 14  Online Security    7043 non-null   string 
 15  Online Backup      7043 non-null   s

In [168]:
print('Percentage of Churn:\n\n', np.round(len(df[df['Churn Value']==1])/len(df)*100, 2),'%')

Percentage of Churn:

 26.54 %


In [169]:
numeric_df = df.select_dtypes(include=np.number)
object_df = df.select_dtypes(include=['object', 'bool', 'string'])

print('numeric dataframe shape:\n\n', numeric_df.shape)
print('objective dataframe size:\n\n', object_df.shape)

numeric dataframe shape:

 (7043, 8)
objective dataframe size:

 (7043, 22)


In [194]:
convert_dict = {'Total Charges': float}
numeric_df['Total Charges']=pd.to_numeric(object_df['Total Charges'], errors='coerce')
numeric_df = numeric_df['Total Charges'].fillna(0)
object_df.drop('Total Charges', axis=1, inplace=True)

In [197]:
object_df_to_numeric = object_df.apply(preprocessing.LabelEncoder().fit_transform)
object_df_to_numeric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   CustomerID         7043 non-null   int64
 1   City               7043 non-null   int64
 2   Lat Long           7043 non-null   int64
 3   Gender             7043 non-null   int64
 4   Senior Citizen     7043 non-null   int64
 5   Partner            7043 non-null   int64
 6   Dependents         7043 non-null   int64
 7   Phone Service      7043 non-null   int64
 8   Multiple Lines     7043 non-null   int64
 9   Internet Service   7043 non-null   int64
 10  Online Security    7043 non-null   int64
 11  Online Backup      7043 non-null   int64
 12  Device Protection  7043 non-null   int64
 13  Tech Support       7043 non-null   int64
 14  Streaming TV       7043 non-null   int64
 15  Streaming Movies   7043 non-null   int64
 16  Contract           7043 non-null   int64
 17  Paperless Bill

In [201]:
print(object_df['Churn Label'].value_counts(),'\n', object_df_to_numeric['Churn Label'].value_counts())

No     5174
Yes    1869
Name: Churn Label, dtype: Int64 
 0    5174
1    1869
Name: Churn Label, dtype: int64


In [196]:
df_final = pd.concat([object_df_to_numeric, numeric_df], axis=1)
df_final.shape

(7043, 22)

In [211]:
df_final.hvplot(x='City', y='Total Charges', kind='scatter', by='Churn Label')

In [203]:
df_final.head(4)

Unnamed: 0,CustomerID,City,Lat Long,Gender,Senior Citizen,Partner,Dependents,Phone Service,Multiple Lines,Internet Service,...,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Churn Label,Churn Reason,Total Charges
0,2564,562,327,1,0,0,0,1,0,0,...,0,0,0,0,0,1,3,1,3,108.15
1,6511,562,405,0,0,0,1,1,0,1,...,0,0,0,0,0,1,2,1,13,151.65
2,6551,562,393,0,0,0,1,1,2,1,...,2,0,2,2,0,1,2,1,13,820.5
3,5604,562,410,0,0,1,1,1,2,1,...,2,2,2,2,0,1,2,1,13,3046.05


In [205]:
x = df_final.loc[:, df_final.columns != 'Churn Label']
y = df_final.loc[:, df_final.columns == 'Churn Label']

In [227]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20)
# initializing all the model objects with default parameters
model_1=LogisticRegression(solver ='lbfgs', multi_class ='multinomial', max_iter = 200)
model_2=SVC(gamma ='auto', probability = True)
model_3=DecisionTreeClassifier()
model_4=XGBClassifier()
model_5=RandomForestClassifier()
 
# Making the final model using voting classifier
final_model = VotingClassifier(estimators=[('lr', model_1), ('svc', model_2), ('dtr', model_3), ('xgb', model_4), ('rfc', model_5)], 
                               voting='hard')

In [228]:
# training all the model on the train dataset
final_model.fit(X_train, y_train)
 
# predicting the output on the test dataset
y_pred = final_model.predict(X_test)
 
# printing log loss between actual and predicted value
print(log_loss(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
# using accuracy_score metric to predict accuracy
score = accuracy_score(y_test, y_pred)
print("Hard Voting Score % d" % score)

# Voting Classifier with soft voting
vot_soft = VotingClassifier(estimators = [('lr', model_1), ('svc', model_2), ('dtr', model_3), ('xgb', model_4), ('rfc', model_5)]
                            ,voting ='soft')
vot_soft.fit(X_train, y_train)
y_pred = vot_soft.predict(X_test)
  
# using accuracy_score
score = accuracy_score(y_test, y_pred)
print("Soft Voting Score % d" % score)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



0.05116203461904515
0.0014194464158978
Hard Voting Score  0



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Soft Voting Score  1


In [239]:
lst = list(y_test ['Churn Label']==y_pred)
print(lst.count(True))
print(lst.count(False))

1409
0
