# Telco Customer Churn Classification Review

Predict behavior to retain customer analyzing all relevant customer data to develop focused customer retention programs fotr telecomunication company.

In [2]:
# Data Handling
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer

# Machine Learning Modeling
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb
import lightgbm as lgb

# Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV

In [3]:
df = pd.read_csv('Dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Dataset

https://www.kaggle.com/datasets/blastchar/telco-customer-churn

Each row represents a customer, each column contains customer’s attributes described on the column Metadata.

The data set includes information about:

- Customers who left within the last month – the column is called Churn
- Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
- Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
- Demographic info about customers – gender, age range, and if they have partners and dependents

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Preprocessing

In [5]:
# view unique values of each column

for column in df.columns:
    print('Column: {} - Unique Values: {}'.format(column, df[column].unique()))

Column: customerID - Unique Values: ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
Column: gender - Unique Values: ['Female' 'Male']
Column: SeniorCitizen - Unique Values: [0 1]
Column: Partner - Unique Values: ['Yes' 'No']
Column: Dependents - Unique Values: ['No' 'Yes']
Column: tenure - Unique Values: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
Column: PhoneService - Unique Values: ['No' 'Yes']
Column: MultipleLines - Unique Values: ['No phone service' 'No' 'Yes']
Column: InternetService - Unique Values: ['DSL' 'Fiber optic' 'No']
Column: OnlineSecurity - Unique Values: ['No' 'Yes' 'No internet service']
Column: OnlineBackup - Unique Values: ['Yes' 'No' 'No internet service']
Column: DeviceProtection - Unique Values: ['No' 'Yes' 'No internet service']
Column: TechSuppor

In [6]:
# Convert the 'TotalCharges' column to numeric values
# Fill missing values with 0

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.fillna(0)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [8]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [9]:
# Drop Customer ID column

df.drop(columns='customerID', inplace=True)

In [10]:
# change seniorcitizen column dtype to object

df['SeniorCitizen'] = df['SeniorCitizen'].astype('object')

In [11]:
df['SeniorCitizen'].value_counts()

SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64

In [12]:
# Replace 0 values with 'No' and 1 values with 'Yes'

df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})

In [13]:
df['SeniorCitizen'].value_counts()

SeniorCitizen
No     5901
Yes    1142
Name: count, dtype: int64

In [14]:
# Convert the 'Churn' column to binary values, where 'No' is mapped to 0 and 'Yes' is mapped to 1.

labelEncoder = LabelEncoder()

df['Churn'] = labelEncoder.fit_transform(df['Churn'])

In [15]:
df['Churn'].value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [16]:
# Split the data into an 80-20 train-test split with a random state of “1”

X = df.drop(columns=['Churn'])
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [17]:
X_train.shape, y_train.shape

((5634, 19), (5634,))

In [18]:
X_test.shape, y_test.shape

((1409, 19), (1409,))

In [19]:
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 
               'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 
               'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
               'Contract', 'PaperlessBilling', 'PaymentMethod']

numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

## Feature engineering

### Standardization

In [20]:
# scale numerical values using StandardScaler
# convert the output back to a dataframe and put back the column names

scaler = StandardScaler()

Scaled_Train = scaler.fit_transform(X_train[numerical])
Scaled_Test = scaler.transform(X_test[numerical])

# Create new DataFrames with the scaled values
Scaled_Train_df = pd.DataFrame(Scaled_Train, columns=numerical)
Scaled_Test_df = pd.DataFrame(Scaled_Test, columns=numerical)

In [21]:
Scaled_Train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tenure          5634 non-null   float64
 1   MonthlyCharges  5634 non-null   float64
 2   TotalCharges    5634 non-null   float64
dtypes: float64(3)
memory usage: 132.2 KB


In [22]:
Scaled_Test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1409 entries, 0 to 1408
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tenure          1409 non-null   float64
 1   MonthlyCharges  1409 non-null   float64
 2   TotalCharges    1409 non-null   float64
dtypes: float64(3)
memory usage: 33.2 KB


### Encoding

In [23]:
# OneHotEncoder(set sparse_output to false) for categorical values
# convert the output back to a dataframe and put back the column names.

transformer = make_column_transformer(
    (OneHotEncoder(sparse_output=False), categorical)
)

Encoded_Train = transformer.fit_transform(X_train[categorical])
Encoded_Test = transformer.transform(X_test[categorical])

# Transforming back to a dataframe
Encoded_Train_df = pd.DataFrame(Encoded_Train, columns=transformer.get_feature_names_out())
Encoded_Test_df = pd.DataFrame(Encoded_Test, columns=transformer.get_feature_names_out())

In [24]:
Encoded_Train_df.columns = Encoded_Train_df.columns.str.replace('onehotencoder__', '')
Encoded_Test_df.columns = Encoded_Test_df.columns.str.replace('onehotencoder__', '')

### Merge Datasets

In [25]:
# Combine scaled numerical and one-hot encoded categorical features into train and test set dataframes (use pd.concat)

train_df = pd.concat([Scaled_Train_df, Encoded_Train_df], axis=1)
test_df = pd.concat([Scaled_Test_df, Encoded_Test_df], axis=1)

In [26]:
train_df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-0.825884,-1.49753,-0.890947,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.395961,0.302996,0.389693,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.577078,0.01232,1.060945,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1.577078,0.686687,1.775397,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.092777,0.186726,-0.102671,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [27]:
train_df.shape, y_train.shape

((5634, 46), (5634,))

In [28]:
test_df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.355233,0.500655,0.460383,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,1.373437,1.249767,1.850854,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
2,-0.825884,-0.657063,-0.77357,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,-1.110981,-0.471031,-0.894653,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.90734,0.037235,-0.713691,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [29]:
test_df.shape, y_test.shape

((1409, 46), (1409,))

## Modeling

### Random Forest Classifier

In [30]:
rf_classifier = RandomForestClassifier(random_state=1)
rf_classifier.fit(train_df, y_train)

### Extra Trees Classifier

In [31]:
et_classifier = ExtraTreesClassifier(random_state=1)
et_classifier.fit(train_df, y_train)

### XGBoost Classifier

In [32]:
xgb_classifier = xgb.XGBClassifier(random_state=1)
xgb_classifier.fit(train_df, y_train)

### LightGBM Classifier

In [33]:
lgb_classifier = lgb.LGBMClassifier(random_state=1)
lgb_classifier.fit(train_df, y_train)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000436 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


## Evaluation

In [34]:
model_names = ['Random Forest Classifier', 'Extra Trees Classifier', 'XGBoost Classifier', 'LightGBM Classifier']
models = [rf_classifier, et_classifier, xgb_classifier, lgb_classifier]
model_names_list = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Loop through each model to calculate metrics and store information
for name, model in zip(model_names, models):
    # Make predictions on the test data
    y_pred = model.predict(test_df)
    
    # Calculate accuracy, precision, recall, and F1-score
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Store model name and metrics
    model_names_list.append(name)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Create a DataFrame with the calculated metrics
metrics_df = pd.DataFrame({
    'Model': model_names_list,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1-Score': f1_scores
})

# Display the DataFrame
metrics_df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
3,LightGBM Classifier,0.803407,0.605341,0.586207,0.59562
2,XGBoost Classifier,0.793471,0.586103,0.557471,0.571429
0,Random Forest Classifier,0.791341,0.584906,0.534483,0.558559
1,Extra Trees Classifier,0.767211,0.532258,0.474138,0.50152


## Hyperparameter Tuning

Hyperparameter tuning for extra trees classifier

In [35]:
# Check current model parameters

current_params = et_classifier.get_params()
current_params

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [36]:
param_dist = {
    'n_estimators': [50, 100, 300, 500, 1000],
    'min_samples_split': [2, 3, 5, 7, 9],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

# Initialize RandomizedSearchCV
random_search_et = RandomizedSearchCV(estimator=et_classifier, param_distributions=param_dist, scoring='accuracy',
                                     cv=5, n_jobs=-1, random_state=1, n_iter=10, verbose = 1)

random_search_et.fit(train_df, y_train)

# best parameters
best_params = random_search_et.best_params_

best_params

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jmutonyi\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jmutonyi\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\jmutonyi\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\jmutonyi\AppData\Local\Programs\Python\Py

{'n_estimators': 1000,
 'min_samples_split': 9,
 'min_samples_leaf': 8,
 'max_features': 'sqrt'}

In [37]:
# Fit tuned model on train data
tuned_et_model = random_search_et.best_estimator_
tuned_et_model.fit(train_df, y_train)

In [38]:
model_names = ['Original Extra Trees Classifier', 'Tuned Extra Trees Classifier']
models = [et_classifier, tuned_et_model]
model_names_list = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Loop through each model to calculate metrics and store information
for name, model in zip(model_names, models):
    # Make predictions on the test data
    y_pred = model.predict(test_df)
    
    # Calculate accuracy, precision, recall, and F1-score
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Store model name and metrics
    model_names_list.append(name)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Create a DataFrame with the calculated metrics
metrics_df = pd.DataFrame({
    'Model': model_names_list,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1-Score': f1_scores
})

# Display the DataFrame
metrics_df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
1,Tuned Extra Trees Classifier,0.804116,0.620805,0.531609,0.572755
0,Original Extra Trees Classifier,0.767211,0.532258,0.474138,0.50152


### Feature Importance

In [39]:
feature_importance = tuned_et_model.feature_importances_

importance_df = pd.DataFrame({'Feature': train_df.columns, 'Importance': feature_importance})

importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df


Unnamed: 0,Feature,Importance
37,Contract_Month-to-month,0.152237
0,tenure,0.0928
19,OnlineSecurity_No,0.074998
17,InternetService_Fiber optic,0.065287
28,TechSupport_No,0.064141
39,Contract_Two year,0.054423
44,PaymentMethod_Electronic check,0.051666
2,TotalCharges,0.047714
16,InternetService_DSL,0.032687
22,OnlineBackup_No,0.030077
