In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [3]:
# importing tools for Feature engineering, scale numerical features and one-hot encoding categorical features
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
# importing tools to build, train, test, score the model 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

In [5]:
raw_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

raw_data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [6]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
raw_data.head(11).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU,9305-CDSKC,1452-KIOVK,6713-OKOMC,7892-POOKP,6388-TABGU,9763-GRSKD
gender,Female,Male,Male,Male,Female,Female,Male,Female,Female,Male,Male
SeniorCitizen,0,0,0,0,0,0,0,0,0,0,0
Partner,Yes,No,No,No,No,No,No,No,Yes,No,Yes
Dependents,No,No,No,No,No,No,Yes,No,No,Yes,Yes
tenure,1,34,2,45,2,8,22,10,28,62,13
PhoneService,No,Yes,Yes,No,Yes,Yes,Yes,No,Yes,Yes,Yes
MultipleLines,No phone service,No,No,No phone service,No,Yes,Yes,No phone service,Yes,No,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic,Fiber optic,Fiber optic,DSL,Fiber optic,DSL,DSL
OnlineSecurity,No,Yes,Yes,Yes,No,No,No,Yes,No,Yes,Yes


In [8]:
raw_data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [9]:
raw_data.shape

(7043, 21)

In [10]:
data = raw_data.copy()

data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [11]:
gender_unique_values = data['gender'].unique()
print(gender_unique_values)


['Female' 'Male']


In [12]:
partner_unique_values = data['Partner'].unique()
print(partner_unique_values)


['Yes' 'No']


In [13]:
dependents_unique_values = data['Dependents'].unique()
print(dependents_unique_values)

['No' 'Yes']


In [14]:
data.drop(['customerID'], axis=1, inplace=True)

In [15]:
def data_quick_view():
    for column in data.columns:
        unique_values = data[column].unique()
        print(f'Unique values in column "{column}": {len(unique_values)}    They are {unique_values}')
        value_counts = data[column].value_counts()
        print(f'Value counts for column "{column}"\n{value_counts}\n\n')
data_quick_view()

Unique values in column "gender": 2    They are ['Female' 'Male']
Value counts for column "gender"
gender
Male      3555
Female    3488
Name: count, dtype: int64


Unique values in column "SeniorCitizen": 2    They are [0 1]
Value counts for column "SeniorCitizen"
SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64


Unique values in column "Partner": 2    They are ['Yes' 'No']
Value counts for column "Partner"
Partner
No     3641
Yes    3402
Name: count, dtype: int64


Unique values in column "Dependents": 2    They are ['No' 'Yes']
Value counts for column "Dependents"
Dependents
No     4933
Yes    2110
Name: count, dtype: int64


Unique values in column "tenure": 73    They are [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
Value counts for column "tenure"
tenure
1     613
72    362
2     238
3     200

In [16]:
data.isnull().sum()

#GOod, now empty cell, impressive dataset !!!

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [17]:
data['TotalCharges'] = data['TotalCharges'].replace(' ', np.nan, regex=True)
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'])

In [18]:
data.isnull().sum() 

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [19]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
data['TotalCharges'] = imputer.fit_transform(data[['TotalCharges']])

data.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [21]:
# Convert 'Churn' to binary values bcus its the dependent variable
churn_map = {'Yes': 1, 'No': 0}
data['Churn'] = data['Churn'].replace(churn_map)


In [22]:
data.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1


In [23]:
data_quick_view()

Unique values in column "gender": 2    They are ['Female' 'Male']
Value counts for column "gender"
gender
Male      3555
Female    3488
Name: count, dtype: int64


Unique values in column "SeniorCitizen": 2    They are [0 1]
Value counts for column "SeniorCitizen"
SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64


Unique values in column "Partner": 2    They are ['Yes' 'No']
Value counts for column "Partner"
Partner
No     3641
Yes    3402
Name: count, dtype: int64


Unique values in column "Dependents": 2    They are ['No' 'Yes']
Value counts for column "Dependents"
Dependents
No     4933
Yes    2110
Name: count, dtype: int64


Unique values in column "tenure": 73    They are [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
Value counts for column "tenure"
tenure
1     613
72    362
2     238
3     200

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [25]:
X = data.drop('Churn', axis=1)
y = data['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [26]:
#Select d features
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [27]:
#Creating transformers for numerical and categorical features
num_transformer = Pipeline(steps=[('scaler', StandardScaler())])
cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False))])

In [28]:
# Create a preprocessor that applies transformers to columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical),
        ('cat', cat_transformer, categorical)
    ])



# same but to selected features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', OneHotEncoder(sparse=False), categorical)
    ])

In [29]:
# Apply the preprocessor to training and test data splited above
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)



In [30]:
# using sklean to train d Random Forest
rf_model = RandomForestClassifier(random_state=1)

rf_model.fit(X_train_prep, y_train)

rf_preds = rf_model.predict(X_test_prep)

In [31]:
# training with Extra Trees
et_model = ExtraTreesClassifier(random_state=1)

et_model.fit(X_train_prep, y_train)

et_preds = et_model.predict(X_test_prep)

In [32]:
# Using XGBoost to train a Extreme Bossting Method Model
xgb_model = XGBClassifier(random_state=1)

xgb_model.fit(X_train_prep, y_train)

xgb_preds = xgb_model.predict(X_test_prep)

  if is_sparse(data):


In [33]:
# Using LightGBM now
lgbm_model = LGBMClassifier(random_state=1)

lgbm_model.fit(X_train_prep, y_train)

lgbm_preds = lgbm_model.predict(X_test_prep)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [34]:
# Evaluating the models by using classification report
models = [rf_model, et_model, xgb_model, lgbm_model]
model_names = ['Random Forest', 'Extra Trees', 'XGBoost', 'LightGBM']

for model, name in zip(models, model_names):
    accuracy = accuracy_score(y_test, model.predict(X_test_prep))
    print(f"Accuracy for {name}: {accuracy}")
    print(f"Classification Report for {name}:\n{classification_report(y_test, model.predict(X_test_prep))}")


Accuracy for Random Forest: 0.7934705464868701
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1061
           1       0.59      0.54      0.57       348

    accuracy                           0.79      1409
   macro avg       0.72      0.71      0.71      1409
weighted avg       0.79      0.79      0.79      1409

Accuracy for Extra Trees: 0.7721788502484032
Classification Report for Extra Trees:
              precision    recall  f1-score   support

           0       0.84      0.87      0.85      1061
           1       0.54      0.48      0.51       348

    accuracy                           0.77      1409
   macro avg       0.69      0.67      0.68      1409
weighted avg       0.76      0.77      0.77      1409

Accuracy for XGBoost: 0.801277501774308
Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87 

In [37]:

optimal_model = et_model
feature_importances = optimal_model.feature_importances_
feature_names = list(data.columns)

feature_names

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [38]:
feature_importance_dict = dict(zip(feature_names, feature_importances))
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.4f}")


Partner: 0.1273
gender: 0.1183
SeniorCitizen: 0.1081
Churn: 0.0339
MonthlyCharges: 0.0268
tenure: 0.0226
Dependents: 0.0221
OnlineSecurity: 0.0179
InternetService: 0.0178
PaperlessBilling: 0.0162
StreamingMovies: 0.0157
PhoneService: 0.0146
DeviceProtection: 0.0145
MultipleLines: 0.0143
OnlineBackup: 0.0141
PaymentMethod: 0.0120
Contract: 0.0032
TechSupport: 0.0031
StreamingTV: 0.0030
TotalCharges: 0.0012
