In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors= 'coerce')

In [5]:
df['TotalCharges'].isna().sum()

11

In [6]:
df['TotalCharges'].fillna(0, inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [8]:
import numpy as np

In [9]:
#convert the churn column to numeric values for analysis

df['Churn'] = np.where(df['Churn'] == 'Yes', 1, 0)

In [10]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [11]:
from sklearn.model_selection import train_test_split

X = df.drop(['Churn'], axis=1)
y = df['Churn']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [12]:
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']

In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [14]:
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [15]:
processed_df_train = preprocessor.fit_transform(x_train)   
processed_df_test = preprocessor.fit_transform(x_test)



In [16]:
processed_columns = (list(numerical_cols) +
                     list(preprocessor.named_transformers_['cat']
                          .named_steps['onehot']
                          .get_feature_names_out(categorical_cols)))

processed_df2_train = pd.DataFrame(processed_df_train, columns=processed_columns)
processed_df2_test = pd.DataFrame(processed_df_test, columns=processed_columns)

In [17]:
processed_df2_train

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-0.825884,-1.497530,-0.890947,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.395961,0.302996,0.389693,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.577078,0.012320,1.060945,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1.577078,0.686687,1.775397,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.092777,0.186726,-0.102671,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,-0.948068,1.186648,-0.599602,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5630,1.129068,-1.489225,-0.479886,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
5631,-0.174233,1.359393,0.309802,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5632,-1.233166,-0.344795,-0.954599,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [18]:
processed_df2_test

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.335987,0.504761,0.454869,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,1.353363,1.255673,1.844061,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
2,-0.844170,-0.655741,-0.777950,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,-1.129035,-0.469262,-0.898922,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.925560,0.040227,-0.718126,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,-0.315134,1.373888,0.161477,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1405,-0.722084,-1.501558,-0.862926,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1406,1.556839,-0.357707,0.705233,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1407,1.312668,0.701230,1.498692,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [22]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.1-py3-none-win_amd64.whl (99.7 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.1


In [24]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.1.0-py3-none-win_amd64.whl (1.3 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.1.0


In [25]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb


classifiers = [
    ('Random Forest', RandomForestClassifier(random_state=1)),
    ('Extra Trees', ExtraTreesClassifier(random_state=1)),
    ('XGBoost', xgb.XGBClassifier(random_state=1)),
    ('LightGBM', lgb.LGBMClassifier(random_state=1))
]

# Iterate through the classifiers, train, and evaluate each one
for name, clf in classifiers:
    clf.fit(processed_df_train, y_train)
    y_pred = clf.predict(processed_df_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Classifier Accuracy: {accuracy}")

Random Forest Classifier Accuracy: 0.7963094393186657
Extra Trees Classifier Accuracy: 0.7665010645848119
XGBoost Classifier Accuracy: 0.7920511000709723
[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001898 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
LightGBM Classifier Accuracy: 0.8041163946061036


In [27]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV

et_classifier = ExtraTreesClassifier(random_state=1)

n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

random_search = RandomizedSearchCV(
    et_classifier,
    param_distributions=hyperparameter_grid,
    n_iter=10,  
    scoring='accuracy',
    cv=5,  
    n_jobs=-1,
    verbose=1,
    random_state=1
)

random_search.fit(processed_df_train, y_train)

best_params = random_search.best_params_
best_estimator = random_search.best_estimator_

print("Best Hyperparameters:", best_params)
print("Best Estimator:", best_estimator)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Abdulrahim\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abdulrahim\anaconda3\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Abdulrahim\anaconda3\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Abdulrahim\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    ra

Best Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}
Best Estimator: ExtraTreesClassifier(min_samples_leaf=8, min_samples_split=9, n_estimators=1000,
                     random_state=1)


In [29]:
from sklearn.ensemble import ExtraTreesClassifier

best_hyperparameters = {
    'n_estimators': best_params['n_estimators'],
    'min_samples_split': best_params['min_samples_split'],
    'min_samples_leaf': best_params['min_samples_leaf'],
    'max_features': best_params['max_features'],
    'random_state': 1 
}

optimal_et_classifier = ExtraTreesClassifier(**best_hyperparameters)

optimal_et_classifier.fit(processed_df_train, y_train)

y_pred_optimal = optimal_et_classifier.predict(processed_df_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"{name} Classifier Accuracy: {accuracy}")

LightGBM Classifier Accuracy: 0.8041163946061036


In [30]:
feature_importances = best_estimator.feature_importances_

In [31]:
feature_importances_df = pd.DataFrame({'feature': processed_columns, 'importance': feature_importances})

In [32]:
feature_importances_df

Unnamed: 0,feature,importance
0,tenure,0.0928
1,MonthlyCharges,0.014926
2,TotalCharges,0.047714
3,gender_Female,0.010653
4,gender_Male,0.010333
5,SeniorCitizen_0,0.006864
6,SeniorCitizen_1,0.007144
7,Partner_No,0.008556
8,Partner_Yes,0.008969
9,Dependents_No,0.005045


In [35]:
feature_importances_df.sort_values(by=['importance'], ascending = False)

Unnamed: 0,feature,importance
37,Contract_Month-to-month,0.152237
0,tenure,0.0928
19,OnlineSecurity_No,0.074998
17,InternetService_Fiber optic,0.065287
28,TechSupport_No,0.064141
39,Contract_Two year,0.054423
44,PaymentMethod_Electronic check,0.051666
2,TotalCharges,0.047714
16,InternetService_DSL,0.032687
22,OnlineBackup_No,0.030077


In [37]:
feature_importances_df.sort_values(by=['importance'], ascending = False).head(2)

Unnamed: 0,feature,importance
37,Contract_Month-to-month,0.152237
0,tenure,0.0928
