# Outlines
1. [Preprocessing](#Task1)
2. [Feature Engineering](#Task2)

In [1]:
# import all necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# load the dataset into a pandas DataFrame
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# display numbers of rows and columns
print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns.')

The dataset has 7043 rows and 21 columns.


In [4]:
# importing all necessary methods from sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.model_selection import RandomizedSearchCV

# Preprocessing <a name= "Task1"></a>

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
# convert the TotalCharges column data type to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors ='coerce')

In [7]:
# check if there's any null values in the TotalCharges column
df['TotalCharges'].isna().sum()

11

In [8]:
# fill all the null values found in the TotalCharges column with 0
df['TotalCharges'].fillna(0,inplace= True)

In [9]:
# check if the data type has been changed
df['TotalCharges'].dtype

dtype('float64')

In [10]:
# check the unique values in the Churn column
df['Churn'].unique()

array(['No', 'Yes'], dtype=object)

In [11]:
# represent the Churn values with numbers 0 --> No and 1 --> Yes
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == "Yes" else 0)

In [12]:
df['Churn'].unique()

array([0, 1], dtype=int64)

In [13]:
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity',
               'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']

numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Feature Engineering  <a name= "Task2"></a>

In [14]:
# calling StandardScaler() method and fit and transform it on the numerical predictors
scaler = StandardScaler()
numeric_scaled = scaler.fit_transform(df[numerical])
numerical_df = pd.DataFrame(numeric_scaled, columns=numerical)
numerical_df

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.992611
1,0.066327,-0.259629,-0.172165
2,-1.236724,-0.362660,-0.958066
3,0.514251,-0.746535,-0.193672
4,-1.236724,0.197365,-0.938874
...,...,...,...
7038,-0.340876,0.665992,-0.127605
7039,1.613701,1.277533,2.242606
7040,-0.870241,-1.168632,-0.852932
7041,-1.155283,0.320338,-0.870513


In [15]:
# use OneHotEncoder() function to transform the categorical features to numbers.
one_hot_encoded = OneHotEncoder(sparse_output= False)
encoded_cat = one_hot_encoded.fit_transform(df[categorical])
encoded_cat_df = pd.DataFrame(encoded_cat)
encoded_cat_df.columns = one_hot_encoded.get_feature_names_out()

In [16]:
encoded_cat_df.head()

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [17]:
# concatenate the two DataFrame 
df_combined = pd.concat([numerical_df, encoded_cat_df], axis= 1)

In [18]:
# calling the combined DataFrame
df_combined

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.277445,-1.160323,-0.992611,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.066327,-0.259629,-0.172165,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-1.236724,-0.362660,-0.958066,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.514251,-0.746535,-0.193672,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,-1.236724,0.197365,-0.938874,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,-0.340876,0.665992,-0.127605,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7039,1.613701,1.277533,2.242606,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7040,-0.870241,-1.168632,-0.852932,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7041,-1.155283,0.320338,-0.870513,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [19]:
# display numbers of rows and columns
print(f'The combined DataFrame has {df_combined.shape[0]} rows and {df_combined.shape[1]} columns.')

The combined DataFrame has 7043 rows and 46 columns.


In [20]:
# splitting the dataset into training set and testing set
X = df_combined
y = df["Churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size= 0.2, random_state= 1)

In [21]:
# calling all the learning algorithms for the quiz
rfc = RandomForestClassifier(random_state=1)
etc = ExtraTreesClassifier(random_state=1)
xgc = XGBClassifier(random_state=1)
ltb = LGBMClassifier(random_state=1)

In [22]:
# train a RandomForestClassifier 
rfc.fit(X_train, y_train)
y_pred1 = rfc.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred1)}")
print(f"Precision: {precision_score(y_test, y_pred1)}")
print(f"F1_score: {f1_score(y_test, y_pred1)}")

Accuracy: 0.7906316536550745
Precision: 0.583596214511041
F1_score: 0.556390977443609


In [23]:
# train an ExtraTreesClassifier
etc.fit(X_train, y_train)
y_pred2 = etc.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred2)}")
print(f"Precision: {precision_score(y_test, y_pred2)}")
print(f"F1_score: {f1_score(y_test, y_pred2)}")

Accuracy: 0.7700496806245565
Precision: 0.5384615384615384
F1_score: 0.509090909090909


In [24]:
# train a XGBClassifier
xgc.fit(X_train, y_train)
y_pred3 = xgc.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred3)}")
print(f"Precision: {precision_score(y_test, y_pred3)}")
print(f"F1_score: {f1_score(y_test, y_pred3)}")

Accuracy: 0.7934705464868701
Precision: 0.5861027190332326
F1_score: 0.5714285714285715


In [25]:
# train a LGBMClassifier
ltb.fit(X_train, y_train)
y_pred4 = ltb.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred4)}")
print(f"Precision: {precision_score(y_test, y_pred4)}")
print(f"F1_score: {f1_score(y_test, y_pred4)}")

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Accuracy: 0.8133427963094393
Precision: 0.6299694189602446
F1_score: 0.6103703703703703


In [26]:
# setting hyperparameters
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

In [27]:
hyperparameter_grid = {'n_estimators' : n_estimators, 'min_samples_leaf': min_samples_leaf, 'min_samples_split' : min_samples_split,
                      'max_features' : max_features}
random_search = RandomizedSearchCV(etc, hyperparameter_grid, cv=5, n_iter= 10, scoring= 'accuracy', n_jobs=1, verbose= 1, random_state=1)

In [31]:
# train the ExtraTreesClassifier with hyperparameters
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\PC\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\PC\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\PC\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\PC\AppData\Local\Programs\Python\Python310\lib\site-package

In [32]:
# get the best parameter
random_search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 9,
 'min_samples_leaf': 8,
 'max_features': 'sqrt'}

In [36]:
# check the accuracy of the best estimator
accuracy = accuracy_score(y_test,random_search.best_estimator_.predict(X_test))
accuracy   # 0.8041163946061036

0.8041163946061036

In [34]:
# find the two most importance features
attributes = list(df_combined.columns)
features_importances = random_search.best_estimator_.feature_importances_
sorted(zip(features_importances, attributes), reverse= True)  # Contract_Month_to_month, tenure

[(0.1522365223825757, 'Contract_Month-to-month'),
 (0.09280046728070009, 'tenure'),
 (0.07499756668925914, 'OnlineSecurity_No'),
 (0.06528743070518972, 'InternetService_Fiber optic'),
 (0.06414089705963495, 'TechSupport_No'),
 (0.05442309306017465, 'Contract_Two year'),
 (0.051665955261978744, 'PaymentMethod_Electronic check'),
 (0.047713974468078756, 'TotalCharges'),
 (0.03268688697354864, 'InternetService_DSL'),
 (0.030077412045788524, 'OnlineBackup_No'),
 (0.028542700191358017, 'Contract_One year'),
 (0.021699839701421534, 'OnlineSecurity_Yes'),
 (0.01685724934907098, 'DeviceProtection_No'),
 (0.01492623419583065, 'MonthlyCharges'),
 (0.0145586189112325, 'TechSupport_Yes'),
 (0.012597816332646977, 'OnlineBackup_Yes'),
 (0.011883172396864736, 'PaperlessBilling_No'),
 (0.01177934523732429, 'PaperlessBilling_Yes'),
 (0.010653066633429322, 'gender_Female'),
 (0.010332617797728823, 'gender_Male'),
 (0.009291725680038769, 'StreamingMovies_Yes'),
 (0.008991640248744256, 'OnlineBackup_No in

In [None]:
# min_weight_fraction_leaf
# max_leaf_nodes