In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the churn dataset from a CSV located relative to the working directory.
csv_path = './WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)

In [3]:
# Convert TotalCharges to a numeric dtype; entries that cannot be parsed become NaN
# (errors='coerce').
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill the resulting missing values with the constant 0.
imputer = SimpleImputer(strategy='constant', fill_value=0)
total_charges_filled = imputer.fit_transform(df[['TotalCharges']])
df['TotalCharges'] = total_charges_filled

In [4]:
# Encode the 'Churn' target as integers: 'No' -> 0, 'Yes' -> 1.
# The dtype guard keeps this cell safe to re-run: pushing an already-encoded
# 0/1 column through the mapping would match no keys and turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

In [5]:
# Separate the target column (y) from the remaining columns (X).
y = df['Churn']
X = df.drop(columns=['Churn'])

In [6]:
# Hold out 20% of the rows as a test split; the fixed random_state makes the
# shuffle reproducible between runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Standardise the numeric columns. The scaler is fitted on the training split
# only and then applied as-is to the test split.
numerical_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
X_train_numerical = X_train[numerical_columns]
X_test_numerical = X_test[numerical_columns]
scaler = StandardScaler()

# The scaler returns plain arrays, so the frames are rebuilt with the original
# column names; they get a default 0..n-1 index (row order is unchanged).
X_train_numerical_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_numerical),
    columns=X_train_numerical.columns,
)
X_test_numerical_scaled = pd.DataFrame(
    scaler.transform(X_test_numerical),
    columns=X_test_numerical.columns,
)

In [8]:
# Categorical feature encoding (one-hot). The encoder learns its categories
# from the training split only and is then applied unchanged to the test split.
# The column list is defined once so both splits always select the same columns.
categorical_columns = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# handle_unknown='ignore': a category that occurs only in the test split is
# encoded as all zeros for that feature instead of making transform() raise.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_categorical = X_train[categorical_columns]
X_test_categorical = X_test[categorical_columns]

# The encoder returns plain arrays, so the frames are rebuilt with the generated
# feature names; they get a default 0..n-1 index (row order is unchanged).
X_train_categorical_encoded = pd.DataFrame(
    encoder.fit_transform(X_train_categorical),
    columns=encoder.get_feature_names_out(X_train_categorical.columns),
)
X_test_categorical_encoded = pd.DataFrame(
    encoder.transform(X_test_categorical),
    columns=encoder.get_feature_names_out(X_test_categorical.columns),
)


In [9]:
# Fit a random forest on the scaled numeric features joined column-wise with
# the one-hot encoded features.
X_train_prepared = pd.concat([X_train_numerical_scaled, X_train_categorical_encoded], axis=1)
rf_classifier = RandomForestClassifier(random_state=1)
rf_classifier.fit(X_train_prepared, y_train)

In [10]:
# Predict labels for the test split with the fitted random forest.
X_test_prepared = pd.concat([X_test_numerical_scaled, X_test_categorical_encoded], axis=1)
y_pred_rf = rf_classifier.predict(X_test_prepared)

In [11]:
# Accuracy of the random forest predictions against the test labels.
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
accuracy_rf

0.7913413768630234

In [13]:
# Fit an extra-trees classifier on the scaled numeric features joined
# column-wise with the one-hot encoded features.
X_train_prepared = pd.concat([X_train_numerical_scaled, X_train_categorical_encoded], axis=1)
et_classifier = ExtraTreesClassifier(random_state=1)
et_classifier.fit(X_train_prepared, y_train)


In [14]:
# Fit an XGBoost classifier on the scaled numeric features joined column-wise
# with the one-hot encoded features.
X_train_prepared = pd.concat([X_train_numerical_scaled, X_train_categorical_encoded], axis=1)
xgb_classifier = XGBClassifier(random_state=1)
xgb_classifier.fit(X_train_prepared, y_train)

In [15]:
# Fit a LightGBM classifier on the scaled numeric features joined column-wise
# with the one-hot encoded features.
X_train_prepared = pd.concat([X_train_numerical_scaled, X_train_categorical_encoded], axis=1)
lgbm_classifier = LGBMClassifier(random_state=1)
lgbm_classifier.fit(X_train_prepared, y_train)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [16]:
# Predict test-set labels with each of the four fitted models.
# The test feature matrix (scaled numeric + one-hot columns) is assembled once
# and shared, instead of repeating the same pd.concat for every model.
X_test_prepared = pd.concat([X_test_numerical_scaled, X_test_categorical_encoded], axis=1)

y_pred_rf = rf_classifier.predict(X_test_prepared)
y_pred_et = et_classifier.predict(X_test_prepared)
y_pred_xgb = xgb_classifier.predict(X_test_prepared)
y_pred_lgbm = lgbm_classifier.predict(X_test_prepared)

In [17]:
# Test-set accuracy for each model, computed in the order RF, ET, XGBoost, LightGBM.
accuracy_rf, accuracy_et, accuracy_xgb, accuracy_lgbm = (
    accuracy_score(y_test, predictions)
    for predictions in (y_pred_rf, y_pred_et, y_pred_xgb, y_pred_lgbm)
)


In [18]:
# Show the LightGBM accuracy on the test split.
accuracy_lgbm

0.8034066713981547

In [19]:
# Show the XGBoost accuracy on the test split.
accuracy_xgb

0.7934705464868701

In [20]:
# Show the extra-trees accuracy on the test split.
accuracy_et

0.7672107877927609

In [21]:
# Build the text classification report (per-class precision / recall / f1 / support)
# for each model, in the order RF, ET, XGBoost, LightGBM.
report_rf, report_et, report_xgb, report_lgbm = (
    classification_report(y_test, predictions)
    for predictions in (y_pred_rf, y_pred_et, y_pred_xgb, y_pred_lgbm)
)

In [22]:
# The report is a multi-line string; print it so the table renders with real
# line breaks instead of being echoed as a repr full of literal "\n".
print(report_lgbm)

'              precision    recall  f1-score   support\n\n           0       0.87      0.87      0.87      1061\n           1       0.61      0.59      0.60       348\n\n    accuracy                           0.80      1409\n   macro avg       0.74      0.73      0.73      1409\nweighted avg       0.80      0.80      0.80      1409\n'

In [23]:
# The report is a multi-line string; print it so the table renders with real
# line breaks instead of being echoed as a repr full of literal "\n".
print(report_xgb)

'              precision    recall  f1-score   support\n\n           0       0.86      0.87      0.86      1061\n           1       0.59      0.56      0.57       348\n\n    accuracy                           0.79      1409\n   macro avg       0.72      0.71      0.72      1409\nweighted avg       0.79      0.79      0.79      1409\n'

In [None]:
# The report is a multi-line string; print it so the table renders with real
# line breaks instead of being echoed as a repr full of literal "\n".
print(report_et)