In [None]:
#import libraries

import pandas as pd
import numpy as np
import seaborn as sns

# Read the Telco customer-churn data from a CSV file saved on Google Drive.

from google.colab import drive
drive.mount('/content/drive')

# Path to the CSV file on Google Drive
file_path = "/content/drive/My Drive/Colab Notebooks/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Load the CSV with pandas. If loading fails, report the problem and re-raise so
# the notebook stops here instead of failing in a later cell with a NameError
# on `data`.
try:
    data = pd.read_csv(file_path)
except Exception as e:
    print("Error loading CSV file:", e)
    raise

print(data.head())

Mounted at /content/drive
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingM

In [None]:
# Preprocessing
# Show the dtype of every column in `data`; the bare last expression is what
# the cell displays.
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [None]:
# Convert the 'TotalCharges' column to numeric values. Entries that cannot be
# parsed become NaN (errors='coerce') and are then filled with 0.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on data['TotalCharges']: the in-place form operates
# on an intermediate object, is deprecated in recent pandas, and does not
# update `data` when Copy-on-Write is enabled.

data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce').fillna(0)


In [None]:
# Convert the 'Churn' column to binary values: 'Yes' -> 1, anything else -> 0.
#
# Matching on both 'Yes' and 1 keeps the cell idempotent: if it is run a second
# time on the already-encoded column, the 1s stay 1 instead of being silently
# reset to 0 (which is what comparing against 'Yes' alone would do).

data['Churn'] = data['Churn'].isin(['Yes', 1]).astype('int64')


In [None]:
# Hold out 20% of the rows for testing (80/20 split, random_state=1).

from sklearn.model_selection import train_test_split

# Target is the 'Churn' column; every other column is a candidate feature.
y = data['Churn']
X = data.drop(columns='Churn')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Report the shape of each split.
for shape_label, split_part in [
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
]:
    print(shape_label, split_part.shape)


X_train shape: (5634, 20)
y_train shape: (5634,)
X_test shape: (1409, 20)
y_test shape: (1409,)


In [None]:
# Feature column names, grouped by how they are preprocessed.

categorical = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Sub-frame holding only the categorical columns
categorical_features = data.loc[:, categorical]

# Sub-frame holding only the numerical columns
numerical_features = data.loc[:, numerical]


In [None]:
# Scale the numerical features with StandardScaler and rebuild a DataFrame with
# the original column names.

from sklearn.preprocessing import StandardScaler

# Learn the mean/std from the training rows only, so statistics from the test
# split do not leak into the preprocessing.
scaler = StandardScaler()
scaler.fit(X_train[numerical])

# Transform every row and keep the original row index, so the later
# .loc[X_train.index] / .loc[X_test.index] selections line up by label rather
# than relying on `data` having a default 0..n-1 index.
scaled_numerical_features = pd.DataFrame(
    scaler.transform(numerical_features),
    columns=numerical_features.columns,
    index=numerical_features.index,
)


In [None]:
# One-hot encode the categorical features (sparse_output=False gives a dense
# array) and rebuild a DataFrame with the generated column names.

from sklearn.preprocessing import OneHotEncoder

# Learn the category levels from the training rows only, so the encoding is not
# informed by the test split. handle_unknown='ignore' encodes a level that is
# absent from the training rows as all zeros instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train[categorical])

# Transform every row and keep the original row index, so the later
# .loc[X_train.index] / .loc[X_test.index] selections line up by label rather
# than relying on `data` having a default 0..n-1 index.
encoded_categorical_features = pd.DataFrame(
    encoder.transform(categorical_features),
    columns=encoder.get_feature_names_out(),
    index=categorical_features.index,
)

encoded_categorical_features.head()


Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
# Build the final train/test design matrices: for each split, take that split's
# rows from the scaled numerical frame and the one-hot encoded frame, and place
# the two side by side with pd.concat(axis=1).

feature_blocks = (scaled_numerical_features, encoded_categorical_features)

X_train_combined = pd.concat(
    [block.loc[X_train.index] for block in feature_blocks],
    axis=1,
)
X_test_combined = pd.concat(
    [block.loc[X_test.index] for block in feature_blocks],
    axis=1,
)

print("X_train_combined shape:", X_train_combined.shape)
print("X_test_combined shape:", X_test_combined.shape)


X_train_combined shape: (5634, 46)
X_test_combined shape: (1409, 46)


In [None]:
# Fit a random forest and an extra-trees classifier on the training features,
# then score both on the held-out test set.

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score

# Random Forest: construct and fit in one step (fit returns the estimator)
random_forest = RandomForestClassifier(random_state=1).fit(X_train_combined, y_train)

# Extra Trees: same pattern
extra_trees = ExtraTreesClassifier(random_state=1).fit(X_train_combined, y_train)

# Random Forest: test-set predictions and accuracy
y_pred_rf = random_forest.predict(X_test_combined)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Extra Trees: test-set predictions and accuracy
y_pred_et = extra_trees.predict(X_test_combined)
accuracy_et = accuracy_score(y_test, y_pred_et)

print("Random Forest Accuracy:", accuracy_rf)
print("Extra Trees Accuracy:", accuracy_et)


Random Forest Accuracy: 0.7906316536550745
Extra Trees Accuracy: 0.7700496806245565


In [None]:
# Rank the features by the importance scores of the fitted ExtraTreesClassifier
# and report the two highest-ranked feature names, in order.

importances = extra_trees.feature_importances_

# Pair each importance with its column name and sort the (importance, name)
# tuples from largest to smallest.
features_sorted_by_importance = list(
    zip(extra_trees.feature_importances_, X_train_combined.columns)
)
features_sorted_by_importance.sort(reverse=True)

# Names of the two top-ranked features
top_feature_names = [
    feature_name for score, feature_name in features_sorted_by_importance[:2]
]

print("Top two most important features:")
print(top_feature_names[0])
print(top_feature_names[1])


Top two most important features:
TotalCharges
tenure


In [None]:
# Fit two gradient-boosting models (XGBoost and LightGBM) on the training
# features and score both on the held-out test set.
# accuracy_score is the function imported in the random-forest cell.

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# XGBoost: construct and fit in one step (fit returns the estimator)
xgboost = XGBClassifier(random_state=1).fit(X_train_combined, y_train)

# LightGBM: same pattern
lightgbm = LGBMClassifier(random_state=1).fit(X_train_combined, y_train)

# XGBoost: test-set predictions and accuracy
y_pred_xgboost = xgboost.predict(X_test_combined)
accuracy_xgboost = accuracy_score(y_test, y_pred_xgboost)

# LightGBM: test-set predictions and accuracy
y_pred_lightgbm = lightgbm.predict(X_test_combined)
accuracy_lightgbm = accuracy_score(y_test, y_pred_lightgbm)

print("XGBoost Accuracy:", accuracy_xgboost)
print("LightGBM Accuracy:", accuracy_lightgbm)


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001581 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
XGBoost Accuracy: 0.7934705464868701
LightGBM Accuracy: 0.8133427963094393
