In [1]:
import pandas as pd

In [3]:
df = pd.read_csv("/content/drive/MyDrive/Data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [4]:
df.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [10]:
# convert 'TotalCharges' column to numeric values and filling missing values with 0.
#df.TotalCharges = df.TotalCharges.astype(int)
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')

# fill missing values with 0
df.TotalCharges.fillna(0, inplace = True)

In [11]:
df.TotalCharges.isnull().sum()

0

In [13]:
# Convert the 'Churn' column to binary values, where 'No' is mapped to 0 and 'Yes' is mapped to 1
df.Churn = df.Churn.map({"No": 0, "Yes": 1})

In [14]:
df.Churn

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

In [15]:
# Split the data into an 80-20 train-test split with a random state of “1”
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']
from sklearn.model_selection import train_test_split
x = df[categorical + numerical]
y = df.Churn
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

In [23]:
#Feature engineering: The numerical features should be scaled using StandardScaler, convert the output back to a dataframe and put back the column names.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

x_train[numerical] = scaler.fit_transform(x_train[numerical])
x_test[numerical] = scaler.transform(x_test[numerical])

# convert back to dataframe
x_train = pd.DataFrame(x_train, columns = categorical + numerical)
x_test = pd.DataFrame(x_test, columns = categorical +numerical)

In [33]:
#The categorical features are one-hot encoded using OneHotEncoder(set sparse_output to false), convert the output back to a dataframe and put back the column
#names.
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output= False)
x_train_encoded = encoder.fit_transform(x_train[categorical])
x_test_encoded = encoder.fit_transform(x_test[categorical])

In [36]:
# Convert back to DataFrame with the initial column names
column = encoder.get_feature_names_out(categorical)
x_train_encoded_df = pd.DataFrame(x_train_encoded, columns = column )
x_test_encoded_df = pd.DataFrame(x_test_encoded, columns = column)


In [37]:
x_train_encoded_df

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5630,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
5631,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5632,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [40]:
x_train.drop(categorical, axis=1, inplace=True)
x_test.drop(categorical, axis=1, inplace=True)


In [41]:
# Use scikit learn to train a random forest and extra trees classifier, and use xgboost and lightgbm to train an extreme boosting model and a light gradient
# boosting model. Use random_state = 1 for training all models and evaluate on the test set
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score

# random forest classifier
rf_classifier = RandomForestClassifier(random_state=1)
rf_classifier.fit(x_train, y_train)

# prediction on test set
rf_predict = rf_classifier.predict(x_test)



In [42]:
# Evaluate the models using accuracy
rf_accuracy = accuracy_score(y_test, rf_predict)


In [43]:
print("accuracy for the random forest is = ", rf_accuracy)

accuracy for the random forest is =  0.7665010645848119


In [45]:
# Extra Trees Classifier
extra_tree_classifier = ExtraTreesClassifier(random_state=1)
extra_tree_classifier.fit(x_train, y_train)

# predict
pred = extra_tree_classifier.predict(x_test)

# models accuracy
accuracy = accuracy_score(y_test, pred)

# Print the accuracies
print("extra trees accuracy is =", accuracy)


extra trees accuracy is = 0.7622427253371186


In [46]:
# XGBoost
xgb_model = xgb.XGBClassifier(random_state=1)
xgb_model.fit(x_train, y_train)

# predict
xgb_pred = xgb_model.predict(x_test)

# model accuracy
xgb_accuracy = accuracy_score(y_test, xgb_pred)

xgb_accuracy

0.7778566359119943

In [47]:
# lightGBM
lgb_model = lgb.LGBMClassifier(random_state=1)
lgb_model.fit(x_train, y_train)

# predict
lgb_pred = lgb_model.predict(x_test)

# model accuracy
lgb_accuracy = accuracy_score(y_test, lgb_pred)

# Print the accuracies

lgb_accuracy


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 584
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


0.7991483321504613