# 1. Data Loading

In [1]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv") # loading dataset

In [2]:
df.shape

(7043, 21)

In [3]:
df.head() # top 5 values

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.tail() # bottom 5 values

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


# 2. Data Description

In [5]:
df.info() # column information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
df.describe() # numerical description

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [7]:
df.isnull().sum() # checks for nulls in all columns

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

# 3. Data Imputation and Data Encoding

In [8]:
df = df.drop(columns=['customerID'],axis=1) # removes IDs, not helpful in predictions

In [9]:
count = 0 # checking if total charges has ' ' instead of NaN
for i in df['TotalCharges']:
    if i in " ":
        pass
    else:
        count+=1

In [10]:
df.replace(r'^\s*$', np.nan, regex=True, inplace=True) # removing all spaces and such special characters from dataframe

In [11]:
df['TotalCharges'] = df['TotalCharges'].astype('float64') # changing datatype of total charges

In [12]:
df.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [13]:
# encoding categorical features

from sklearn.preprocessing import LabelEncoder

encoders = {}

categorical_columns = df.select_dtypes(include=['object', 'string']).columns

for col in categorical_columns:
    encode = LabelEncoder()
    df[col] = encode.fit_transform(df[col])
    encoders[col] = encode

In [14]:
df['TotalCharges'].replace(np.nan, 0, inplace=True)

# 4. Model Creation and Evaluation

In [15]:
x = df.copy().drop(columns=['Churn'], axis = 1)
y = df.copy()['Churn']

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)

print("x:", x.shape)
print("y:", y.shape)
print("x_train:", x_train.shape)
print("x_test:", x_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

x: (7043, 19)
y: (7043,)
x_train: (4930, 19)
x_test: (2113, 19)
y_train: (4930,)
y_test: (2113,)


In [16]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()

log_model.fit(x_train, y_train)

predict = log_model.predict(x_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

print("Accuracy:", accuracy_score(y_test, predict)*100)
print("Precision:", precision_score(y_test, predict)*100)
print("Recall:", recall_score(y_test, predict)*100)
print("F1 Score:", f1_score(y_test, predict)*100)
print("ROC-AUC:", roc_auc_score(y_test, predict)*100)

print()

print(confusion_matrix(y_test, predict))

Accuracy: 79.74443918599148
Precision: 62.4750499001996
Recall: 56.6003616636528
F1 Score: 59.39278937381404
ROC-AUC: 72.27453980618537

[[1372  188]
 [ 240  313]]


In [17]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier()

xgb_model.fit(x_train, y_train)

predict = xgb_model.predict(x_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

print("Accuracy:", accuracy_score(y_test, predict)*100)
print("Precision:", precision_score(y_test, predict)*100)
print("Recall:", recall_score(y_test, predict)*100)
print("F1 Score:", f1_score(y_test, predict)*100)
print("ROC-AUC:", roc_auc_score(y_test, predict)*100)

print()

print(confusion_matrix(y_test, predict))

Accuracy: 77.99337434926645
Precision: 59.09090909090909
Recall: 51.71790235081374
F1 Score: 55.15911282545806
ROC-AUC: 69.51279732925302

[[1362  198]
 [ 267  286]]


In [18]:
from sklearn.ensemble import HistGradientBoostingClassifier

hist_model = HistGradientBoostingClassifier()

hist_model.fit(x_train, y_train)

predict = hist_model.predict(x_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

print("Accuracy:", accuracy_score(y_test, predict)*100)
print("Precision:", precision_score(y_test, predict)*100)
print("Recall:", recall_score(y_test, predict)*100)
print("F1 Score:", f1_score(y_test, predict)*100)
print("ROC-AUC:", roc_auc_score(y_test, predict)*100)

print()

print(confusion_matrix(y_test, predict))

Accuracy: 79.31850449597728
Precision: 62.44635193133047
Recall: 52.62206148282098
F1 Score: 57.11481844946026
ROC-AUC: 70.70205638243613

[[1385  175]
 [ 262  291]]


In [20]:
from catboost import CatBoostClassifier

cat_model = CatBoostClassifier(verbose=0)

cat_model.fit(x_train, y_train)

predict = cat_model.predict(x_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

print("Accuracy:", accuracy_score(y_test, predict)*100)
print("Precision:", precision_score(y_test, predict)*100)
print("Recall:", recall_score(y_test, predict)*100)
print("F1 Score:", f1_score(y_test, predict)*100)
print("ROC-AUC:", roc_auc_score(y_test, predict)*100)

print()

print(confusion_matrix(y_test, predict))

Accuracy: 79.88641741599622
Precision: 64.22222222222223
Recall: 52.26039783001808
F1 Score: 57.6271186440678
ROC-AUC: 70.96994250475262

[[1399  161]
 [ 264  289]]


In [21]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(verbose=0)

rf_model.fit(x_train, y_train)

predict = rf_model.predict(x_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

print("Accuracy:", accuracy_score(y_test, predict)*100)
print("Precision:", precision_score(y_test, predict)*100)
print("Recall:", recall_score(y_test, predict)*100)
print("F1 Score:", f1_score(y_test, predict)*100)
print("ROC-AUC:", roc_auc_score(y_test, predict)*100)

print()

print(confusion_matrix(y_test, predict))

Accuracy: 79.17652626597254
Precision: 63.04849884526559
Recall: 49.36708860759494
F1 Score: 55.375253549695735
ROC-AUC: 69.55533917559235

[[1400  160]
 [ 280  273]]
