# Import libraries

In [125]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
import pandas as pd
import numpy as np
from kagglehub import KaggleDatasetAdapter, dataset_load

# Load Dataset

In [126]:
# Fetch the Telco churn CSV from Kaggle as a pandas DataFrame and discard
# the customerID column.
df = dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "blastchar/telco-customer-churn",
    "WA_Fn-UseC_-Telco-Customer-Churn.csv",
).drop(columns=['customerID'])

Using Colab cache for faster access to the 'telco-customer-churn' dataset.


# Analysing the data

The data set includes information about:

*   Customers who left within the last month – the column is called Churn
*   Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
*   Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
*   Demographic info about customers – gender, age range, and if they have partners and dependents

In [127]:
# Preview the first five rows of the freshly loaded frame.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [128]:
# Summary statistics for the numeric columns. TotalCharges is still an
# object column at this point, so it does not appear in this table.
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [129]:
# List each column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [130]:
# Number of distinct values per column; DataFrame.nunique() returns the same
# column-indexed Series as building it by hand with a dict comprehension.
df.nunique()

Unnamed: 0,0
gender,2
SeniorCitizen,2
Partner,2
Dependents,2
tenure,73
PhoneService,2
MultipleLines,3
InternetService,3
OnlineSecurity,3
OnlineBackup,3


This Series shows that many of the columns are binary (only two unique values), so they can simply be encoded as 0 and 1.

# Preprocessing Data

In [131]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
df = df.assign(
    TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce')
)

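Although `TotalCharges` holds numeric values, pandas loads it as an `object` column, which indicates that some entries are not valid numbers. `pd.to_numeric(..., errors='coerce')` converts the column to floats and turns any unparseable entry into NaN.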

In [132]:
# Label -> integer codes used for the two-valued text columns.
gender_codes = {'Female': 0, 'Male': 1}
yes_no_codes = {'No': 0, 'Yes': 1}

# Two-valued text columns other than gender (which has its own labels).
binary_cols = [
    col for col in df.select_dtypes(include='object').columns
    if df[col].nunique() == 2 and col != 'gender'
]

# Only encode gender while it is still text: Series.map() turns every value
# missing from the mapping into NaN, so re-running this cell on the already
# encoded 0/1 column would blank it out entirely.
if not pd.api.types.is_numeric_dtype(df['gender']):
    df['gender'] = df['gender'].map(gender_codes)

for col in binary_cols:
    encoded = df[col].map(yes_no_codes)
    # Fail loudly on labels other than Yes/No rather than letting them turn
    # into NaN and be dropped silently later on.
    unmapped = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {col!r} contains values other than 'Yes'/'No': {unmapped.tolist()}"
        )
    df[col] = encoded

Replacing binary columns with 0s and 1s.

Let's take a look at the columns left.

In [133]:
# Preview the frame after the binary columns were replaced by 0/1 codes.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,1,0,0,0,34,1,No,DSL,Yes,No,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,1,0,0,0,2,1,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,1,0,0,0,45,0,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,0,0,0,0,2,1,No,Fiber optic,No,No,No,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1


Let's encode the remaining object columns using OneHotEncoder.

In [134]:
# Every column that is not int64/float64 is treated as categorical.
categorical_columns = df.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

# Dense output so the encoded matrix can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(df[categorical_columns])

# Reuse df's index: pd.concat(axis=1) aligns rows by index label, so a fresh
# RangeIndex would misalign (and NaN-pad) the rows whenever df's index is not
# the default 0..n-1.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

df_encoded = pd.concat([df, one_hot_df], axis=1)

# Replace the original text columns with their one-hot indicator columns.
df = df_encoded.drop(categorical_columns, axis=1)

Let's take a look one more time.

In [135]:
# Preview the fully numeric frame after one-hot encoding.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,1,29.85,29.85,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0,0,0,34,1,0,56.95,1889.5,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1,0,0,0,2,1,1,53.85,108.15,1,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,0,0,0,45,0,0,42.3,1840.75,0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0,0,0,0,2,1,1,70.7,151.65,1,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


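The scikit-learn Naive Bayes estimators used below cannot be fitted on NaN values, so we drop the rows that contain them (here, any NaNs produced when `TotalCharges` was coerced to numeric).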

In [136]:
# Keep only fully populated rows; rebind the name instead of mutating in place.
df = df.dropna(axis=0, how='any')

# Split train test

Target column - Churn, which represents the customers who left within the last month.

In [137]:
# Separate the Churn label from the feature matrix.
y = df['Churn']
X = df.drop(columns=['Churn'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced — consider stratify=y and re-running the evaluation cells.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Instances of Naive Bayes

In [138]:
# Gaussian Naive Bayes, constructed with default hyperparameters.
clf1 = GaussianNB()

In [139]:
# Multinomial Naive Bayes, constructed with default hyperparameters.
clf2 = MultinomialNB()

In [140]:
# Bernoulli Naive Bayes, constructed with default hyperparameters.
clf3 = BernoulliNB()

# Fit Naive Bayes

In [141]:
# Train all three classifiers on the same training split.
# NOTE(review): every model receives the full mixed feature matrix (0/1 flags
# plus tenure, MonthlyCharges and TotalCharges). Per the scikit-learn docs,
# BernoulliNB binarizes inputs at 0.0 by default and MultinomialNB is intended
# for count-like features — confirm this treatment of the continuous columns
# is intended.
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)

# Create the Table

In [142]:
# Pair each display label with its fitted classifier, then score every model
# on the held-out split (estimator.score returns mean accuracy).
fitted_models = {
    'GaussianNB': clf1,
    'MultinomialNB': clf2,
    'BernoulliNB': clf3,
}

df_score = pd.DataFrame(
    [[label, model.score(X_test, y_test)] for label, model in fitted_models.items()],
    columns=['ML Model', 'Accuracy'],
)

In [143]:
# Show the accuracy table (three rows, one per model).
df_score.head()

Unnamed: 0,ML Model,Accuracy
0,GaussianNB,0.680881
1,MultinomialNB,0.688699
2,BernoulliNB,0.700071


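The Naive Bayes models achieved comparable performance, with BernoulliNB obtaining the highest accuracy (~70%). This suggests that the dataset is largely composed of binary or one-hot encoded features, which aligns with the assumptions of BernoulliNB. GaussianNB performed worse, likely due to its assumption of normally distributed features, which does not hold for binary data. Note, however, that the differences are small (about two percentage points) and that all three accuracies fall below the ≈73% obtained by always predicting the majority class (1033 of the 1407 test customers did not churn), so accuracy alone is not a reliable way to rank these models.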

# Some more Ranking

In [144]:
# Print a per-class precision/recall/F1 report for each fitted model,
# preceded by the model's name.
for title, estimator in (
    ('GaussianNB', clf1),
    ('MultinomialNB', clf2),
    ('BernoulliNB', clf3),
):
    print(title)
    print(classification_report(y_test, estimator.predict(X_test)))

GaussianNB
              precision    recall  f1-score   support

           0       0.92      0.62      0.74      1033
           1       0.45      0.85      0.59       374

    accuracy                           0.68      1407
   macro avg       0.68      0.73      0.66      1407
weighted avg       0.79      0.68      0.70      1407

MultinomialNB
              precision    recall  f1-score   support

           0       0.88      0.67      0.76      1033
           1       0.45      0.74      0.56       374

    accuracy                           0.69      1407
   macro avg       0.66      0.71      0.66      1407
weighted avg       0.76      0.69      0.71      1407

BernoulliNB
              precision    recall  f1-score   support

           0       0.91      0.66      0.76      1033
           1       0.46      0.81      0.59       374

    accuracy                           0.70      1407
   macro avg       0.68      0.74      0.68      1407
weighted avg       0.79      0.70    

The classification reports show that all Naive Bayes variants suffer from low precision for the minority class (class 1, about 0.45) despite achieving high recall (0.74–0.85). This indicates a tendency to over-predict the positive class, likely due to class imbalance and the strong independence assumptions of Naive Bayes. BernoulliNB achieves the best overall trade-off (highest accuracy and macro-averaged F1, 0.68), while GaussianNB attains the highest recall for class 1 (0.85 versus 0.81 for BernoulliNB); performance for class 0 is comparable across the three models.