In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [2]:
raw_df=pd.read_csv('https://github.com/YBIFoundation/Dataset/raw/main/TelecomCustomerChurn.csv')

In [3]:
raw_df.shape

(7043, 21)

In [4]:
raw_df.head()

Unnamed: 0,customerID,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Monthly,Yes,Manual,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Manual,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Monthly,Yes,Manual,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Monthly,Yes,Manual,70.7,151.65,Yes


In [5]:
raw_df.columns

Index(['customerID', 'Gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'Tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [6]:
raw_df.describe()

Unnamed: 0,SeniorCitizen,Tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [7]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   Gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   Tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [8]:
raw_df['TotalCharges'] = pd.to_numeric(raw_df['TotalCharges'], errors='coerce')

In [9]:
gender_counts = raw_df['Gender'].value_counts()
fig_gender = px.pie(gender_counts, values=gender_counts.values, names=gender_counts.index, title='Gender Distribution')
fig_gender.show()

churn_counts = raw_df['Churn'].value_counts()
fig_churn = px.pie(churn_counts, values=churn_counts.values, names=churn_counts.index, title='Churn Distribution')
fig_churn.show()

26.6 % of customers switched to another firm.

Customers are 49.5 % female and 50.5 % male.

In [10]:
service_subscription = raw_df.groupby(['InternetService', 'Contract']).size().reset_index(name='Count')

# Create the bar chart
fig = px.bar(service_subscription,
             x='InternetService',
             y='Count',
             color='Contract',
             barmode='group',
             title='Service Subscription Distribution by Internet Service and Contract Type')

fig.show()

In [11]:
fig = px.histogram(raw_df, x="Churn", color="Contract", barmode="group", title="<b>Customer contract distribution<b>")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

About 75% of customer with Month-to-Month Contract opted to move out as compared to 13% of customrs with One Year Contract and 3% with Two Year Contract


In [12]:
labels = raw_df['PaymentMethod'].unique()
values = raw_df['PaymentMethod'].value_counts()

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_layout(title_text="<b>Payment Method Distribution</b>")
fig.show()

In [13]:
payment_churn = raw_df.groupby(['PaymentMethod', 'Churn']).size().reset_index(name='Count')

fig = px.bar(
    payment_churn,
    x='PaymentMethod',
    y='Count',
    color='Churn',
    barmode='group',
    title='Customer Churn by Payment Method',
    labels={'PaymentMethod': 'Payment Method', 'Count': 'Number of Customers'}
)

fig.show()

In [14]:
churn_internet_gender = raw_df.groupby(['Churn', 'InternetService', 'Gender']).size().reset_index(name='Count')

fig = px.bar(
    churn_internet_gender,
    x='Churn',
    y='Count',
    color='InternetService',
    facet_col='Gender',
    barmode='group',
    title='Churn Distribution by Internet Service and Gender'
)

fig.show()

In [15]:
churn_dependents = raw_df.groupby(['Churn', 'Dependents']).size().reset_index(name='Count')

fig = px.bar(
    churn_dependents,
    x='Churn',
    y='Count',
    color='Dependents',
    barmode='group',
    title='Churn Distribution by Dependents',
    labels={'Churn': 'Churn Status', 'Count': 'Number of Customers'}
)

fig.show()

Customers without dependents are more likely to churn


In [16]:
churn_partner = raw_df.groupby(['Churn', 'Partner']).size().reset_index(name='Count')

fig = px.bar(
    churn_partner,
    x='Churn',
    y='Count',
    color='Partner',
    barmode='group',
    title='Churn Distribution by Partner Status',
    labels={'Churn': 'Churn Status', 'Count': 'Number of Customers'}
)

fig.show()

Customers that doesn't have partners are more likely to churn


In [17]:
churn_senior = raw_df.groupby(['Churn', 'SeniorCitizen']).size().reset_index(name='Count')

fig = px.bar(
    churn_senior,
    x='Churn',
    y='Count',
    color='SeniorCitizen',
    barmode='group',
    title='Churn Distribution by Senior Citizen Status',
    labels={'Churn': 'Churn Status', 'Count': 'Number of Customers', 'SeniorCitizen': 'Senior Citizen'}
)

fig.show()

In [18]:
raw_df = raw_df.drop('customerID', axis=1)
raw_df

Unnamed: 0,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Monthly,Yes,Manual,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Manual,56.95,1889.50,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Monthly,Yes,Manual,53.85,108.15,Yes
3,Male,0,No,No,45,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Monthly,Yes,Manual,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Manual,84.80,1990.50,No
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7040,Female,0,Yes,Yes,11,No,No,DSL,Yes,No,No,No,No,No,Monthly,Yes,Manual,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Monthly,Yes,Manual,74.40,306.60,Yes


OneHotEncoded

In [19]:
categorical_cols = raw_df.select_dtypes(include=['object']).columns.tolist()
if 'TotalCharges' in categorical_cols:
    categorical_cols.remove('TotalCharges')
onehot_encoded = pd.get_dummies(raw_df[categorical_cols], drop_first=True)
raw_df_dropped = raw_df.drop(categorical_cols, axis=1)
raw_df = pd.concat([raw_df_dropped, onehot_encoded], axis=1)

In [20]:
raw_df.head()

Unnamed: 0,SeniorCitizen,Tenure,MonthlyCharges,TotalCharges,Gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,InternetService_Fiber optic,...,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Manual,Churn_Yes
0,0,1,29.85,29.85,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,True,False,False,True,False,False,...,True,False,False,False,True,False,False,False,True,False
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,True,True
3,0,45,42.3,1840.75,True,False,False,False,False,False,...,True,True,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,False,False,False,True,False,True,...,False,False,False,False,False,False,True,False,True,True


In [21]:
print(raw_df.columns)

Index(['SeniorCitizen', 'Tenure', 'MonthlyCharges', 'TotalCharges',
       'Gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
       'MultipleLines_Yes', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_Yes', 'OnlineBackup_Yes',
       'DeviceProtection_Yes', 'TechSupport_Yes', 'StreamingTV_Yes',
       'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year',
       'PaperlessBilling_Yes', 'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Manual', 'Churn_Yes'],
      dtype='object')


###Model Imports :-

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

###Scaled data :-





In [23]:
scaler=StandardScaler()
raw_df[['MonthlyCharges', 'TotalCharges', 'Tenure', 'SeniorCitizen']] = scaler.fit_transform(raw_df[['MonthlyCharges', 'TotalCharges', 'Tenure', 'SeniorCitizen']])

In [24]:
nan_counts = raw_df.isnull().sum()
print(nan_counts[nan_counts > 0])

TotalCharges    11
dtype: int64


In [25]:
raw_df = raw_df.dropna(subset=['TotalCharges'])

###Train Test Split :-

In [26]:
train,test=train_test_split(raw_df,test_size=0.3,random_state=40)
x_train=train.drop('Churn_Yes',axis=1)
y_train=train['Churn_Yes']
x_test=test.drop('Churn_Yes',axis=1)
y_test=test['Churn_Yes']

###Model :-



In [27]:
#Logistic Regression:-
model=LogisticRegression()
model.fit(x_train,y_train)
Predicted=model.predict(x_test)
print('Accuracy:',metrics.accuracy_score(Predicted,y_test))
print(metrics.classification_report(Predicted,y_test))

Accuracy: 0.795260663507109
              precision    recall  f1-score   support

       False       0.89      0.84      0.86      1631
        True       0.54      0.63      0.58       479

    accuracy                           0.80      2110
   macro avg       0.71      0.74      0.72      2110
weighted avg       0.81      0.80      0.80      2110



In [28]:
#SVM
model=SVC()
model.fit(x_train,y_train)
Predicted=model.predict(x_test)
print('Accuracy:',metrics.accuracy_score(Predicted,y_test))
print(metrics.classification_report(Predicted,y_test))

Accuracy: 0.7928909952606635
              precision    recall  f1-score   support

       False       0.91      0.83      0.87      1708
        True       0.47      0.64      0.54       402

    accuracy                           0.79      2110
   macro avg       0.69      0.74      0.70      2110
weighted avg       0.82      0.79      0.80      2110



In [29]:
#KNN
model=KNeighborsClassifier(n_neighbors=10)
model.fit(x_train,y_train)
Predicted=model.predict(x_test)
print('Accuracy:',metrics.accuracy_score(Predicted,y_test))
print(metrics.classification_report(Predicted,y_test))

Accuracy: 0.7800947867298578
              precision    recall  f1-score   support

       False       0.89      0.83      0.86      1663
        True       0.48      0.60      0.54       447

    accuracy                           0.78      2110
   macro avg       0.68      0.71      0.70      2110
weighted avg       0.80      0.78      0.79      2110



In [30]:
#Decision Tree
model=DecisionTreeClassifier()
model.fit(x_train,y_train)
Predicted=model.predict(x_test)
print('Accuracy:',metrics.accuracy_score(Predicted,y_test))
print(metrics.classification_report(Predicted,y_test))

Accuracy: 0.7350710900473933
              precision    recall  f1-score   support

       False       0.81      0.83      0.82      1528
        True       0.52      0.49      0.51       582

    accuracy                           0.74      2110
   macro avg       0.67      0.66      0.66      2110
weighted avg       0.73      0.74      0.73      2110



In [31]:
#RandomForestClassifier
model=RandomForestClassifier(n_estimators=250)
model.fit(x_train,y_train)
Predicted=model.predict(x_test)
print('Accuracy:',metrics.accuracy_score(Predicted,y_test))
print(metrics.classification_report(Predicted,y_test))

Accuracy: 0.7748815165876777
              precision    recall  f1-score   support

       False       0.88      0.83      0.85      1656
        True       0.48      0.59      0.53       454

    accuracy                           0.77      2110
   macro avg       0.68      0.71      0.69      2110
weighted avg       0.79      0.77      0.78      2110



In [32]:
#XGB
model= xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(x_train, y_train)
Predicted = model.predict(x_test)
print('Accuracy:', metrics.accuracy_score(Predicted, y_test))
print(metrics.classification_report(Predicted, y_test))


Parameters: { "use_label_encoder" } are not used.




Accuracy: 0.7701421800947867
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      1596
           1       0.53      0.57      0.55       514

    accuracy                           0.77      2110
   macro avg       0.69      0.70      0.70      2110
weighted avg       0.78      0.77      0.77      2110

