In [256]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler    
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
import pydotplus
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os 
from sklearn.metrics import f1_score

In [257]:
# Load the churn CSV from the current working directory.
# os.path.join picks the right separator for the host OS; the previous
# hard-coded '\\' only produced a valid path on Windows.
current_dir = os.getcwd()
df = pd.read_csv(os.path.join(current_dir, 'Telco_customer_churn.csv'))
# Work on a copy so the raw frame `df` stays available unchanged.
df_copy = df.copy()
df.head(5)

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


In [258]:
# Display the column labels of the raw DataFrame.
df.columns

Index(['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code',
       'Lat Long', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen',
       'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
       'Multiple Lines', 'Internet Service', 'Online Security',
       'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
       'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
       'Monthly Charges', 'Total Charges', 'Churn Label', 'Churn Value',
       'Churn Score', 'CLTV', 'Churn Reason'],
      dtype='object')

In [259]:
# Columns removed from the working frame before encoding and modelling.
columns_to_drop = [
    'Lat Long', 'Latitude', 'Longitude', 'Count', 'Streaming TV', 'Streaming Movies',
    'Zip Code', 'Churn Reason', 'CustomerID', 'Country', 'State', 'City',
]

# Build df_copy from the untouched raw frame instead of dropping in place:
# DataFrame.drop returns a new frame, so this cell can be re-run safely
# (an in-place drop raises KeyError the second time because the columns are gone).
df_copy = df.drop(columns=columns_to_drop)
df_copy.head()

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,...,Tech Support,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV
0,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,...,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239
1,Female,No,No,Yes,2,Yes,No,Fiber optic,No,No,...,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701
2,Female,No,No,Yes,8,Yes,Yes,Fiber optic,No,No,...,No,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372
3,Female,No,Yes,Yes,28,Yes,Yes,Fiber optic,No,No,...,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003
4,Male,No,No,Yes,49,Yes,Yes,Fiber optic,No,Yes,...,No,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340


In [260]:
# Count missing (NaN) entries per column.
# NOTE: only real NaN values are counted; blank strings such as the " " entries
# that a later cell replaces in 'Total Charges' do not show up here.
df_copy.isna().sum()

Gender               0
Senior Citizen       0
Partner              0
Dependents           0
Tenure Months        0
Phone Service        0
Multiple Lines       0
Internet Service     0
Online Security      0
Online Backup        0
Device Protection    0
Tech Support         0
Contract             0
Paperless Billing    0
Payment Method       0
Monthly Charges      0
Total Charges        0
Churn Label          0
Churn Value          0
Churn Score          0
CLTV                 0
dtype: int64

In [261]:
# Categorical columns to expand into one 0/1 indicator column per category.
categorical_cols = [
    'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Phone Service',
    'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup',
    'Device Protection', 'Tech Support', 'Contract', 'Paperless Billing',
    'Payment Method',
]

# Keep every category level (drop_first=False), add no NaN indicator column,
# and store the indicators as integers.
df_copy = pd.get_dummies(
    df_copy,
    columns=categorical_cols,
    drop_first=False,
    dummy_na=False,
    dtype=int,
)

df_copy.head(1)

Unnamed: 0,Tenure Months,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Gender_Female,Gender_Male,Senior Citizen_No,...,Tech Support_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,2,53.85,108.15,Yes,1,86,3239,0,1,1,...,0,1,0,0,0,1,0,0,0,1


In [262]:
# Clean 'Total Charges' so it can be used as a numeric feature.

# Treat any empty or whitespace-only entry as missing. Matching with a regex
# (instead of the exact string " ") also covers "" and multi-space blanks,
# which would otherwise make the float conversion below fail.
total_charges = df_copy['Total Charges'].replace(r'^\s*$', np.nan, regex=True)

# Convert to float; genuinely malformed values still raise here rather than
# being silently hidden.
total_charges = total_charges.astype(float)

# Fill the missing values with the column mean.
# NOTE(review): the mean is computed over the whole dataset, i.e. before the
# train/test split -- confirm this is acceptable for the evaluation.
df_copy['Total Charges'] = total_charges.fillna(total_charges.mean())

In [263]:
# Display the dtype of every column in the working frame.
df_copy.dtypes

Tenure Months                                 int64
Monthly Charges                             float64
Total Charges                               float64
Churn Label                                  object
Churn Value                                   int64
Churn Score                                   int64
CLTV                                          int64
Gender_Female                                 int64
Gender_Male                                   int64
Senior Citizen_No                             int64
Senior Citizen_Yes                            int64
Partner_No                                    int64
Partner_Yes                                   int64
Dependents_No                                 int64
Dependents_Yes                                int64
Phone Service_No                              int64
Phone Service_Yes                             int64
Multiple Lines_No                             int64
Multiple Lines_No phone service               int64
Multiple Lin

In [264]:
# Columns kept out of the feature set: the ones listed as leakage columns,
# plus the target label itself.
leak_cols = ['Churn Value', 'Churn Score', 'CLTV']
excluded_cols = set(leak_cols) | {'Churn Label'}

# Preserve the frame's column order while filtering out the excluded names.
feature_cols = [name for name in df_copy.columns if name not in excluded_cols]
feature_cols

['Tenure Months',
 'Monthly Charges',
 'Total Charges',
 'Gender_Female',
 'Gender_Male',
 'Senior Citizen_No',
 'Senior Citizen_Yes',
 'Partner_No',
 'Partner_Yes',
 'Dependents_No',
 'Dependents_Yes',
 'Phone Service_No',
 'Phone Service_Yes',
 'Multiple Lines_No',
 'Multiple Lines_No phone service',
 'Multiple Lines_Yes',
 'Internet Service_DSL',
 'Internet Service_Fiber optic',
 'Internet Service_No',
 'Online Security_No',
 'Online Security_No internet service',
 'Online Security_Yes',
 'Online Backup_No',
 'Online Backup_No internet service',
 'Online Backup_Yes',
 'Device Protection_No',
 'Device Protection_No internet service',
 'Device Protection_Yes',
 'Tech Support_No',
 'Tech Support_No internet service',
 'Tech Support_Yes',
 'Contract_Month-to-month',
 'Contract_One year',
 'Contract_Two year',
 'Paperless Billing_No',
 'Paperless Billing_Yes',
 'Payment Method_Bank transfer (automatic)',
 'Payment Method_Credit card (automatic)',
 'Payment Method_Electronic check',
 'Pay

In [265]:

# Feature matrix: selected feature columns, without rows that have missing values.
X = df_copy[feature_cols].dropna()
# Select the labels by X's index so X and y stay row-aligned (and equally long)
# even when dropna() removes rows; taking the full column would make
# train_test_split fail on inconsistent lengths in that case.
y = df_copy.loc[X.index, 'Churn Label']
# Split dataset into training set and test set (70% training, 30% test);
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


In [266]:
# Build a decision tree classifier (depth capped at 5, fixed random_state)
# and fit it on the training split in one step; fit() returns the estimator.
clf = DecisionTreeClassifier(max_depth=5, random_state=1).fit(X_train, y_train)

# Predict the labels of the test split; the bare name displays the array.
y_pred = clf.predict(X_test)
y_pred

array(['No', 'No', 'No', ..., 'No', 'No', 'Yes'],
      shape=(2113,), dtype=object)

In [267]:
# --- Decision Tree Evaluation ---
# Compute all metrics on the test split first ...
dt_accuracy = accuracy_score(y_test, y_pred)
dt_report = classification_report(y_test, y_pred)
dt_confusion = confusion_matrix(y_test, y_pred)
# F1 is reported for the 'Yes' class.
dt_f1_score = f1_score(y_test, y_pred, pos_label='Yes')

# ... then print them: accuracy, per-class report, confusion matrix, F1.
print(f'Decision Tree Accuracy: {dt_accuracy:.2f}')
print(dt_report)
print(dt_confusion)
print(f'Decision Tree F1 Score: {dt_f1_score:.2f}')


Decision Tree Accuracy: 0.79
              precision    recall  f1-score   support

          No       0.86      0.86      0.86      1575
         Yes       0.59      0.58      0.58       538

    accuracy                           0.79      2113
   macro avg       0.72      0.72      0.72      2113
weighted avg       0.79      0.79      0.79      2113

[[1358  217]
 [ 227  311]]
Decision Tree F1 Score: 0.58


In [268]:
from sklearn.tree import export_graphviz
import graphviz

# Export the fitted tree to DOT and render it to a PNG.
# class_names must follow the classifier's own class order (clf.classes_,
# ascending). y.unique() returns labels in order of first appearance, which
# can differ from that order and would attach the wrong class name to nodes.
dot_data = export_graphviz(clf,
                            out_file=None,
                            feature_names=list(X.columns),
                            class_names=[str(label) for label in clf.classes_],
                            filled=True, rounded=True,
                            special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("decision_tree", format="png")  # Saves a PNG


'decision_tree.png'

In [269]:
# KNN ML ALGORITHM
# Feature matrix without rows that have missing values; labels are selected by
# X's index so the two stay row-aligned even if dropna() removes rows.
X = df_copy[feature_cols].dropna()
y = df_copy.loc[X.index, 'Churn Label']
# Split dataset into training set and test set (70% / 30%, fixed seed)
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X, y, test_size=0.3, random_state=1)
# Scale the features using StandardScaler: fit on the training split only,
# then apply the same transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_knn)
X_test_scaled = scaler.transform(X_test_knn)
knn = KNeighborsClassifier(n_neighbors=3)
# Fit on this cell's own labels (y_train_knn) rather than y_train left over
# from the decision-tree cell, so the cell does not depend on that earlier state.
knn.fit(X_train_scaled, y_train_knn)

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [270]:
# Predict labels for the scaled test features with the fitted KNN model;
# the bare name on the last line displays the prediction array.
y_knn_pred = knn.predict(X_test_scaled)
y_knn_pred

array(['Yes', 'Yes', 'No', ..., 'No', 'Yes', 'Yes'],
      shape=(2113,), dtype=object)

In [271]:
# --- KNN Evaluation ---
# Every metric is scored against y_test_knn, the labels from the KNN split.
# Previously accuracy and the confusion matrix used y_test from the
# decision-tree cell, mixing two splits within one evaluation.
knn_accuracy = accuracy_score(y_test_knn, y_knn_pred)
print(f"Accuracy: {knn_accuracy}")

knn_report = classification_report(y_test_knn, y_knn_pred)
print(knn_report)

# F1 is reported for the 'Yes' class.
knn_f1_score = f1_score(y_test_knn, y_knn_pred, pos_label='Yes')
print(f"F1 Score: {knn_f1_score}")

knn_confusion_matrix = confusion_matrix(y_test_knn, y_knn_pred)
print("Confusion Matrix:\n", knn_confusion_matrix)

Accuracy: 0.7501183151916706
              precision    recall  f1-score   support

          No       0.83      0.84      0.83      1575
         Yes       0.51      0.49      0.50       538

    accuracy                           0.75      2113
   macro avg       0.67      0.67      0.67      2113
weighted avg       0.75      0.75      0.75      2113

F1 Score: 0.500945179584121
Confusion Matrix:
 [[1320  255]
 [ 273  265]]
