In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

## 1. Load Data

In [2]:
df = pd.read_excel("../data/raw/Telco_customer_churn.xlsx")
df.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


## 2. Preprocessing
We will apply the cleaning steps discovered during EDA

In [3]:
# 1. Fix Total Charges
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
df['Total Charges'] = df['Total Charges'].fillna(0)

# 2. Drop Unnecessary Columns
drop_cols = ['CustomerID', 'Count', 'Country', 'State', 'City', 
             'Zip Code', 'Lat Long', 'Latitude', 'Longitude', 
             'Churn Label', 'Churn Score', 'CLTV', 'Churn Reason']

df_clean = df.drop(columns=drop_cols)

# 3. One-Hot Encoding
df_model = pd.get_dummies(df_clean, drop_first=True)

print(f"Original Shape: {df.shape}")
print(f"Model Data Shape: {df_model.shape}")

Original Shape: (7043, 33)
Model Data Shape: (7043, 31)


## 3. Train-Test Split

In [4]:
X = df_model.drop('Churn Value', axis=1)
y = df_model['Churn Value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

Training samples: 5634
Testing samples: 1409


## 4. Baseline Model: Logistic Regression

In [5]:
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

y_pred = log_model.predict(X_test)

print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy Score: 0.8019872249822569

Confusion Matrix:
 [[900 109]
 [170 230]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.87      1009
           1       0.68      0.57      0.62       400

    accuracy                           0.80      1409
   macro avg       0.76      0.73      0.74      1409
weighted avg       0.79      0.80      0.80      1409

