In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/processed/churn_eda_ready.csv")
df.shape


(7032, 22)

In [2]:
df = df.drop(columns=['customerID'])

**encoding target varieable/churn**

In [3]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [4]:
df_encoded = pd.get_dummies(df, drop_first=True)
df_encoded.shape

(7032, 35)

In [5]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,0-1 year
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,...,No,No,No,One year,No,Mailed check,56.95,1889.5,0,2-4 years
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1,0-1 year
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0,2-4 years
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1,0-1 year


In [6]:
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

**Train-Test Split**

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [8]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Model training**

In [9]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


**Model evaluation**

In [10]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7938877043354655

~80% accuracy

In [11]:
#Detailed Metrics

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.64      0.52      0.57       374

    accuracy                           0.79      1407
   macro avg       0.74      0.71      0.72      1407
weighted avg       0.78      0.79      0.79      1407



**Feature Importance**

In [12]:
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'coefficient': model.coef_[0]
}).sort_values(by='coefficient', ascending=False)

feature_importance.head(10)

Unnamed: 0,feature,coefficient
10,InternetService_Fiber optic,0.72102
33,tenure_group_5+ years,0.314501
3,TotalCharges,0.262182
21,StreamingTV_Yes,0.259442
23,StreamingMovies_Yes,0.241299
9,MultipleLines_Yes,0.233993
32,tenure_group_4-5 years,0.201437
28,PaymentMethod_Electronic check,0.176163
26,PaperlessBilling_Yes,0.14394
17,DeviceProtection_Yes,0.083843


**Interpretation of Model Results**
- Month-to-month contracts → positive coefficient → higher churn risk
- Long tenure → negative coefficient → lower churn
- Electronic check payment → higher churn likelihood
 - Higher monthly charges → increased churn probability