### Model Training

In [None]:
### Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report, roc_auc_score
)

In [4]:
df = pd.read_csv('data/churn_prediction.csv')

In [5]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Preparing X and Y variables

In [6]:
X = df.drop(columns=['Churn'])



In [7]:
X.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [8]:
y = df['Churn'].map({'Yes':1, 'No':0})

In [9]:
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

In [10]:
# Column types
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

In [11]:
# Transformers
numeric_transformer = StandardScaler()
from sklearn.preprocessing import OneHotEncoder

# Correct usage in scikit-learn 1.2+
oh_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['Churn'])  # 'No' -> 0, 'Yes' -> 1

preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features)
    ]
)


In [12]:
X = preprocessor.fit_transform(X)

In [13]:
X.shape

(7043, 13618)

In [14]:
# Assuming 'Churn' is your target
feature_cols = [col for col in df.columns if col != 'Churn']

X = df[feature_cols]
y = df['Churn']


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape


((5634, 20), (1409, 20))

### Create an Evaluate Function to give all metrics after model Training

In [29]:


# --- Convert target to numeric 0/1 ---
y_num = y.map({'No':0, 'Yes':1})

# --- Split dataset ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y_num, test_size=0.2, random_state=42, stratify=y_num
)

# --- Define models ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss')  # removed use_label_encoder
}


# --- Function to evaluate metrics ---
def evaluate_classification(y_true, y_pred, y_proba=None):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label=1)
    recall = recall_score(y_true, y_pred, pos_label=1)
    f1 = f1_score(y_true, y_pred, pos_label=1)
    
    roc_auc = None
    if y_proba is not None:
        # Ensure y_true is numeric
        if isinstance(y_true, np.ndarray):
            y_true_num = y_true
        else:
            y_true_num = np.where(y_true==1, 1, 0)  # already numeric 0/1
        roc_auc = roc_auc_score(y_true_num, y_proba)
    
    return accuracy, precision, recall, f1, roc_auc

# --- Loop through models ---
for name, model in models.items():
    # Create pipeline
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Train
    pipe.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = pipe.predict(X_train)
    y_test_pred = pipe.predict(X_test)
    
    # Probabilities for ROC-AUC
    if hasattr(pipe, "predict_proba"):
        y_train_proba = pipe.predict_proba(X_train)[:, 1]
        y_test_proba = pipe.predict_proba(X_test)[:, 1]
    else:
        y_train_proba = y_test_proba = None
    
    # Evaluate metrics
    train_acc, train_prec, train_rec, train_f1, train_roc = evaluate_classification(
        y_train, y_train_pred, y_train_proba
    )
    test_acc, test_prec, test_rec, test_f1, test_roc = evaluate_classification(
        y_test, y_test_pred, y_test_proba
    )
    
    # --- Print structured output ---
    print(f"{name}")
    print("Model performance for Training set")
    print(f"- Accuracy : {train_acc:.4f}")
    print(f"- Precision: {train_prec:.4f}")
    print(f"- Recall   : {train_rec:.4f}")
    print(f"- F1 Score : {train_f1:.4f}")
    if train_roc is not None:
        print(f"- ROC AUC  : {train_roc:.4f}")
    print("----------------------------------")
    
    print("Model performance for Test set")
    print(f"- Accuracy : {test_acc:.4f}")
    print(f"- Precision: {test_prec:.4f}")
    print(f"- Recall   : {test_rec:.4f}")
    print(f"- F1 Score : {test_f1:.4f}")
    if test_roc is not None:
        print(f"- ROC AUC  : {test_roc:.4f}")
    print("="*35)
    print("\n")


Logistic Regression
Model performance for Training set
- Accuracy : 0.9356
- Precision: 0.9528
- Recall   : 0.7967
- F1 Score : 0.8678
- ROC AUC  : 0.9743
----------------------------------
Model performance for Test set
- Accuracy : 0.7963
- Precision: 0.6347
- Recall   : 0.5481
- F1 Score : 0.5882
- ROC AUC  : 0.8403


Random Forest
Model performance for Training set
- Accuracy : 1.0000
- Precision: 1.0000
- Recall   : 1.0000
- F1 Score : 1.0000
- ROC AUC  : 1.0000
----------------------------------
Model performance for Test set
- Accuracy : 0.7949
- Precision: 0.6545
- Recall   : 0.4813
- F1 Score : 0.5547
- ROC AUC  : 0.8246


XGBoost
Model performance for Training set
- Accuracy : 0.9288
- Precision: 0.8902
- Recall   : 0.8348
- F1 Score : 0.8616
- ROC AUC  : 0.9786
----------------------------------
Model performance for Test set
- Accuracy : 0.7835
- Precision: 0.6117
- Recall   : 0.5053
- F1 Score : 0.5534
- ROC AUC  : 0.8208




In [30]:
LogisticRegression(
    max_iter=1000,
    class_weight="balanced"
)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000
