## Import Libraries

In [130]:
import pandas as pd
import numpy as np
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier , StackingClassifier

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score ,  classification_report , confusion_matrix

import warnings
warnings.filterwarnings("ignore")

## Load Dataset

In [131]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


## Explore Dataset

In [132]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [133]:
df.shape

(7043, 21)

In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [135]:
# Finding Missing values

df.isnull().sum()

# no missing values

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [136]:
# drop unwanted columns
df = df.drop("customerID",axis=1)

In [137]:
df.shape

(7043, 20)

## Feature Selection

In [138]:
df = df.replace(" ",np.nan)

In [139]:
target = "Churn"
X = df.drop(target,axis=1)
y = df[target].map({"No":0,"Yes":1})

In [140]:
binary_col = ["Partner","Dependents","PhoneService","Paperlessbilling","Gender","SenioCitizen","MultipleLines","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies"]
numeric_col = ["tenure","MonthlyCharges","TotalCharges"]
categorical_col = [col for col in X.columns if col not in binary_col + numeric_col]

###  Preprocessing: convert binary columns to 0/1

In [141]:
binary_maping = {"Yes":1 , "No":0 , "Female":0 , "Male":1}

for col in binary_col:
    if  col in X.columns:
        X[col] = X[col].map(binary_maping)
        #any reamnaing nan fill with mode
        X[col] = X[col].fillna(X[col].mode()[0])

# numerical col convert to float
    for col in numeric_col:
        X[col] = pd.to_numeric(X[col],errors="coerce")
        X[col] = X[col].fillna(X[col].median())

# categorical fill missing wiht mode
    for col in categorical_col:
        X[col] = X[col].fillna(X[col].mode()[0])




In [142]:
preprocessor = ColumnTransformer(transformers=[
    ("Scaler",StandardScaler(),numeric_col),
    ("Cat",OneHotEncoder(drop="first"),categorical_col)
],remainder="passthrough")

## Train Test Split

In [143]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

### Define Models

In [144]:
base_models = {
    "LogisticRegression":LogisticRegression(max_iter=1000),
    "DecisionTreeClassifier":DecisionTreeClassifier(random_state=42),
    "RandomForestClassifier":RandomForestClassifier(n_estimators=200,random_state=42),
    "GradientBoostingClassifier":GradientBoostingClassifier(n_estimators=200,learning_rate=0.1,random_state=42),
    "XGBoost":xgb.XGBClassifier(use_label_encoder=False,eval_metric="logloss",n_estimator=200,random_state=42),
    "LighGBM":lgb.LGBMClassifier(n_estimators=200,random_state=42),
   "CatBoost":cb.CatBoostClassifier(verbose=0,random_state=42) 
}

### Stacking Ensemble

In [145]:
estimators = [(name,models) for name,models in base_models.items()]
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator = GradientBoostingClassifier(random_state=42),
    passthrough=False,
    cv=5
)

### Pipeline with Stacking

In [146]:
full_pipeline = Pipeline([
    ("preprocessor",preprocessor),
    ("stack_model",stack_model)
])

print("Training stacking pipeline with all models...")
full_pipeline.fit(X_train,y_train)

Training stacking pipeline with all models...
[LightGBM] [Info] Number of positive: 1496, number of negative: 4138
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 624
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265531 -> initscore=-1.017418
[LightGBM] [Info] Start training from score -1.017418
[LightGBM] [Info] Number of positive: 1197, number of negative: 3310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000476 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 624
[LightGBM] [Info] Number of data points in the train set: 4507, number o

0,1,2
,steps,"[('preprocessor', ...), ('stack_model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('Scaler', ...), ('Cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimators,"[('LogisticRegression', ...), ('DecisionTreeClassifier', ...), ...]"
,final_estimator,GradientBoost...ndom_state=42)
,cv,5
,stack_method,'auto'
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,200
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,200
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


### Evaluate all Model

In [149]:
results = {}

print("\n Training and evaluate all base models individually....")
for name , model in base_models.items():
    pipe = Pipeline([
        ("preprocessor",preprocessor),
        ("model",model)
    ])
    
    pipe.fit(X_train,y_train)
    y_pred = pipe.predict(X_test)

    results[name] = {
        "accuracy":accuracy_score(y_test,y_pred),
        "Precision":precision_score(y_test,y_pred,pos_label=1),
        "Recall":recall_score(y_test,y_pred,pos_label=1),
        "F1-Score":f1_score(y_test,y_pred,pos_label=1)
    }

results_df = pd.DataFrame(results).T.sort_values(by="F1-Score",ascending=False)
print("\nBase Models Performance")
print(results_df)


 Training and evaluate all base models individually....
[LightGBM] [Info] Number of positive: 1496, number of negative: 4138
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000545 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 624
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265531 -> initscore=-1.017418
[LightGBM] [Info] Start training from score -1.017418

Base Models Performance
                            accuracy  Precision    Recall  F1-Score
LogisticRegression          0.823279   0.691358  0.600536  0.642755
GradientBoostingClassifier  0.814053   0.685619  0.549598  0.610119
CatBoost                    0.804826   0.666667  0.525469  0.587706
LighGBM                     0.794890   0.642857  0.506702  0.566717
RandomForestClassifier  

In [150]:
joblib.dump(full_pipeline,"pipeline.pkl")
print("Full Stacking Pipeline saved as pipeline.pkl")

Full Stacking Pipeline saved as pipeline.pkl
