# 1. import data

In [1]:
import warnings
# Install a global "ignore" filter: no message, category or module is given, so
# every warning raised after this point is suppressed for the rest of the session.
# NOTE(review): a blanket filter also hides deprecation / chained-assignment
# warnings that later cells may raise — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

## 1-3 Read data:

In [3]:
# Read the raw churn table from a CSV resolved against the current working
# directory, then preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer='customer_churn_data.csv')
dataset.iloc[:5]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


0   customerID: 客戶的唯一識別碼
1   gender: 男性或女性            
2   SeniorCitizen: 客戶是否是老年人 (1, 0)     
3   Partner: 客戶是否有伴侶 (Yes, No)
4   Dependents: 客戶是否有家屬 (Yes, No)      
5   tenure: 客戶停留的月數             
6   PhoneService: 客戶是否有電話服務 (Yes, No)      
7   MultipleLines: 客戶是否有多條線路 (Yes, No, No phone service)     
8   InternetService: 客戶的互聯網服務提供商 (DSL, Fiber optic, No)      
9   OnlineSecurity: 客戶是否有 online security (Yes, No, No internet service)    
10  OnlineBackup: 客戶是否有在線備份 (Yes, No, No internet service)       
11  DeviceProtection: 客戶是否有設備保護 (Yes, No, No internet service)       
12  TechSupport: 客戶是否有技術支持 (Yes, No, No internet service)       
13  StreamingTV: 客戶是否有 streaming TV (Yes, No, No internet service)       
14  StreamingMovies: 客戶是否有 streaming movies (Yes, No, No internet service)      
15  Contract: 客戶的合約期限 (Month-to-month, One year, Two year)         
16  PaperlessBilling: 客戶是否有無紙化計費 (Yes, No)      
17  PaymentMethod: 客戶的付款方式 (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))   
18  MonthlyCharges: 每月向客戶收取的金額     
19  TotalCharges: 向客戶收取的總金額   
20  Churn: 客戶是否流失 (Yes or No)        

In [4]:
# Best-effort console clear when running under IPython/Jupyter.
#
# The previous version also ran `%reset -f`, which removes every user-defined
# name — including the `dataset` frame loaded in the cell above — so the
# preprocessing cells that follow could not run in a top-to-bottom execution.
# It also went through the deprecated `InteractiveShell.magic()` API behind a
# bare `except:`, which hid any failure. The reset is dropped, the supported
# `run_line_magic` API is used, and only the expected failure modes are handled.
try:
    from IPython import get_ipython
    ipython_shell = get_ipython()  # None when not running inside an IPython shell
except ImportError:
    ipython_shell = None  # plain Python interpreter: nothing to clear

if ipython_shell is not None:
    try:
        ipython_shell.run_line_magic('clear', '')
    except Exception as exc:  # clearing the screen is cosmetic; never abort the notebook for it
        print(f"Skipping %clear: {exc}")

In [5]:
from __future__ import annotations

from dataclasses import dataclass
from typing import List, Tuple

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier

from sklearn.metrics import roc_auc_score, f1_score, classification_report, confusion_matrix




In [6]:
# Treat SeniorCitizen (0/1) as a categorical value rather than a number, and parse
# TotalCharges as numeric; entries that cannot be parsed become NaN.
dataset = dataset.assign(
    SeniorCitizen=lambda frame: frame['SeniorCitizen'].astype(object),
    TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'),
)
dataset.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
# Number of missing values in each column.
dataset.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [8]:
# Remove rows with missing values: they are so few that dropping them outright is acceptable.
# Alternative if imputation is preferred: look up the plan each customer is on
# and fill the gap with that plan's average.
dataset = dataset.dropna()

In [9]:
# Remove the customer identifier by name (not by position) so the step does not
# depend on column order; `drop` returns a new frame, so the assignments below
# write to `df` itself rather than to a slice of `dataset`.
df = dataset.drop(columns=['customerID'])

# Encode the target as a binary numeric variable (Yes -> 1, No -> 0).
# Direct column assignment replaces the former chained
# `df['Churn'].replace(..., inplace=True)` calls, which operate on a temporary
# Series and leave `df` unchanged when pandas Copy-on-Write is active.
churn_codes = {'Yes': 1, 'No': 0}
df['Churn'] = df['Churn'].map(churn_codes)
# `map` turns any unexpected label into NaN; fail loudly instead of training on it.
assert df['Churn'].notna().all(), "Churn contains labels other than 'Yes'/'No'"
df['Churn'] = df['Churn'].astype(int)

# One-hot encode every remaining categorical (object-typed) column.
df_dummies = pd.get_dummies(df)
df_dummies.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,29.85,29.85,0,True,False,True,False,False,True,...,False,True,False,False,False,True,False,False,True,False
1,34,56.95,1889.5,0,False,True,True,False,True,False,...,False,False,True,False,True,False,False,False,False,True
2,2,53.85,108.15,1,False,True,True,False,True,False,...,False,True,False,False,False,True,False,False,False,True
3,45,42.3,1840.75,0,False,True,True,False,True,False,...,False,False,True,False,True,False,True,False,False,False
4,2,70.7,151.65,1,True,False,True,False,True,False,...,False,True,False,False,False,True,False,False,True,False


In [10]:
# List the columns kept in `df` (the frame built without the customer identifier).
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [11]:

# Separate the encoded predictors from the binary churn label.
y = df_dummies["Churn"]
X = df_dummies.drop("Churn", axis=1)

In [12]:
# Split the data into training and test sets: 20% is held out, the split is
# seeded for repeatability and stratified on the label.
split_options = dict(test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [17]:
# Define the base (level-0) models as a list of (name, estimator) pairs.
base_models = list(
    {
        'lr': LogisticRegression(max_iter=2000),
        'rf': RandomForestClassifier(n_estimators=300, random_state=42),
        'gb': GradientBoostingClassifier(random_state=42),
    }.items()
)



In [18]:
# Final (level-1) estimator passed to the stacking ensemble.
meta_model = LogisticRegression().set_params(max_iter=2000)

In [13]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

In [19]:
# Build the stacked ensemble: the base models' predict_proba outputs, produced
# with 5-fold cross-validation, feed the final estimator; all cores are used.
stacking_options = {
    "estimators": base_models,
    "final_estimator": meta_model,
    "stack_method": "predict_proba",
    "cv": 5,
    "n_jobs": -1,
}
stack_model = StackingClassifier(**stacking_options)
stack_model.fit(X_train, y_train)

# Keep the second probability column (index 1) and score it with ROC-AUC.
test_scores = stack_model.predict_proba(X_test)
pred_proba = test_scores[:, 1]
auc = roc_auc_score(y_test, pred_proba)

print("✅ ROC-AUC:", auc)

✅ ROC-AUC: 0.8543207831403264


In [20]:
from sklearn.metrics import confusion_matrix, classification_report

# Class predictions from the stacked model on the held-out split.
y_pred = stack_model.predict(X_test)

# Print the confusion matrix first, then the per-class precision/recall/F1 table.
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)


[[941  92]
 [171 203]]
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1033
           1       0.69      0.54      0.61       374

    accuracy                           0.81      1407
   macro avg       0.77      0.73      0.74      1407
weighted avg       0.80      0.81      0.81      1407



In [21]:
import numpy as np
from sklearn.metrics import f1_score

# Second probability column (index 1) for every held-out row.
proba = stack_model.predict_proba(X_test)[:, 1]

# Print the F1 score obtained at several decision thresholds up to 0.5.
# NOTE(review): the sweep is evaluated on the test split, so picking a threshold
# from this table would tune on test data — confirm that is intended.
for t in (0.3, 0.35, 0.4, 0.45, 0.5):
    pred = np.where(proba >= t, 1, 0)
    f1 = f1_score(y_test, pred)
    print(f"threshold={t:.2f}  F1={f1:.4f}")


threshold=0.30  F1=0.6410
threshold=0.35  F1=0.6301
threshold=0.40  F1=0.6343
threshold=0.45  F1=0.6270
threshold=0.50  F1=0.6069


In [22]:
from sklearn.metrics import roc_auc_score

# Fit every base model on the full training split and print its test ROC-AUC,
# followed by the stacked model's ROC-AUC for comparison.
for name, model in base_models:
    p = model.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    print(name, roc_auc_score(y_test, p))

p_stack = stack_model.predict_proba(X_test)[:, 1]
print("stack", roc_auc_score(y_test, p_stack))


lr 0.8514373275491663
rf 0.8229237307877476
gb 0.8531249514678705
stack 0.8543207831403264


In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

# Hand-written stacking: out-of-fold predictions of each base model become the
# training features of a logistic-regression meta model.
n_folds = 5
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

# One column per base model: out-of-fold probabilities for the training rows,
# and fold-averaged probabilities for the test rows.
base_model_train_pred = np.zeros((X_train.shape[0], len(base_models)))
base_model_test_pred  = np.zeros((X_test.shape[0],  len(base_models)))


for i, (name, model) in enumerate(base_models):
    for train_idx, valid_idx in kf.split(X_train, y_train):
        X_tr = X_train.iloc[train_idx]
        y_tr = y_train.iloc[train_idx]
        X_va = X_train.iloc[valid_idx]

        # Fit an unfitted copy per fold so the estimators stored in `base_models`
        # are not refitted in place (they would otherwise end up holding a
        # last-fold fit on a subset of the training data).
        fold_model = clone(model)
        fold_model.fit(X_tr, y_tr)

        # Each training row is predicted only by the fold model that did not see it.
        base_model_train_pred[valid_idx, i] = fold_model.predict_proba(X_va)[:, 1]
        # Test rows are predicted by every fold model; accumulate the mean.
        base_model_test_pred[:, i] += fold_model.predict_proba(X_test)[:, 1] / n_folds

# meta model
meta_model = LogisticRegression(max_iter=2000)
meta_model.fit(base_model_train_pred, y_train)

test_pred = meta_model.predict_proba(base_model_test_pred)[:, 1]
print("Meta ROC-AUC:", roc_auc_score(y_test, test_pred))


Meta ROC-AUC: 0.8537228673040985
