In [3]:
# https://www.kaggle.com/datasets/sandiledesmondmfazi/bank-customer-churn

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the raw bank-churn dataset from the local data directory.
df = pd.read_csv("./data/bank_churn_data.csv")

# Columns that are not wanted as model inputs.
drop_cols = [
    "Surname",
    "First Name",
    "Date of Birth",
    "Address",
    "Churn Reason",
    "zip_code",
]

# Drop only the listed columns that are actually present, so a missing
# column does not raise a KeyError.
present_drop_cols = [name for name in drop_cols if name in df.columns]
df = df.drop(columns=present_drop_cols)

# Integer-encode every object-dtype column. One fitted encoder is kept per
# column so the codes can be mapped back with `inverse_transform` later.
categorical_cols = df.select_dtypes(include="object").columns
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Inspect the result: schema summary first, then the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115640 entries, 0 to 115639
Data columns (total 16 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Gender                           115640 non-null  int64  
 1   Marital Status                   115640 non-null  int64  
 2   Number of Dependents             115640 non-null  int64  
 3   Occupation                       115640 non-null  int64  
 4   Income                           115640 non-null  float64
 5   Education Level                  115640 non-null  int64  
 6   Customer Tenure                  115640 non-null  int64  
 7   Customer Segment                 115640 non-null  int64  
 8   Preferred Communication Channel  115640 non-null  int64  
 9   Credit Score                     115640 non-null  int64  
 10  Credit History Length            115640 non-null  int64  
 11  Outstanding Loans                115640 non-null  float64
 12  Ch

Unnamed: 0,Gender,Marital Status,Number of Dependents,Occupation,Income,Education Level,Customer Tenure,Customer Segment,Preferred Communication Channel,Credit Score,Credit History Length,Outstanding Loans,Churn Flag,Balance,NumOfProducts,NumComplaints
0,1,0,3,318,77710.14,2,30,1,1,397,24,41959.74,0,211359.05,1,0
1,0,1,1,84,58209.87,2,27,2,0,665,10,8916.67,0,30624.76,4,1
2,0,2,1,597,9794.01,2,14,1,0,715,21,43270.54,0,111956.61,2,6
3,0,0,5,26,15088.98,2,23,0,1,747,17,17887.65,0,201187.61,1,0
4,0,0,2,585,60726.56,3,22,0,0,549,25,32686.84,0,60391.24,5,6


In [5]:
# Pairwise Pearson correlation across the numeric columns of `df`.
corr_matrix = df.corr(numeric_only=True)

# Keep only each column's correlation with the 'Churn Flag' target, drop the
# target's self-correlation, and order from most positive to most negative.
churn_corr = (
    corr_matrix.loc[:, "Churn Flag"]
    .drop(labels="Churn Flag")
    .sort_values(ascending=False)
)

# Show the ranked correlations.
print(churn_corr)

NumComplaints                      0.204626
Occupation                         0.004725
Number of Dependents               0.003109
Credit History Length              0.002899
Customer Segment                   0.002480
Income                             0.002286
Gender                             0.001280
Education Level                    0.000956
Customer Tenure                    0.000344
Preferred Communication Channel    0.000069
Outstanding Loans                 -0.001146
Marital Status                    -0.003774
NumOfProducts                     -0.179083
Credit Score                      -0.182802
Balance                           -0.499981
Name: Churn Flag, dtype: float64


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the frame into the churn label (y) and the predictor columns (X).
y = df["Churn Flag"]
X = df.drop(columns=["Churn Flag"])

# Hold out 20% of the rows for testing, stratified on the label, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training rows only
# and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic-regression classifier on the scaled training data.
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Predict on the held-out rows and print per-class precision/recall/F1.
# NOTE(review): the recorded run shows 1.00 on every metric -- worth checking
# the features for target leakage (TODO confirm).
y_pred = model.predict(X_test_scaled)
print("📊 Logistic Regression Classification Report:\n")
print(classification_report(y_test, y_pred))

📊 Logistic Regression Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20309
           1       1.00      1.00      1.00      2819

    accuracy                           1.00     23128
   macro avg       1.00      1.00      1.00     23128
weighted avg       1.00      1.00      1.00     23128



In [7]:
from sklearn.ensemble import RandomForestClassifier

# Rebuild the label (y) and predictor matrix (X) from `df`.
y = df["Churn Flag"]
X = df.drop(columns=["Churn Flag"])

# Stratified 80/20 train/test split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Standardise the features using statistics from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a random-forest classifier with default hyper-parameters.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predict on the held-out rows and print the classification report.
y_pred = rf_model.predict(X_test_scaled)
print("📊 RandomForest Classification Report:\n")
print(classification_report(y_test, y_pred))

📊 RandomForest Classification Report:

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     20309
           1       0.98      0.91      0.94      2819

    accuracy                           0.99     23128
   macro avg       0.98      0.95      0.97     23128
weighted avg       0.99      0.99      0.99     23128



In [8]:
from xgboost import XGBClassifier

# Split the frame into the predictor columns (X) and the churn label (y).
X = df.drop(columns=["Churn Flag"])
y = df["Churn Flag"]

# Stratified 80/20 train/test split with a fixed seed so the split is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise the features. The scaler is fitted on the training rows only
# and then applied to both partitions.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the XGBoost classifier.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and warns 'Parameters: { "use_label_encoder" } are not used.',
# as seen in this cell's recorded output.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Predict on the held-out rows and print the classification report.
y_pred = xgb_model.predict(X_test_scaled)
print("📊 XGBoost Classification Report:\n")
print(classification_report(y_test, y_pred))

📊 XGBoost Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20309
           1       0.98      0.98      0.98      2819

    accuracy                           0.99     23128
   macro avg       0.99      0.99      0.99     23128
weighted avg       0.99      0.99      0.99     23128



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
