In [None]:
# https://www.kaggle.com/datasets/sakshigoyal7/credit-card-customers/code?datasetId=982921&sortBy=voteCount

In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the raw data and, in one chain:
#   * drop the customer identifier and the precomputed Naive Bayes prediction columns,
#   * add the binary target Churn (Attrited Customer = 1, Existing Customer = 0),
#   * drop the original Attrition_Flag text column.
df = (
    pd.read_csv("./data/BankChurners.csv")
    .drop(columns=["CLIENTNUM"])
    .loc[:, lambda frame: ~frame.columns.str.startswith("Naive_Bayes_")]
    .assign(
        Churn=lambda frame: frame["Attrition_Flag"]
        .eq("Attrited Customer")
        .astype("int64")
    )
    .drop(columns=["Attrition_Flag"])
)

# Integer-encode every remaining object-dtype column, keeping each fitted
# encoder (keyed by column name) so the codes can be mapped back to labels.
categorical_cols = df.select_dtypes(include="object").columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Inspect the result.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Customer_Age              10127 non-null  int64  
 1   Gender                    10127 non-null  int64  
 2   Dependent_count           10127 non-null  int64  
 3   Education_Level           10127 non-null  int64  
 4   Marital_Status            10127 non-null  int64  
 5   Income_Category           10127 non-null  int64  
 6   Card_Category             10127 non-null  int64  
 7   Months_on_book            10127 non-null  int64  
 8   Total_Relationship_Count  10127 non-null  int64  
 9   Months_Inactive_12_mon    10127 non-null  int64  
 10  Contacts_Count_12_mon     10127 non-null  int64  
 11  Credit_Limit              10127 non-null  float64
 12  Total_Revolving_Bal       10127 non-null  int64  
 13  Avg_Open_To_Buy           10127 non-null  float64
 14  Total_

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Churn
0,45,1,3,3,1,2,0,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,0
1,49,0,5,2,2,4,0,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,0
2,51,1,3,2,1,3,0,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,0
3,40,0,4,3,3,4,0,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0
4,40,1,3,5,1,2,0,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0,0


In [10]:
# Pearson correlation between every pair of numeric columns.
corr_matrix = df.corr(method="pearson", numeric_only=True)

# Keep only the 'Churn' column, leave out its self-correlation row, and
# order the features from most positive to most negative correlation.
feature_rows = corr_matrix.index != "Churn"
churn_corr = corr_matrix.loc[feature_rows, "Churn"].sort_values(ascending=False)

# Show the ranking.
print(churn_corr)

Contacts_Count_12_mon       0.204491
Months_Inactive_12_mon      0.152449
Dependent_count             0.018991
Marital_Status              0.018597
Customer_Age                0.018203
Income_Category             0.017584
Months_on_book              0.013687
Education_Level             0.005551
Avg_Open_To_Buy            -0.000285
Card_Category              -0.006038
Credit_Limit               -0.023873
Gender                     -0.037272
Total_Amt_Chng_Q4_Q1       -0.131063
Total_Relationship_Count   -0.150005
Total_Trans_Amt            -0.168598
Avg_Utilization_Ratio      -0.178410
Total_Revolving_Bal        -0.263053
Total_Ct_Chng_Q4_Q1        -0.290054
Total_Trans_Ct             -0.371403
Name: Churn, dtype: float64


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# StandardScaler is used below; without this import the cell raises NameError
# on a fresh kernel, because the only other import lives in a later cell.
from sklearn.preprocessing import StandardScaler

# Split the frame into the feature matrix (X) and the target (y).
X = df.drop(columns=["Churn"])
y = df["Churn"]

# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same churn ratio, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the logistic regression model.
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Predict on the held-out split and report per-class metrics.
y_pred = model.predict(X_test_scaled)
print("📊 Logistic Regression Classification Report:\n")
print(classification_report(y_test, y_pred))

📊 Logistic Regression Classification Report:

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      1701
           1       0.79      0.53      0.63       325

    accuracy                           0.90      2026
   macro avg       0.85      0.75      0.79      2026
weighted avg       0.90      0.90      0.89      2026



In [None]:

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


# Split the frame into the feature matrix (X) and the target (y).
X = df.drop(columns="Churn")
y = df["Churn"]

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training split only, then apply
# them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the random forest on the scaled training data.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict on the held-out split and report per-class metrics.
y_pred = rf_model.predict(X_test_scaled)
print("📊 RandomForest Classification Report:\n")
print(classification_report(y_test, y_pred))

📊 RandomForest Classification Report:

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1701
           1       0.94      0.80      0.86       325

    accuracy                           0.96      2026
   macro avg       0.95      0.89      0.92      2026
weighted avg       0.96      0.96      0.96      2026



In [12]:
from xgboost import XGBClassifier

# Split the frame into the feature matrix (X) and the target (y).
X = df.drop(columns=["Churn"])
y = df["Churn"]

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Standardize the features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the XGBoost model.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter and emits a warning on every fit.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Predict on the held-out split and report per-class metrics.
y_pred = xgb_model.predict(X_test_scaled)
print("📊 XGBoost Classification Report:\n")
print(classification_report(y_test, y_pred))

📊 XGBoost Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1701
           1       0.92      0.86      0.89       325

    accuracy                           0.97      2026
   macro avg       0.95      0.92      0.94      2026
weighted avg       0.97      0.97      0.97      2026



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
