In [1]:
import pandas as pd
import numpy as np

In [2]:

# Read the CommerceData CSV; the path is relative to the working directory.
raw_csv_path = 'data/CommerceData.csv'
commerce_df = pd.read_csv(raw_csv_path)

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the frame loaded from the CSV is left untouched.
df = commerce_df.copy()

# 1. Missing-value handling
# Columns whose gaps are filled with the column's own median.
# For DaySinceLastOrder an alternative was considered (fill with max + 1 and
# add a NeverOrdered flag so a model could tell those rows apart); the plain
# median fill is what is actually applied here.
# NOTE(review): these medians are computed on the full dataset, i.e. before
# the train/test split that happens later in the notebook.
median_fill_cols = ['Tenure', 'WarehouseToHome', 'HourSpendOnApp', 'DaySinceLastOrder']
for col in median_fill_cols:
    df[col] = df[col].fillna(df[col].median())

# Record which rows had no OrderAmountHikeFromlastYear value (1 = missing)
# before that gap is overwritten with 0 below.
hike_col = 'OrderAmountHikeFromlastYear'
df['NoLastYearPurchase'] = df[hike_col].isna().astype(int)

# Missing values in these columns are replaced with zero.
for col in (hike_col, 'CouponUsed', 'OrderCount'):
    df[col] = df[col].fillna(0)

# 2. Categorical encoding: one-hot encode, dropping the first level of each
# column (drop_first=True).
cat_cols = ['PreferredLoginDevice', 'PreferredPaymentMode', 'Gender',
            'PreferedOrderCat', 'MaritalStatus']
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# 3. Remove the identifier column.
df = df.drop(columns=['CustomerID'])


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Separate target and features, then split BEFORE scaling so the scaler is
#    fitted on training rows only.
y = df['Churn']
X = df.drop(columns='Churn')

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,          # keep the Churn class ratio the same in both splits
    random_state=42,
)

# 2. Column names that must not be scaled.
#    Most of these are the pre-encoding categorical names; Index.difference
#    simply ignores any label that is not present in the candidate columns.
exclude = ['CityTier', 'PreferredPaymentMode', 'Gender',
           'PreferedOrderCat', 'MaritalStatus', 'PreferredLoginDevice']

# Candidates are the float64/int64 columns; drop the excluded names from them.
numeric_candidates = X.select_dtypes(include=['float64', 'int64']).columns
num_cols = numeric_candidates.difference(exclude)

# 3. Learn mean/std on the training split, then apply the same transform to
#    both splits.
scaler = StandardScaler()
scaler.fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Hyper-parameter search space for the SVM classifier.
# `gamma` is a kernel coefficient for the 'rbf'/'poly' kernels and has no
# effect on the linear kernel, so the linear kernel gets its own sub-grid.
# With a single crossed grid every linear candidate would be fitted twice
# (once per gamma value) with identical results.
# Consequence: if a linear candidate wins, best_params_ has no 'gamma' key.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf', 'poly'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

# Exhaustive 5-fold cross-validated search. No `scoring` is passed, so the
# estimator's default score is used. n_jobs=-1 runs the independent fits in
# parallel on all available cores; it does not change the scores.
# NOTE(review): the target looks imbalanced; consider an explicit `scoring`
# (e.g. 'f1') if the minority class matters most.
grid = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

# GridSearchCV refits the best configuration on the whole training split.
best_model = grid.best_estimator_
print("최적 하이퍼파라미터:", grid.best_params_)

# Predictions on the held-out test split, used for evaluation afterwards.
y_pred = best_model.predict(X_test)

최적 하이퍼파라미터: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [6]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the test-split predictions: overall accuracy, per-class
# precision/recall/F1, and the confusion matrix.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("정확도:", test_accuracy)
print("분류 리포트:\n", test_report)
print("혼동 행렬:\n", test_confusion)


정확도: 0.9591474245115453
분류 리포트:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98       936
           1       0.90      0.85      0.88       190

    accuracy                           0.96      1126
   macro avg       0.94      0.91      0.93      1126
weighted avg       0.96      0.96      0.96      1126

혼동 행렬:
 [[919  17]
 [ 29 161]]
