# 06. SHAP 분석

## 목적
- XGBoost 모델로 고위험 예측
- SHAP으로 변수 중요도 계산

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import shap
import warnings
warnings.filterwarnings('ignore')

# Load the clustering output and the full merged dataset from local CSV files.
result_dir = "/Users/yeong-gwang/Documents/배움 오전 1.38.42/외부/공모전/빅콘테스트/Project/work/ver3_/1012/result/3_가설1분석"
cluster_result = pd.read_csv(f"{result_dir}/클러스터링_결과_완전판.csv")

data_path = "/Users/yeong-gwang/Documents/배움 오전 1.38.42/외부/공모전/빅콘테스트/Project/work/ver3_/1009/빅콘테스트_전체병합데이터_20251008.csv"
df_full = pd.read_csv(data_path)

# Left join keeps every row of cluster_result; the row count grows if df_full
# holds more than one record per merge key.
df = cluster_result.merge(df_full, on='가맹점구분번호', how='left')

# Binary target: rows whose cluster is 0 or 2 get 1, every other row gets 0.
# Vectorised isin() replaces the per-row lambda and yields the same labels.
# NOTE(review): the choice of clusters 0 and 2 as "high risk" comes from the
# upstream clustering step — confirm it still matches the saved result file.
df['is_high_risk'] = df['cluster'].isin([0, 2]).astype(int)

print(f"고위험: {(df['is_high_risk'] == 1).sum():,}개")
print(f"정상: {(df['is_high_risk'] == 0).sum():,}개")

고위험: 88,578개
정상: 825개


## XGBoost 모델 학습

In [2]:
# Select the numeric candidate features that are actually present in df.
features = ['당월_매출_금액', '당월_매출_건수', '배달매출금액 비율', '재방문 고객 비중']
existing_features = [f for f in features if f in df.columns and pd.api.types.is_numeric_dtype(df[f])]

# Fail early with a clear message instead of a confusing error inside fit().
if not existing_features:
    raise ValueError(f"None of the candidate features are numeric columns of df: {features}")

X_raw = df[existing_features]
y = df['is_high_risk']

# Split before imputing so the test rows cannot influence the fill values.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values with medians computed on the training split only.
train_median = X_train.median()
X_train = X_train.fillna(train_median)
X_test = X_test.fillna(train_median)

# Keep a fully imputed X available under its original name.
X = X_raw.fillna(train_median)

model = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, eval_metric='logloss')
model.fit(X_train, y_train)

# Probability of the positive class (is_high_risk == 1) on the held-out split.
y_pred_proba = model.predict_proba(X_test)[:, 1]
print(f"\nROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")


ROC-AUC Score: 0.9655


## SHAP 분석

In [3]:
# Build a tree explainer for the fitted model.
explainer = shap.TreeExplainer(model)

# Explain at most 1000 test rows, drawn with a fixed seed.
sample_size = min(1000, len(X_test))
X_sample = X_test.sample(n=sample_size, random_state=42)
shap_values = explainer.shap_values(X_sample)

# Mean absolute SHAP value per feature.
# assumes shap_values is a 2-D array of shape (n_samples, n_features) — TODO
# confirm for the installed SHAP version.
mean_abs_shap = np.abs(shap_values).mean(axis=0)

shap_importance = pd.DataFrame(
    {
        '변수': X_sample.columns,
        'SHAP_importance': mean_abs_shap,
    }
)
shap_importance = shap_importance.sort_values(by='SHAP_importance', ascending=False)

print("\n[SHAP 변수 중요도]")
print(shap_importance)


[SHAP 변수 중요도]
          변수  SHAP_importance
0   당월_매출_금액         1.006188
1   당월_매출_건수         0.808473
3  재방문 고객 비중         0.542146
2  배달매출금액 비율         0.333572


### 해석
- 평균 절대 SHAP 값(mean |SHAP|)이 클수록 해당 변수가 모델 예측에 미치는 영향이 큼 (영향의 방향은 나타내지 않음)
- 폐업 위험 예측의 핵심 변수 파악