In [1]:
!pip install pandas numpy scikit-learn xgboost lightgbm catboost shap lime matplotlib


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import shap
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
import warnings
# NOTE(review): installs a global filter that ignores every warning category
# from this point on, so library warnings will not be shown in later cells.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the dataset from a CSV file in the working directory.
# NOTE(review): the file name suggests an already-scaled member table for
# 2018-07 - confirm against the preprocessing step.
DATA_PATH = "201807_회원_통합_스케일.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Regression target column.
target = 'CA한도금액'

# The 20 predictor columns fed to every model in this notebook.
selected_features = [
    '카드이용한도금액',
    '카드이용한도금액_B1M',
    '회원여부_이용가능_CA',
    '카드이용한도금액_B2M',
    '회원여부_이용가능',
    '강제한도감액횟수_R12M',
    '소지카드수_이용가능_신용',
    '강제한도감액후경과월',
    '상향가능한도금액',
    'RV현금서비스이자율_할인전',
    '상향가능CA한도금액',
    '_1순위카드이용금액',
    'CL이자율_할인전',
    'rv최초시작후경과일',
    '강제한도감액금액_R12M',
    '월상환론한도금액',
    '이용가능카드수_신용체크',
    '일시상환론한도금액',
    '한도증액후경과월',
    '한도증액금액_R12M',
]

# Feature matrix and target vector taken from the loaded frame.
X, y = df[selected_features], df[target]

In [5]:
from sklearn.model_selection import train_test_split

# Shared seed so both splitting stages are reproducible.
SPLIT_SEED = 42

# Stage 1: hold out 10 % of all rows as the test set.
holdout_parts = train_test_split(X, y, test_size=0.1, random_state=SPLIT_SEED)
X_temp, X_test, y_temp, y_test = holdout_parts

# Stage 2: take 2/9 of the remaining 90 % as validation, which leaves a
# 7:2 train/validation ratio (70 % / 20 % of the full data).
train_val_parts = train_test_split(
    X_temp, y_temp, test_size=2/9, random_state=SPLIT_SEED
)
X_train, X_val, y_train, y_val = train_val_parts

# Report the resulting row counts.
print(f"Train size: {X_train.shape[0]} rows")
print(f"Validation size: {X_val.shape[0]} rows")
print(f"Test size: {X_test.shape[0]} rows")

Train size: 728860 rows
Validation size: 208246 rows
Test size: 104123 rows


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np
import pickle
import os

# Directory that receives one pickle per fitted baseline model.
pickle_dir = "스케일데이터_모델별피클"
os.makedirs(pickle_dir, exist_ok=True)

# Baseline regressors: library defaults apart from the fixed seed
# (CatBoost additionally has its training log silenced).
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42)
}

# Per-model metrics, keyed by model name.
results = {}

# Fit, persist and evaluate every model.
for name, model in models.items():
    print(f"🔄 Training model: {name}")
    model.fit(X_train, y_train)

    # Persist the fitted model inside `pickle_dir`.
    filename = os.path.join(pickle_dir, f"model_{name}_scaled.pkl")
    with open(filename, "wb") as f:
        pickle.dump(model, f)
    print(f"✅ 모델 저장 완료: {filename}")

    # Training-set performance (shows how closely the model fits its own data).
    train_preds = model.predict(X_train)
    train_mse = mean_squared_error(y_train, train_preds)
    train_r2 = r2_score(y_train, train_preds)

    # Validation-set performance: the split created for model comparison,
    # so that choosing between models does not have to rely on the test set.
    val_preds = model.predict(X_val)
    val_mse = mean_squared_error(y_val, val_preds)
    val_r2 = r2_score(y_val, val_preds)

    # Test-set performance (held-out estimate).
    test_preds = model.predict(X_test)
    test_mse = mean_squared_error(y_test, test_preds)
    test_r2 = r2_score(y_test, test_preds)

    # Collect the metrics for the summary table.
    results[name] = {
        "Train MSE": train_mse,
        "Train R2": train_r2,
        "Val MSE": val_mse,
        "Val R2": val_r2,
        "Test MSE": test_mse,
        "Test R2": test_r2,
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print("📊 모델별 Train / Validation / Test 성능 비교")
display(results_df)

In [None]:
import shap
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import os
import platform

# Choose a Korean-capable font per operating system so the Hangul feature
# names render in the plots. AppleGothic only exists on macOS, so Linux and
# other systems get NanumGothic instead (it has to be installed separately).
system_name = platform.system()
if system_name == 'Windows':
    plt.rc('font', family='Malgun Gothic')
elif system_name == 'Darwin':
    plt.rc('font', family='AppleGothic')
else:
    plt.rc('font', family='NanumGothic')
# Render minus signs with an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Directory that receives the generated figures.
output_dir = "스케일데이터_모델별성능시각화"
os.makedirs(output_dir, exist_ok=True)

# The LIME explainer depends only on the training data, not on the model, so
# it is built lazily once and reused for every model. Building it inside the
# per-model try block keeps the original best-effort behaviour on failure.
lime_explainer = None

# Run SHAP and LIME for every baseline model.
for name, model in models.items():
    print(f"\n📊 SHAP scaled Feature Importance for {name}")

    try:
        # NOTE(review): SHAP values are computed on the full X_test, which can
        # be slow for large test sets - consider sampling if runtime matters.
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)

        plt.figure()
        try:
            shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
            plt.title(f"SHAP Summary scaled - {name}")
            plt.tight_layout()
            shap_path = os.path.join(output_dir, f"shap_summary_scaled_{name}.png")
            plt.savefig(shap_path, dpi=300)
        finally:
            # Close the figure even if plotting or saving failed, so a failed
            # iteration does not leave an open figure behind.
            plt.close()
        print(f"✅ SHAP 저장 완료: {shap_path}")
    except Exception as e:
        print(f"❌ SHAP 실패 ({name}): {e}")

    print(f"🔍 LIME Explanation for {name}")
    try:
        if lime_explainer is None:
            # Fixed seed so the sampled neighbourhoods are reproducible.
            lime_explainer = lime.lime_tabular.LimeTabularExplainer(
                training_data=X_train.values,
                feature_names=X.columns.tolist(),
                mode="regression",
                random_state=42
            )
        # Explain the first test row with the current model.
        lime_exp = lime_explainer.explain_instance(
            X_test.values[0],
            model.predict
        )

        # Convert the explanation to a matplotlib figure and save it as PNG.
        fig = lime_exp.as_pyplot_figure()
        try:
            lime_path = os.path.join(output_dir, f"lime_summary_scaled_{name}.png")
            fig.savefig(lime_path, dpi=300)
        finally:
            plt.close(fig)
        print(f"✅ LIME 저장 완료: {lime_path}")
    except Exception as e:
        print(f"❌ LIME 실패 ({name}): {e}")

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np
import pickle
import os

# Directory that receives one pickle per tuned model.
pickle_dir = "스케일데이터_튜닝된모델별피클"
os.makedirs(pickle_dir, exist_ok=True)

# Estimator and hyperparameter grid to search, per model family.
model_grids = {
    "RandomForest": (
        RandomForestRegressor(random_state=42),
        {
            'n_estimators': [100, 200],
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5]
        }
    ),
    "XGBoost": (
        XGBRegressor(random_state=42, verbosity=0),
        {
            'n_estimators': [100, 200],
            'max_depth': [3, 6],
            'learning_rate': [0.1, 0.01]
        }
    ),
    "LightGBM": (
        LGBMRegressor(random_state=42),
        {
            'n_estimators': [100, 200],
            'max_depth': [-1, 5, 10],
            'learning_rate': [0.1, 0.01]
        }
    ),
    "CatBoost": (
        CatBoostRegressor(verbose=0, random_state=42),
        {
            'depth': [4, 6, 8],
            'learning_rate': [0.1, 0.01],
            'iterations': [100, 200]
        }
    )
}

results = {}
best_params = {}
# Best estimator per model name, kept in memory for the explanation cell that
# iterates over `tuned_models`.
tuned_models = {}

# Tune, persist and evaluate every model.
for name, (model, param_grid) in model_grids.items():
    print(f"\n🔄 GridSearchCV tuning: {name}")
    # 3-fold cross-validated grid search on the training split, scored by R2.
    grid = GridSearchCV(model, param_grid, cv=3, scoring='r2', n_jobs=-1)
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    best_params[name] = grid.best_params_
    tuned_models[name] = best_model

    # Persist the tuned model.
    filename = os.path.join(pickle_dir, f"model_{name}_scaled_tuned.pkl")
    with open(filename, "wb") as f:
        pickle.dump(best_model, f)
    print(f"✅ 튜닝된 모델 저장 완료: {filename}")

    # Evaluate on all three splits; the validation split gives a comparison
    # metric that does not rely on the test set.
    train_preds = best_model.predict(X_train)
    val_preds = best_model.predict(X_val)
    test_preds = best_model.predict(X_test)

    results[name] = {
        "Train MSE": mean_squared_error(y_train, train_preds),
        "Train R2": r2_score(y_train, train_preds),
        "Val MSE": mean_squared_error(y_val, val_preds),
        "Val R2": r2_score(y_val, val_preds),
        "Test MSE": mean_squared_error(y_test, test_preds),
        "Test R2": r2_score(y_test, test_preds),
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print("📊 튜닝된 모델 성능 비교")
display(results_df)

# Print the best hyperparameters found for each model.
print("🔧 최적 하이퍼파라미터")
for name, params in best_params.items():
    print(f"{name}: {params}")

In [None]:
import shap
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import glob
import os
import pickle
import platform

# Choose a Korean-capable font per operating system so the Hangul feature
# names render in the plots. AppleGothic only exists on macOS, so Linux and
# other systems get NanumGothic instead (it has to be installed separately).
system_name = platform.system()
if system_name == 'Windows':
    plt.rc('font', family='Malgun Gothic')
elif system_name == 'Darwin':
    plt.rc('font', family='AppleGothic')
else:
    plt.rc('font', family='NanumGothic')
# Render minus signs with an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Directory that receives the generated figures.
tuned_output_dir = "스케일데이터_튜닝된모델별성능시각화"
os.makedirs(tuned_output_dir, exist_ok=True)

# Use the in-memory `tuned_models` mapping when one exists; otherwise rebuild
# it from the pickles written by the tuning step, so this cell does not fail
# with a NameError on a kernel where the mapping was never created.
if "tuned_models" not in globals():
    tuned_pickle_dir = "스케일데이터_튜닝된모델별피클"
    pickle_prefix, pickle_suffix = "model_", "_scaled_tuned.pkl"
    pickle_pattern = os.path.join(tuned_pickle_dir, f"{pickle_prefix}*{pickle_suffix}")
    tuned_models = {}
    for pickle_path in sorted(glob.glob(pickle_pattern)):
        # Recover the model name from "model_<name>_scaled_tuned.pkl".
        model_name = os.path.basename(pickle_path)[len(pickle_prefix):-len(pickle_suffix)]
        # Only load pickles produced by this notebook; unpickling files from
        # an untrusted source can execute arbitrary code.
        with open(pickle_path, "rb") as f:
            tuned_models[model_name] = pickle.load(f)
    if not tuned_models:
        raise FileNotFoundError(
            f"No tuned model pickles found in '{tuned_pickle_dir}'; "
            "run the GridSearchCV tuning cell first."
        )

# The LIME explainer depends only on the training data, not on the model, so
# it is built lazily once and reused for every model. Building it inside the
# per-model try block keeps the original best-effort behaviour on failure.
lime_explainer = None

# Run SHAP and LIME for every tuned model.
for name, model in tuned_models.items():
    print(f"\n📊 [튜닝] SHAP Feature Importance for {name}")

    try:
        # NOTE(review): SHAP values are computed on the full X_test, which can
        # be slow for large test sets - consider sampling if runtime matters.
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)

        plt.figure()
        try:
            shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
            plt.title(f"SHAP Summary scaled - Tuned {name}")
            plt.tight_layout()
            shap_path = os.path.join(tuned_output_dir, f"shap_summary_scaled_{name}_tuned.png")
            plt.savefig(shap_path, dpi=300)
        finally:
            # Close the figure even if plotting or saving failed, so a failed
            # iteration does not leave an open figure behind.
            plt.close()
        print(f"✅ SHAP 저장 완료: {shap_path}")
    except Exception as e:
        print(f"❌ SHAP 실패 ({name}): {e}")

    print(f"🔍 [튜닝] LIME Explanation for {name}")
    try:
        if lime_explainer is None:
            # Fixed seed so the sampled neighbourhoods are reproducible.
            lime_explainer = lime.lime_tabular.LimeTabularExplainer(
                training_data=X_train.values,
                feature_names=X.columns.tolist(),
                mode="regression",
                random_state=42
            )
        # Explain the first test row with the current model.
        lime_exp = lime_explainer.explain_instance(
            X_test.values[0],
            model.predict
        )

        # Convert the explanation to a matplotlib figure and save it as PNG.
        fig = lime_exp.as_pyplot_figure()
        try:
            lime_path = os.path.join(tuned_output_dir, f"lime_summary_scaled_{name}_tuned.png")
            fig.savefig(lime_path, dpi=300)
        finally:
            plt.close(fig)
        print(f"✅ LIME 저장 완료: {lime_path}")
    except Exception as e:
        print(f"❌ LIME 실패 ({name}): {e}")