# 라이브러리

In [1]:
!pip install -U scikit-learn==1.3.2 imbalanced-learn==0.11.0

Collecting scikit-learn==1.3.2
  Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.11.0
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, imbalanced-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-

In [2]:
# 기본 라이브러리
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import matplotlib
import warnings

warnings.filterwarnings(action='ignore')

# 전처리 및 유틸
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.base import clone
from scipy.stats import mode
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.inspection import permutation_importance
from tqdm.notebook import tqdm
from sklearn.svm import SVC
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# 모델 평가 / 지표
from sklearn.metrics import accuracy_score, f1_score, make_scorer, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# 데이터 분할 및 교차검증
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score

# 분류 모델
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

# 데이터 불러오기

In [3]:
# Read the training features, test features and training labels (in that order)
# from the Kaggle competition input directory.
x_train, x_test, y_train = map(
    pd.read_csv,
    (
        '/kaggle/input/uou-ie-g-03874-spring-2025-term-project/ML_x_train.csv',
        '/kaggle/input/uou-ie-g-03874-spring-2025-term-project/ML_x_test.csv',
        '/kaggle/input/uou-ie-g-03874-spring-2025-term-project/ML_y_train.csv',
    ),
)

In [4]:
# Tag each row with its origin (1 = train, 0 = test), then stack both frames
# under a fresh 0..n-1 index so they can be preprocessed together.
x_train['is_train'], x_test['is_train'] = 1, 0
x = pd.concat((x_train, x_test), ignore_index=True)

In [5]:
# Missing-value handling: fill each column with its own median.
# The result is assigned back to the column instead of calling
# `x[col].fillna(..., inplace=True)`: an in-place call on a column selection is
# chained assignment, which pandas deprecates and which does not update `x`
# when Copy-on-Write is enabled.
# NOTE(review): the median is taken over the combined train+test frame `x`.
x['Amount_invested_monthly'] = x['Amount_invested_monthly'].fillna(x['Amount_invested_monthly'].median())
x['Num_of_Loan'] = x['Num_of_Loan'].fillna(x['Num_of_Loan'].median())

In [6]:
# Outlier handling example: clip values to lower/upper IQR fences.
def clip_outliers(col, frame=None, whisker=1.5):
    """Clip one column to its IQR fences, writing the result back in place.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of the column itself.

    Parameters
    ----------
    col : str
        Name of the column to clip.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level frame ``x`` is
        used, so existing ``clip_outliers(col)`` calls behave as before.
    whisker : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    None
        The clipped values replace ``frame[col]``.
    """
    # Resolve the target lazily so the global is only touched when needed.
    target = x if frame is None else frame
    values = target[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - whisker * iqr
    upper = q3 + whisker * iqr
    target[col] = values.clip(lower, upper)

# Columns whose extreme values are clipped to their IQR fences.
clip_targets = [
    'Annual_Income',
    'Monthly_Balance',
    'Total_EMI_per_month',
    'Amount_invested_monthly',
    'Outstanding_Debt',
]

# Apply the clipping helper to every listed column, one at a time.
for col in clip_targets:
    clip_outliers(col)

In [7]:
# Derived ratio features. Every denominator is shifted by +1 before dividing.
# NOTE(review): the +1 presumably guards against division by zero; that only
# holds if the denominator columns are non-negative -- confirm for
# Monthly_Balance.
x['loan_to_income_ratio'] = x['Total_EMI_per_month'].div(x['Monthly_Inhand_Salary'].add(1))
x['debt_to_income_ratio'] = x['Outstanding_Debt'].div(x['Annual_Income'].add(1))
x['delayed_ratio'] = x['Num_of_Delayed_Payment'].div(x['Credit_History_Months'].add(1))
x['balance_to_emi'] = x['Monthly_Balance'].div(x['Total_EMI_per_month'].add(1))
x['salary_to_balance'] = x['Monthly_Inhand_Salary'].div(x['Monthly_Balance'].add(1))

In [8]:
# Target (mean) encoding for Occupation and Payment_of_Min_Amount.
#
# Labels exist only for the training rows, so select those rows explicitly and
# attach the labels by position. This avoids relying on the index of `x`
# happening to line up with the index of `y_train`, and raises a ValueError if
# the row counts ever disagree instead of silently mislabelling rows.
# NOTE(review): the maps are fitted on every labelled row, so any hold-out
# split later taken from these rows sees target information through the
# encoded columns. Averaging the label also treats Credit_Score as numeric.
_labelled = x.loc[
    x['is_train'] == 1, ['Occupation', 'Payment_of_Min_Amount']
].assign(Credit_Score=y_train['Credit_Score'].to_numpy())

# Occupation encoding: per-category mean label; categories missing from the
# map fall back to the mean of the per-category means.
occ_map = _labelled.groupby('Occupation')['Credit_Score'].mean()
x['Occupation_encoded'] = x['Occupation'].map(occ_map).fillna(occ_map.mean())

# Payment_of_Min_Amount encoding: same scheme as above.
# The fillna result is assigned back rather than applied with inplace=True on
# a column selection, which is chained assignment and does not update `x`
# when Copy-on-Write is enabled.
pay_map = _labelled.groupby('Payment_of_Min_Amount')['Credit_Score'].mean()
x['Payment_encoded'] = x['Payment_of_Min_Amount'].map(pay_map).fillna(pay_map.mean())

In [9]:
# Remove the raw categorical columns now that their encoded versions exist.
x.drop(
    labels=['Occupation', 'Payment_of_Min_Amount'],
    axis='columns',
    inplace=True,
)

In [10]:
# Split the combined frame back into train and test parts using the origin
# flag, dropping the flag itself from both results.
x_train_final = x.loc[x['is_train'].eq(1)].drop(columns='is_train')
x_test_final = x.loc[x['is_train'].eq(0)].drop(columns='is_train')

In [11]:
# Sanity check: report (rows, columns) of both preprocessed frames.
for _label, _frame in (
    ("Train shape:", x_train_final),
    ("Test shape:", x_test_final),
):
    print(_label, _frame.shape)

Train shape: (9199, 23)
Test shape: (2300, 23)


In [12]:
# Fit an XGBoost classifier on every engineered feature. The label frame is
# flattened to a 1-D array before being passed to fit().
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb_model.fit(x_train_final, y_train.to_numpy().ravel())

In [13]:
# Compute SHAP values of the fitted tree model for the training features.
explainer = shap.TreeExplainer(model=xgb_model)
shap_values = explainer.shap_values(x_train_final)

In [14]:
# Rank features by the fitted model's built-in importance scores and keep the
# names of the 15 highest-scoring ones.
importances = xgb_model.feature_importances_
importance_df = pd.DataFrame(
    {'feature': x_train_final.columns, 'importance': importances}
)
_ranked = importance_df.sort_values(by='importance', ascending=False)
top15 = _ranked.head(15)['feature'].tolist()

print("중요 변수 TOP 15:\n", top15)

중요 변수 TOP 15:
 ['Payment_encoded', 'Outstanding_Debt', 'Interest_Rate', 'Num_Credit_Card', 'Delay_from_due_date', 'delayed_ratio', 'Num_Bank_Accounts', 'Total_EMI_per_month', 'Num_of_Delayed_Payment', 'Credit_History_Months', 'balance_to_emi', 'loan_to_income_ratio', 'Num_Credit_Inquiries', 'Monthly_Inhand_Salary', 'Amount_invested_monthly']


In [15]:
# Restrict both frames to the selected features, then carve a stratified
# 80/20 train/validation split out of the labelled rows.
X, X_test = x_train_final[top15], x_test_final[top15]
y = y_train['Credit_Score']
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
# Oversample the training split only with SMOTE; the validation split is left
# untouched.
smote = SMOTE(random_state=42)
X_tr_resampled, y_tr_resampled = smote.fit_resample(X=X_tr, y=y_tr)

In [17]:
# Train a single XGBoost model on the SMOTE-resampled training split.
# Hyperparameters are collected first so they can be read in one place.
_xgb_params = dict(
    n_estimators=200,
    learning_rate=0.02,
    max_depth=5,
    subsample=0.85,
    colsample_bytree=1.0,
    gamma=0.1,
    reg_alpha=0,
    reg_lambda=1.8,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb_model = XGBClassifier(**_xgb_params)
xgb_model.fit(X_tr_resampled, y_tr_resampled)

In [18]:
# Score the model on the held-out validation split: macro-averaged F1 plus
# the per-class precision/recall report.
val_preds = xgb_model.predict(X_val)
print("Macro F1:", f1_score(y_true=y_val, y_pred=val_preds, average='macro'))
print(classification_report(y_true=y_val, y_pred=val_preds))

Macro F1: 0.652755490187155
              precision    recall  f1-score   support

           0       0.49      0.65      0.56       335
           1       0.66      0.75      0.70       534
           2       0.77      0.63      0.69       971

    accuracy                           0.67      1840
   macro avg       0.64      0.68      0.65      1840
weighted avg       0.69      0.67      0.67      1840



In [19]:
# Feature columns the model was trained on, in training order.
selected_features = X.columns.to_list()

# Select the same columns, in the same order, from the test features.
x_test_selected = x_test_final.loc[:, selected_features]

# Predict the class of every test row.
pred_test = xgb_model.predict(x_test_selected)

In [20]:
# Build the submission file: load the sample submission, overwrite its
# Credit_Score column with the test predictions, and write it without the index.
submission = pd.read_csv(
    '/kaggle/input/uou-ie-g-03874-spring-2025-term-project/ML_sample_submission.csv'
).assign(Credit_Score=pred_test)
submission.to_csv('submission.csv', index=False)