In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# !pip install optuna
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import optuna
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Google Drive에서 데이터 로드
train_path = '/content/drive/MyDrive/ITStudy/ML DL Project/open/train.csv'
test_path = '/content/drive/MyDrive/ITStudy/ML DL Project/open/test.csv'
submission_path = "/content/drive/MyDrive/ITStudy/ML DL Project/open/sample_submission.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

df = train.copy()

def preprocess_data(df, is_train=True, label_encoders=None):
    # 불필요한 컬럼 제거
    df = df.drop(columns=['UID'], errors='ignore')

    # 파생 변수 생성 (0으로 나누는 문제 방지)
    df['부채 비율'] = np.where(df['최대 신용한도'] == 0, np.nan, df['현재 미상환 신용액'] / df['최대 신용한도'])
    df['신용 점수 대비 부채 비율'] = np.where(df['신용 점수'] == 0, np.nan, df['부채 비율'] / df['신용 점수'])
    df['연체 리스크 지표'] = df['신용 문제 발생 횟수'] * df['마지막 연체 이후 경과 개월 수']
    df['월 소득 대비 부채 비율'] = np.where(df['연간 소득'] == 0, np.nan, df['월 상환 부채액'] / (df['연간 소득'] / 12))
    df['총 부채 대비 월 상환액'] = np.where((df['현재 대출 잔액'] + df['현재 미상환 신용액']) == 0, np.nan, df['월 상환 부채액'] / (df['현재 대출 잔액'] + df['현재 미상환 신용액']))
    df['연간 소득 대비 최대 신용한도 비율'] = np.where(df['연간 소득'] == 0, np.nan, df['최대 신용한도'] / df['연간 소득'])

    # 무한대 값(NaN) 변환 후 결측치 처리
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = ['주거 형태', '현재 직장 근속 연수', '대출 목적', '대출 상환 기간']

    # 수치형 변수: 중앙값 대체
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    # 범주형 변수: 'Unknown'으로 대체
    for col in categorical_cols:
        df[col] = df[col].fillna('Unknown')

    # 범주형 변수 인코딩
    if is_train:
        label_encoders = {}
        for col in categorical_cols:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
    else:
        for col in categorical_cols:
            if col in label_encoders:
                df[col] = df[col].map(lambda x: label_encoders[col].transform([x])[0] if x in label_encoders[col].classes_ else -1)

    return df, label_encoders

# 데이터 전처리
df, label_encoders = preprocess_data(df, is_train=True)

# 데이터 분할
X = df.drop(columns=['채무 불이행 여부'])
y = df['채무 불이행 여부']

# 클래스 불균형 해결 (SMOTE 사용)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-Test Split
X_train, X_valid, y_train, y_valid = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Stacking 앙상블 적용
base_learners = [
    ('lgb', lgb.LGBMClassifier(random_state=42)),
    ('xgb', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('cat', cb.CatBoostClassifier(verbose=0, random_state=42))
]
meta_learner = LogisticRegression()
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, passthrough=True)

# 모델 학습
stacking_model.fit(X_train, y_train)
stacking_preds = stacking_model.predict_proba(X_valid)[:, 1]
stacking_auc = roc_auc_score(y_valid, stacking_preds)

# 테스트 데이터 전처리
test, _ = preprocess_data(test, is_train=False, label_encoders=label_encoders)
test = test[X_train.columns]

# 최적 모델을 사용하여 예측
stacking_test_preds = stacking_model.predict_proba(test)[:, 1]

# 제출 파일 생성 (샘플 제출 파일과 동일한 포맷 유지)
submission = pd.read_csv(submission_path)
submission.iloc[:, 1] = stacking_test_preds  # 두 번째 컬럼에 예측값 삽입
submission.to_csv('/content/drive/MyDrive/ITStudy/ML DL Project/open/sample_submission_final2.csv', index=False)

# 결과 출력
print(f"Stacking Ensemble ROC-AUC: {stacking_auc:.4f}")
print("제출 파일이 생성되었습니다: submission.csv")

[LightGBM] [Info] Number of positive: 5270, number of negative: 5270
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3648
[LightGBM] [Info] Number of data points in the train set: 10540, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 4216, number of negative: 4216
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001312 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3644
[LightGBM] [Info] Number of data points in the train set: 8432, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 4216, number of negative: 4216
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3638
[LightGBM] [Info] Number of data points in the train set: 8432, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Stacking Ensemble ROC-AUC: 0.6247
제출 파일이 생성되었습니다: submission.csv


  submission.iloc[:, 1] = stacking_test_preds  # 두 번째 컬럼에 예측값 삽입
