<a href="https://colab.research.google.com/github/Donggeon2960/LGAIMER-PRACTICE/blob/main/%EC%A0%9C1%ED%9A%8C%EB%AA%A8%EC%9D%98%ED%95%B4%EC%BB%A4%ED%86%A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# 0. 라이브러리 로드 --------------------------------------------------------
import pandas as pd
import numpy as np

In [7]:
# 전처리·모델링 도구
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [8]:
# 분류 모델
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

In [9]:
# 평가 지표
from sklearn.metrics import f1_score, precision_recall_curve, classification_report


In [10]:
# 1. Load the data and split it ---------------------------------------------
# NOTE(review): unpickling can execute code embedded in the file, so these
# pickles should only come from a trusted source.
train = pd.read_pickle('/content/train.pkl')
test  = pd.read_pickle('/content/test.pkl')

# The label is 'Default'; the features are everything except the identifier
# ('LoanID') and the label itself.
y = train['Default']
X = train.drop(['LoanID', 'Default'], axis=1)

# 80/20 train/validation split, stratified on the label so both parts keep
# the same class ratio; the seed is fixed for reproducibility.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [11]:
# 2. Missing-value handling -------------------------------------------------
# Separate numeric/bool columns from plain string (object) columns.
num_cols = X_train.select_dtypes(include=['int64', 'float64', 'bool']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

# One imputer per column group.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Numeric/bool columns: learn the medians on the training split only, then
# fill both splits with those training medians.
num_imputer.fit(X_train[num_cols])
for frame in (X_train, X_val):
    frame[num_cols] = num_imputer.transform(frame[num_cols])

# String (object) columns: same procedure with the most frequent value,
# skipped entirely when there are no such columns.
if not cat_cols.empty:
    cat_imputer.fit(X_train[cat_cols])
    for frame in (X_train, X_val):
        frame[cat_cols] = cat_imputer.transform(frame[cat_cols])

In [12]:
# 2. Missing-value handling -------------------------------------------------
# NOTE(review): this cell repeats the same imputation steps as the cell
# before it; it looks like a leftover re-run and re-creates/re-fits the
# imputers on the same frames.
# Column groups: numeric/bool columns vs. plain string (object) columns.
num_cols = X_train.select_dtypes(include=['int64','float64','bool']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

# Imputers: median for the numeric group, most frequent value for strings.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Numeric/bool columns -> training median (fitted on the training split,
# then reused unchanged for the validation split).
train_num_filled = num_imputer.fit_transform(X_train[num_cols])
val_num_filled = num_imputer.transform(X_val[num_cols])
X_train[num_cols] = train_num_filled
X_val[num_cols] = val_num_filled

# String (object) columns -> training mode, only when such columns exist.
if len(cat_cols):
    train_cat_filled = cat_imputer.fit_transform(X_train[cat_cols])
    val_cat_filled = cat_imputer.transform(X_val[cat_cols])
    X_train[cat_cols] = train_cat_filled
    X_val[cat_cols] = val_cat_filled


In [13]:
# 3. Encoding & scaling -----------------------------------------------------
# 1) One-hot encoding
# The dropped (reference) level of every categorical column is decided by
# the training split only.
X_train = pd.get_dummies(X_train, drop_first=True)
# Validation is encoded with ALL of its levels and then aligned to the
# training columns. Using drop_first=True here would pick the reference
# level from the validation data itself; whenever validation lacks the
# training reference level, a different level gets dropped, reindex fills
# it with 0, and those rows are silently encoded as the reference category.
# Aligning afterwards drops the reference/unseen levels and zero-fills
# levels that only occur in training.
X_val   = pd.get_dummies(X_val).reindex(columns=X_train.columns, fill_value=0)

# 2) Standardize the main numeric features (statistics come from the
#    training split and are reused for validation).
num_features = ['Age','Income','LoanAmount','CreditScore',
                'MonthsEmployed','NumCreditLines','InterestRate',
                'LoanTerm','DTIRatio']
scaler = StandardScaler()
X_train[num_features] = scaler.fit_transform(X_train[num_features])
X_val  [num_features] = scaler.transform   (X_val  [num_features])

In [14]:
# 4. Simple feature engineering ---------------------------------------------
# Example feature: income relative to loan amount.
# NOTE(review): 'Income' and 'LoanAmount' are in num_features and were
# standardized in the scaling step, so 'LoanAmount' + 1 can be close to zero
# or negative here; the ratio is presumably meant for the raw values --
# confirm and, if so, compute it before scaling (in the test cell as well).
for frame in (X_train, X_val):
    frame['Inc_Loan_Ratio'] = frame['Income'] / (frame['LoanAmount'] + 1)


In [None]:
# 5. SMOTE + RandomForest pipeline with grid search --------------------------
# SMOTE is a step of the pipeline, so the grid search refits it together
# with the forest for every candidate and every CV split.
smote_step = SMOTE(random_state=42)
rf_step = RandomForestClassifier(
    class_weight='balanced_subsample',
    random_state=42,
    n_jobs=-1,
)
pipe = Pipeline(steps=[('smote', smote_step), ('rf', rf_step)])

# 2 x 2 x 3 = 12 candidate settings; keys use the '<step>__<param>' form.
param_grid = {
    'smote__k_neighbors': [3, 5],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 10, 20],
}

# 3-fold cross-validated search scored by F1.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
best_model = grid.fit(X_train, y_train).best_estimator_

print("▶ 최적 파라미터 (SMOTE+RF):", grid.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits




In [None]:
# 6. Validation performance of the tuned SMOTE+RF pipeline -------------------
y_pred = best_model.predict(X_val)
val_f1_rf = f1_score(y_val, y_pred)
val_report_rf = classification_report(y_val, y_pred)

# Headline F1 first, then the per-class precision/recall table.
print("▶ Validation F1 (SMOTE+RF):", val_f1_rf)
print(val_report_rf)

In [None]:
# 7. Stacking ensemble -------------------------------------------------------
# Class counts of the training labels; their ratio (negatives / positives)
# is passed to XGBoost as scale_pos_weight.
n_pos = y_train.sum()
n_neg = len(y_train) - n_pos

# The random forest reuses the n_estimators / max_depth selected by the
# grid search above.
tuned = grid.best_params_
rf_base = RandomForestClassifier(
    n_estimators=tuned['rf__n_estimators'],
    max_depth=tuned['rf__max_depth'],
    class_weight='balanced_subsample',
    random_state=42,
    n_jobs=-1,
)
xgb_base = XGBClassifier(
    n_estimators=200,
    scale_pos_weight=n_neg / n_pos,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
)
estimators = [('rf', rf_base), ('xgb', xgb_base)]

# Logistic regression combines the base models' 3-fold CV predictions.
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=3,
    n_jobs=-1,
)
stack.fit(X_train, y_train)

# Validation report at the default decision threshold.
y_pred_stack = stack.predict(X_val)
stack_f1 = f1_score(y_val, y_pred_stack)
print("▶ Validation F1 (Stacking):", stack_f1)
print(classification_report(y_val, y_pred_stack))


In [None]:
# 8. Threshold tuning (stacking model) ---------------------------------------
# Positive-class probabilities on the validation split.
y_proba = stack.predict_proba(X_val)[:,1]
prec, rec, thr = precision_recall_curve(y_val, y_proba)

# F1 at every point of the curve. Where precision and recall are both 0 the
# plain formula is 0/0 = NaN, and np.argmax would then return the NaN
# position; define F1 as 0 there instead.
pr_sum = prec + rec
f1s = np.divide(2 * prec * rec, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# prec/rec carry one more entry than thr (the final precision=1, recall=0
# point has no threshold), so search only the entries that map to a
# threshold; ix is then always a valid index into thr.
ix  = np.argmax(f1s[:-1])

print(f"▶ Optimal Threshold = {thr[ix]:.3f}, 이때 F1 = {f1s[ix]:.3f}")

# Performance after applying the tuned threshold.
# NOTE(review): the threshold is selected and evaluated on the same
# validation split, so this F1 is an optimistic estimate.
y_pred_thresh = (y_proba >= thr[ix]).astype(int)
print("▶ Adjusted F1 (Stacking):", f1_score(y_val, y_pred_thresh))
print(classification_report(y_val, y_pred_thresh))


In [None]:
# 9. Predict on the test data & build the submission --------------------------
# NOTE(review): this cell only computes test_pred; no submission file is
# written here.
# 1) Apply the same preprocessing to the test set, reusing the imputers and
#    scaler fitted on the training split.
X_test = test.drop(columns=['LoanID'])

X_test[num_cols] = num_imputer.transform(X_test[num_cols])
if len(cat_cols) > 0:
    X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

# Encode ALL levels, then align to the training columns. drop_first=True
# here would choose the dropped level from the test data itself; if the test
# set lacks the training reference level, a different level is dropped,
# reindex fills it with 0, and those rows are silently encoded as the
# reference category. Aligning afterwards removes the reference/unseen
# levels and zero-fills levels seen only in training.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

X_test[num_features] = scaler.transform(X_test[num_features])
X_test['Inc_Loan_Ratio'] = X_test['Income'] / (X_test['LoanAmount'] + 1)

# 2) Final prediction using the threshold tuned on the validation split.
test_proba = stack.predict_proba(X_test)[:,1]
test_pred  = (test_proba >= thr[ix]).astype(int)


In [None]:
# ▶ (1) Import the required metric functions
from sklearn.metrics import f1_score, classification_report

# … (earlier part omitted) …

# 5. Validation performance after the SMOTE+RF grid search --------------------
y_pred = best_model.predict(X_val)
# Print F1 and the per-class report
print("▶ Validation F1 (SMOTE+RF):", f1_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

# 6. Validation performance of the stacking ensemble --------------------------
y_pred_stack = stack.predict(X_val)
# Print F1 and the per-class report
print("▶ Validation F1 (Stacking):", f1_score(y_val, y_pred_stack))
print(classification_report(y_val, y_pred_stack))

# 7. Performance after threshold tuning ---------------------------------------
y_proba = stack.predict_proba(X_val)[:,1]
prec, rec, thr = precision_recall_curve(y_val, y_proba)
# F1 at every curve point. Where precision and recall are both 0 the plain
# formula yields 0/0 = NaN, which np.argmax would select; use 0 there.
pr_sum = prec + rec
f1s = np.divide(2 * prec * rec, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# prec/rec have one more entry than thr (the last point has no threshold),
# so only the leading entries are searched and ix always indexes thr.
ix  = np.argmax(f1s[:-1])

# Print the selected threshold and its F1
print(f"▶ Optimal Threshold = {thr[ix]:.3f}, 이때 F1 = {f1s[ix]:.3f}")

# Predictions with the tuned threshold applied
# NOTE(review): tuned and evaluated on the same validation split, so the
# adjusted F1 is an optimistic estimate.
y_pred_thresh = (y_proba >= thr[ix]).astype(int)
# Print F1 and the per-class report
print("▶ Adjusted F1 (Stacking):", f1_score(y_val, y_pred_thresh))
print(classification_report(y_val, y_pred_thresh))

# … (test prediction and submission-file creation) …
