<a href="https://colab.research.google.com/github/Donggeon2960/LGAIMER-PRACTICE/blob/main/%EC%A0%9C1%ED%9A%8C%EB%AA%A8%EC%9D%98%ED%95%B4%EC%BB%A4%ED%86%A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1. 라이브러리 불러오기

In [37]:
import numpy as np
import pandas as pd

# Preprocessing, encoding and modelling
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# (Optional) only needed when SMOTE is used
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


#2. 데이터 로드 및 분할

In [38]:
# Input pickles; adjust the paths below to wherever the files actually live.
df = pd.read_pickle('/content/train.pkl')      # training data
df_test = pd.read_pickle('/content/test.pkl')  # data for validation or submission

# Separate the label from the features; the loan identifier is not a feature.
y = df['Default']
X = df.drop(['LoanID', 'Default'], axis=1)

# Internal hold-out: 80% train / 20% validation, stratified on the label.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


#3. EDA: 불균형 확인 & 결측치 탐색

In [39]:
# 1) Relative class frequencies of the target on the training split
#    (used to check how imbalanced the label is).
target_share = y_train.value_counts(normalize=True)
print(target_share)

# 2) Number of missing values per feature, most affected columns first.
missing_per_column = X_train.isna().sum()
print(missing_per_column.sort_values(ascending=False))

# 3) Summary statistics of the training features.
#    (Histograms / box plots could be added here if desired.)
display(X_train.describe())


Default
0    0.942489
1    0.057511
Name: proportion, dtype: float64
DTIRatio          488
InterestRate      469
HasMortgage       449
HasDependents     391
Age               382
MaritalStatus     353
LoanPurpose       341
MonthsEmployed    311
Income            297
LoanAmount        264
EmploymentType    208
HasCoSigner       190
NumCreditLines    157
CreditScore       128
LoanTerm          123
Education         104
dtype: int64


Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio
count,100956.0,101041.0,101074.0,101210.0,101027.0,101181.0,100869.0,101215.0,100850.0
mean,43.008707,82518.444651,127635.585432,573.724958,59.046473,2.001611,13.494326,35.505893,0.499619
std,15.034846,38916.62822,71009.023096,159.304397,34.681053,1.222017,6.641903,17.019016,0.231614
min,17.0,14733.0,4824.0,290.0,0.0,0.0,1.9,11.0,0.1
25%,30.0,48830.0,65985.0,436.0,29.0,1.0,7.77,23.0,0.3
50%,43.0,82420.0,127641.0,573.0,59.0,2.0,13.44,35.0,0.5
75%,56.0,116202.0,189409.5,712.0,89.0,3.0,19.22,48.0,0.7
max,72.0,152947.0,259139.0,875.0,121.0,4.0,26.19,62.0,0.94


#4. 컬럼 구분: 수치형 vs 범주형

In [40]:
# Columns routed through the numeric branch of the preprocessing.
numeric_features = [
    'Age',
    'Income',
    'LoanAmount',
    'CreditScore',
    'MonthsEmployed',
    'NumCreditLines',
    'InterestRate',
    'LoanTerm',
    'DTIRatio',
]

# Columns routed through the categorical branch of the preprocessing.
categorical_features = [
    'Education',
    'EmploymentType',
    'MaritalStatus',
    'HasMortgage',
    'HasDependents',
    'LoanPurpose',
    'HasCoSigner',
]


#5. 전처리 파이프라인 구성

In [41]:
# 5-1. Numeric branch: fill missing values with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# 5-2. Categorical branch: replace missing values with the literal 'Missing',
#      then one-hot encode; categories unseen at fit time are ignored.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# 5-3. Apply each branch to its own column list and combine the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_features),
    ('cat', cat_pipeline, categorical_features),
])


#6. 불균형 해소 전략

In [42]:
# --- Option A: handle the imbalance through class_weight ---
balanced_forest = RandomForestClassifier(
    class_weight='balanced_subsample',
    n_estimators=100,
    random_state=42,
)
clf = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', balanced_forest),
])

# --- Option B (disabled): SMOTE oversampling combined with class_weight ---
# clf_smote = ImbPipeline([
#     ('preprocess', preprocessor),
#     ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),
#     ('model', RandomForestClassifier(
#         n_estimators=100, random_state=42,
#         class_weight='balanced_subsample'
#     ))
# ])


#7. 기본 모델 학습 및 평가

In [43]:
# Fit the baseline pipeline on the training split and predict hard labels
# for the validation split.
y_pred = clf.fit(X_train, y_train).predict(X_valid)

# Compute the validation metrics first, then report them.
valid_f1 = f1_score(y_valid, y_pred)
valid_report = classification_report(y_valid, y_pred)
valid_confusion = confusion_matrix(y_valid, y_pred)

print("F1-score:", valid_f1)
print(valid_report)
print("Confusion Matrix:\n", valid_confusion)


F1-score: 0.0
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     23878
           1       0.00      0.00      0.00      1457

    accuracy                           0.94     25335
   macro avg       0.47      0.50      0.49     25335
weighted avg       0.89      0.94      0.91     25335

Confusion Matrix:
 [[23878     0]
 [ 1457     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#8. 임계값(threshold) 최적화

In [44]:
# Predicted probability of the positive class (column 1) on the validation split.
probs = clf.predict_proba(X_valid)[:, 1]

# Sweep candidate cut-offs and keep the one with the highest validation F1.
# The sweep covers 0.01-0.99 in steps of 0.01: with the earlier 0.10-0.90
# range the selected threshold sat on the lower edge of the grid (0.10), so a
# better cut-off below 0.10 could never be picked.
# NOTE: the threshold is chosen and scored on the same validation split, so
# the reported F1 is an optimistic estimate.
best_thresh, best_f1 = 0.0, 0.0
for thresh in np.linspace(0.01, 0.99, 99):
    preds = (probs >= thresh).astype(int)
    # zero_division=0 keeps the score at 0 (same value as the default) without
    # emitting a warning for every cut-off at which no positive is predicted.
    score = f1_score(y_valid, preds, zero_division=0)
    if score > best_f1:
        best_thresh, best_f1 = thresh, score

print(f"최적 임계값: {best_thresh:.2f}, 해당 F1: {best_f1:.4f}")


최적 임계값: 0.10, 해당 F1: 0.1980


#9. 하이퍼파라미터 튜닝

In [None]:
# Search space for the forest; the 'model__' prefix addresses the 'model'
# step of the pipeline.
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10, 20],
    model__min_samples_split=[2, 5],
)

# Exhaustive search with 3-fold cross-validation, scored by F1.
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

# Report the winning configuration and its cross-validated F1.
print("Best Params:", grid.best_params_)
print("Best CV F1:", grid.best_score_)


Fitting 3 folds for each of 12 candidates, totalling 36 fits


#10. 최종 모델 학습 & 테스트 예측

In [None]:
# 1) Rebuild the pipeline with the tuned hyper-parameters.
#    grid.best_params_ is keyed by the names used in param_grid
#    ('model__n_estimators', ...); it has no 'model__param_dict' entry, so
#    looking that key up raises KeyError. Strip the 'model__' step prefix to
#    obtain keyword arguments for RandomForestClassifier itself.
step_prefix = 'model__'
best_model_params = {
    name[len(step_prefix):]: value
    for name, value in grid.best_params_.items()
    if name.startswith(step_prefix)
}
final_clf = Pipeline([
    # clone() gives this pipeline its own unfitted copy of the preprocessor,
    # so fitting on the full data does not refit the instance used by `clf`.
    ('preprocess', clone(preprocessor)),
    ('model', RandomForestClassifier(
        **best_model_params,
        random_state=42,
        class_weight='balanced_subsample'
    ))
])

# 2) Fit on the full labelled data set (training + validation rows).
final_clf.fit(X, y)

# 3) Positive-class probabilities for the test set. errors='ignore' lets this
#    also run on a submission file that has no 'Default' column.
X_test = df_test.drop(columns=['LoanID', 'Default'], errors='ignore')
probs_test = final_clf.predict_proba(X_test)[:, 1]

# 4) Apply the cut-off found during threshold tuning.
#    NOTE(review): best_thresh was selected with the baseline `clf`, not with
#    this tuned model -- consider re-tuning it for the tuned estimator.
y_test_pred = (probs_test >= best_thresh).astype(int)

# 5) Score the predictions only when ground-truth labels are available.
if 'Default' in df_test.columns:
    print("Test F1:", f1_score(df_test['Default'], y_test_pred))
else:
    print("Test labels not available; skipping F1.")
