### 1. 라이브러리 import

In [73]:
# conda install -c conda-forge xgboost libomp
import os
import joblib
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

### 2. 데이터 로드

In [62]:
train_path = Path("./train.csv")
test_path  = Path("./test.csv")

train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)

print("Train shape:", train_df.shape)
print("Test  shape:", test_df.shape)
train_df.head()

Train shape: (1076079, 52)
Test  shape: (722740, 52)


Unnamed: 0,num_actv_rev_tl,dti,fico_range_low,funded_amnt,int_rate,grade,emp_title,annual_inc,verification_status_Not Verified,verification_status_Source Verified,...,collections_12_mths_ex_med,spline_segment_1,spline_segment_2,spline_segment_3,spline_segment_4,spline_segment_5,spline_segment_6,spline_segment_7,irr,target
0,6.0,17.74,820.0,16000.0,7.07,0.0,-33.819993,65000.0,True,False,...,0.0,1.0,0.2,0.0,0.0,0.0,0.0,0.0,-26.21,-27.885864
1,3.0,6.99,700.0,4500.0,10.42,1.0,-19.626818,50000.0,True,False,...,0.0,1.0,0.6,0.4,0.0,0.0,0.0,0.0,10.94,9.461522
2,10.0,28.94,675.0,20000.0,9.99,1.0,-11.726184,60000.0,False,True,...,0.0,1.0,0.6,0.4,0.0,0.0,0.0,0.0,10.46,9.450318
3,13.0,17.69,660.0,30000.0,19.99,3.0,-30.077115,65000.0,False,False,...,0.0,1.0,0.0,0.0,0.4,0.6,0.0,0.0,-38.15,-39.1395
4,3.0,9.7,675.0,6025.0,8.46,0.0,-21.393,50000.0,False,True,...,0.0,1.0,0.8,0.0,0.0,0.0,0.0,0.0,8.8,5.861043


### 3. Taget Encoding

In [64]:
TARGET_COL = "target"  # 타깃 열 이름이 다르면 수정
label_encoder = LabelEncoder()
train_df[TARGET_COL] = label_encoder.fit_transform(train_df[TARGET_COL])
# fit(): TARGET_COL에 등장하는 고유 문자열 라벨을 사전식 순으로 정렬해 classes_에 저장
# transform(): 각 원소를 해당 인덱스(0부터 시작하는 정수)로 매핑

### 4. 결측치, 범주형 처리 파이프라인

In [66]:
# train_df의 모든 열 가운데 타깃 열을 제외한 나머지 열 이름을 리스트로 수집
feature_cols = [c for c in train_df.columns if c != TARGET_COL]

# 숫자형 열 추출 -> 선택된 열 이름을 파이썬 리스트로 변환
numeric_cols = train_df[feature_cols].select_dtypes(
    include=['number', 'bool']).columns.tolist() 

# 범주형 열 추출
categorical_cols = [c for c in feature_cols if c not in numeric_cols]

# 숫자형 전처리기: 결측치 -> 평균
numeric_transformer = Pipeline(steps=[ # Pipeline(): 여러 전처리 단계를 순차적으로 묶어 하나의 객체처럼 사용하게 해주는 class
    ("imputer", SimpleImputer(strategy="mean")) # 결측치(NA, NaN)를 해당 열의 평균값으로 대체하는 전처리기
])
# 0으로 대체하고 싶을 때는, SimpleImputer(strategy="constant", fill_value=0)

# 범주형 전처리기: 결측값 -> 최반값 대체 -> 원-핫 인코딩 
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot",  OneHotEncoder(handle_unknown="ignore"))
])

# 열별 전처리 매핑
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

### 5. 학습 / 검증 데이터 분할(8:2)

In [68]:
X = train_df[feature_cols]
y = train_df[TARGET_COL]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE #,stratify=y
)

### 6. XGBoost 모델

In [75]:
xgb_params = {
    "objective":       "reg:squarederror",
    "eval_metric":     "rmse",
    "tree_method":     "exact",
    "learning_rate":   0.08,
    "n_estimators":    400,
    "max_depth":       3,
    "gamma":           0.1,
    "subsample":       0.9,
    "colsample_bytree":0.9,
    "reg_lambda":      3, # L2 정규화
    "random_state":    RANDOM_STATE,
}

model = XGBRegressor(**xgb_params)

# EarlyStopping 콜백: 100라운드 동안 개선 없으면 멈추고, 최고 모델 저장
es_cb = EarlyStopping(
    rounds=100,
    metric_name="rmse",  # eval_metric과 동일
    save_best=True
)

pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", model)
])

### 7. 학습(Early Stopping)

In [78]:
pipe.fit(
    X_train, y_train,
    model__eval_set=[(X_val, y_val)],
    model__verbose=False,
    model__callbacks=[es_cb]
)

val_pred = pipe.predict(X_val)
rmse = mean_squared_error(y_val, val_pred, squared=False)
print(f"Validation RMSE: {rmse:.4f}")

TypeError: XGBModel.fit() got an unexpected keyword argument 'callbacks'

### 8. 모델 저장

In [None]:
joblib.dump(pipe, "best_model.h5")
print("✅ best_model.h5 저장 완료")

### 9. 테스트 데이터 예측 & CSV 저장

In [None]:
loaded_model = joblib.load("best_model.h5")
test_pred = loaded_model.predict(test_df[feature_cols])
test_df["predict"] = np.clip(test_pred, 0, 1)  # 0~1 범위 클리핑
test_df.to_csv("test_with_pred.csv", index=False)
print("📄 test_with_pred.csv 저장 완료")
display(test_df.head())