### 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor

In [2]:
# train, test 데이터를 불러옵니다.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

### 데이터 살펴보기

In [3]:
# 데이터 크기 확인
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# 데이터 샘플 확인
print(train.head())
print(test.head())

Train shape: (7497, 11), Test shape: (846, 10)
           ID 제조사      모델        차량상태   배터리용량 구동방식  주행거리(km)  보증기간(년) 사고이력  \
0  TRAIN_0000  P사  TayGTS  Nearly New  86.077  AWD     13642        0   No   
1  TRAIN_0001  K사    Niro  Nearly New  56.000  FWD     10199        6   No   
2  TRAIN_0002  A사      eT   Brand New  91.200  AWD      2361        7   No   
3  TRAIN_0003  A사  RSeTGT  Nearly New     NaN  AWD     21683        3   No   
4  TRAIN_0004  B사      i5   Pre-Owned  61.018  AWD    178205        1   No   

   연식(년)  가격(백만원)  
0      2   159.66  
1      0    28.01  
2      0    66.27  
3      0    99.16  
4      0    62.02  
         ID 제조사     모델        차량상태   배터리용량 구동방식  주행거리(km)  보증기간(년) 사고이력  연식(년)
0  TEST_000  P사  TayCT  Nearly New  76.093  AWD     14057        2   No      0
1  TEST_001  B사     iX   Brand New  90.000  AWD      7547        8   No      0
2  TEST_002  B사     i5   Brand New     NaN  RWD      7197        7  Yes      0
3  TEST_003  H사   ION5  Nearly New  68.479  AWD 

### 데이터 전처리

In [4]:
# 결측치 처리 (배터리용량: 평균값 대체)
train["배터리용량"] = train["배터리용량"].fillna(train["배터리용량"].mean())
test["배터리용량"] = test["배터리용량"].fillna(test["배터리용량"].mean())

# 범주형 변수 One-Hot Encoding 적용
categorical_features = ["제조사", "모델", "차량상태", "구동방식", "사고이력"]
numerical_features = ["배터리용량", "주행거리(km)", "보증기간(년)", "연식(년)"]

# ColumnTransformer 설정
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# 타겟 변수 분리
X = train.drop(columns=["ID", "가격(백만원)"])
y = train["가격(백만원)"]
X_test = test.drop(columns=["ID"])

# 데이터 변환 적용
X_transformed = preprocessor.fit_transform(X)
X_test_transformed = preprocessor.transform(X_test)

# 데이터 크기 확인
print(f"Transformed Train shape: {X_transformed.shape}, Test shape: {X_test_transformed.shape}")


Transformed Train shape: (7497, 40), Test shape: (846, 40)


### 데이터 분할

In [5]:
from sklearn.model_selection import train_test_split

# 학습 / 검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# 데이터 크기 확인
print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}")


Train shape: (5997, 40), Validation shape: (1500, 40)


### Model 학습

In [6]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, cross_val_score

# K-Fold 설정 (5-Fold)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [7]:
# 모델 정의
models = {
    "CatBoost": CatBoostRegressor(iterations=500, learning_rate=0.05, depth=8, loss_function='RMSE', verbose=0, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=8, random_state=42),
}
# 모델별 K-Fold 검증
kf_results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_transformed, y, scoring='neg_root_mean_squared_error', cv=kf)
    mean_rmse = -scores.mean()
    kf_results[name] = mean_rmse
    print(f"{name}: 평균 RMSE (5-Fold): {mean_rmse:.4f}")

CatBoost: 평균 RMSE (5-Fold): 1.3792
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001244 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 5997, number of used features: 40
[LightGBM] [Info] Start training from score 62.221487
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000838 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 441
[LightGBM] [Info] Number of data points in the train set: 5997, number of used features: 40
[LightGBM] [Info] Start training from score 62.375508
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000274 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=

### Model 평가

### Stacking Regressor

In [12]:
# 개별 모델을 활용한 Stacking 앙상블
stacking_model = StackingRegressor(
    estimators=[
        ('LightGBM', models['LightGBM']),
        ('CatBoost', models['CatBoost'])
    ],
    final_estimator=XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42),
    cv=kf
)

# 스태킹 모델 학습
print("Training Stacking Model...")
stacking_model.fit(X_train, y_train)

# 검증 데이터 예측
stacking_pred = stacking_model.predict(X_val)
stacking_rmse = np.sqrt(mean_squared_error(y_val, stacking_pred))
print(f"Stacking Model RMSE: {stacking_rmse:.4f}")

Training Stacking Model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 5997, number of used features: 40
[LightGBM] [Info] Start training from score 62.221487
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000285 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 439
[LightGBM] [Info] Number of data points in the train set: 4797, number of used features: 40
[LightGBM] [Info] Start training from score 62.193052
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000316 seconds.
You can set `force_row_wise=true` to remove the overhead.
A

### 최종 예측 및 제출 파일 생성

In [13]:
# 스태킹 모델을 활용한 테스트 데이터 예측
final_test_pred_stacking = stacking_model.predict(X_test_transformed)

# 결과 값 클리핑
final_test_pred_stacking = np.clip(final_test_pred_stacking, y.min(), y.max())

# 제출 파일 생성
submission = pd.DataFrame({'ID': test['ID'], '가격(백만원)': final_test_pred_stacking})
submission.to_csv("1.csv", index=False)