In [None]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
import numpy as np

: 

In [None]:
def load_housing_csv(p, search_root="/kaggle/input"):
    """Read the housing CSV at ``p``, falling back to a search by file name.

    Parameters
    ----------
    p : str or path-like
        Expected location of the CSV file.
    search_root : str or path-like, default "/kaggle/input"
        Directory that is searched recursively for a file with the same
        *name* as ``p`` when ``p`` itself is not an existing file.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Raises
    ------
    FileNotFoundError
        If ``p`` is not a file and no file of that name exists under
        ``search_root``.
    """
    target = Path(p)
    if target.is_file():
        frame = pd.read_csv(target)
        # Log only after the read succeeded so the message is never misleading.
        print(f"[INFO] Loaded: {p}")
        return frame

    # Fallback: look for the bare file name anywhere under the search root.
    # rglob() rejects absolute patterns, so the full path cannot be used here.
    root = Path(search_root)
    hits = []
    if target.name and root.is_dir():
        # Sorted so the chosen file does not depend on directory listing order.
        hits = sorted(h for h in root.rglob(target.name) if h.is_file())
    if hits:
        chosen = str(hits[0])
        frame = pd.read_csv(chosen)
        print(f"[INFO] Loaded (auto-found): {chosen}")
        return frame

    raise FileNotFoundError(
        "요청하신 데이터셋을 /kaggle/input 경로에서 찾지 못했습니다. "
        "'Add data'에서 데이터셋 추가 후 정확한 경로를 candidates에 넣어주세요."
    )

In [None]:
# Path of the dataset inside the Kaggle input directory (swap in your own file name if needed).
HOUSING_CSV_PATH = "/kaggle/input/california-housing-prices/housing.csv"
df = load_housing_csv(HOUSING_CSV_PATH)

[INFO] Loaded: /kaggle/input/california-housing-prices/housing.csv


In [None]:
# Store the proximity labels with the pandas category dtype when the column exists.
if "ocean_proximity" in df:
    df["ocean_proximity"] = df["ocean_proximity"].astype("category")

# Quick look at the frame: dimensions first, then the leading rows.
print(f"{df.shape}")
df.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Columns the rest of the notebook reads; fail early if any are absent.
REQUIRED_COLS = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
    "ocean_proximity",
]
# Membership on a DataFrame tests its column labels.
missing = [name for name in REQUIRED_COLS if name not in df]
assert not missing, f"다음 컬럼이 없습니다: {missing}"

In [None]:
# Changed from v1 to v2
def make_features(_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``_df`` with four derived columns appended.

    Added columns:
      * ``rooms_per_household``      = total_rooms / (households + 1e-6)
      * ``bedrooms_per_room``        = total_bedrooms / (total_rooms + 1e-6)
      * ``population_per_household`` = population / (households + 1e-6)
      * ``lat_lon``                  = latitude * longitude

    The small epsilon in each denominator avoids division by exactly zero.
    The input frame is left unmodified.
    """
    eps = 1e-6
    households_safe = _df["households"] + eps
    rooms_safe = _df["total_rooms"] + eps
    return _df.assign(
        rooms_per_household=_df["total_rooms"] / households_safe,
        bedrooms_per_room=_df["total_bedrooms"] / rooms_safe,
        population_per_household=_df["population"] / households_safe,
        lat_lon=_df["latitude"] * _df["longitude"],
    )

In [None]:
# Re-inspect the loaded frame: (rows, columns) followed by the first five rows.
print(f"{df.shape}")
df.head(5)

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Keep only rows that have a target value, and renumber the index from zero.
data = df[df["median_house_value"].notna()].reset_index(drop=True)
print(f"{data.shape}")
data.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Changed from v1 to v2: append the engineered ratio / interaction columns.
data = data.pipe(make_features)

print(f"{data.shape}")
data.head()

In [None]:
# Show the dimensions and the first rows of the feature-augmented frame.
# `head` must be called: the bare attribute only displays a bound-method repr.
print(data.shape)
data.head()

(20640, 10)


<bound method NDFrame.head of        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.25     37.85                52.0       1274.0           235.0   
4        -122.25     37.85                52.0       1627.0           280.0   
...          ...       ...                 ...          ...             ...   
20635    -121.09     39.48                25.0       1665.0           374.0   
20636    -121.21     39.49                18.0        697.0           150.0   
20637    -121.22     39.43                17.0       2254.0           485.0   
20638    -121.32     39.43                18.0       1860.0           409.0   
20639    -121.24     39.37                16.0       2785.0           616.0   

       population  ho

In [None]:
# One-hot encode the proximity label into indicator columns (all levels kept).
# Guarded so that re-running this cell after the column has already been
# expanded is a no-op instead of raising KeyError.
if "ocean_proximity" in data.columns:
    data = pd.get_dummies(data, columns=["ocean_proximity"], drop_first=False)

In [None]:
# Target vector and feature matrix (everything except the target column).
y = data["median_house_value"]
X = data.drop(columns=["median_house_value"])

# Hold out 20% of the rows as an independent test set; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND


In [None]:
# Split the training columns by dtype: anything that is neither `category`
# nor `object` is treated as numeric; the rest are categorical.
num_cols = [
    name
    for name, kind in X_train.dtypes.items()
    if not (str(kind) == "category" or kind == "object")
]
cat_cols = [name for name in X_train.columns if name not in num_cols]

# One-hot encode the categorical columns (categories unseen at fit time are
# ignored at transform time); every other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore"),
            cat_cols,
        ),
    ],
    remainder="passthrough",
)

In [None]:
# 5-fold cross-validation on the training split (shuffled, fixed seed).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Number of trees for LightGBM (shared by CV and the final model)
cv_n_estimators = 2000

fold_rmses = []
best_n_estimators = []

for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train), 1):
    X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

    # Preprocessing: fit on this fold's training part only, then apply the
    # same fitted transform to the validation part (no leakage from validation).
    X_tr_enc = preprocessor.fit_transform(X_tr)
    X_va_enc = preprocessor.transform(X_va)

    # Model (LightGBM); the seed is varied per fold.
    # NOTE(review): per the LightGBM docs, `subsample` only takes effect when
    # `subsample_freq` > 0 (default 0), so row subsampling is probably inactive
    # as configured here — confirm whether `subsample_freq=1` was intended.
    lgbm = LGBMRegressor(
        n_estimators=cv_n_estimators,
        learning_rate=0.03,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42 + fold,
        n_jobs=-1,
        verbose=-1
    )

    # Fit
    lgbm.fit(X_tr_enc, y_tr)

    # Predict and compute RMSE. The square root is taken explicitly because the
    # `squared=False` argument of mean_squared_error is deprecated/removed in
    # recent scikit-learn releases.
    y_pred_va = lgbm.predict(X_va_enc)
    rmse = np.sqrt(mean_squared_error(y_va, y_pred_va))
    fold_rmses.append(rmse)

    # No early stopping is used, so simply record the configured tree count
    # (this is the same value for every fold).
    best_iter = getattr(lgbm, "n_estimators", cv_n_estimators)
    best_n_estimators.append(int(best_iter))

    print(f"[Fold {fold}] RMSE: {rmse:,.2f} | best_n_estimators ≈ {best_iter}")

print("\nCV RMSE (mean ± std): "
      f"{np.mean(fold_rmses):,.2f} ± {np.std(fold_rmses):,.2f}")
print("Per-fold best_n_estimators:", best_n_estimators)



[Fold 1] RMSE: 46,892.64 | best_n_estimators ≈ 1962




[Fold 2] RMSE: 45,724.80 | best_n_estimators ≈ 1972




[Fold 3] RMSE: 45,018.02 | best_n_estimators ≈ 1558




[Fold 4] RMSE: 46,445.73 | best_n_estimators ≈ 1525




[Fold 5] RMSE: 43,632.94 | best_n_estimators ≈ 1977

CV RMSE (mean ± std): 45,542.83 ± 1,148.34
Per-fold best_n_estimators: [1962, 1972, 1558, 1525, 1977]


In [None]:
# Pick a tree count from the CV results (median across folds); fall back to
# the CV setting if no fold results were recorded.
final_n_estimators = int(np.median(best_n_estimators)) if best_n_estimators else cv_n_estimators
print("Final n_estimators (median of CV):", final_n_estimators)

# Fit a fresh preprocessor on the whole TRAIN split, then apply the same
# fitted transform to TEST.
final_preprocessor = ColumnTransformer(
    [("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
    remainder="passthrough"
)
X_train_enc = final_preprocessor.fit_transform(X_train)
X_test_enc  = final_preprocessor.transform(X_test)
print(y_train.head())

# NOTE(review): these hyperparameters (learning_rate, max_depth, reg_alpha,
# reg_lambda) differ from the ones used in the CV loop, so the CV RMSE is not
# an estimate for this exact configuration — confirm this is intentional.
final_model = LGBMRegressor(
    n_estimators=final_n_estimators,
    learning_rate=0.02,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    reg_alpha=0.3,      # L1 regularization (Lasso)
    reg_lambda=0.3,     # L2 regularization (Ridge)
    n_jobs=-1,
    verbose=-1
)

# Refit on the whole TRAIN split
final_model.fit(
    X_train_enc, y_train
)

# Evaluate on the held-out TEST split. RMSE is computed as sqrt(MSE) because
# the `squared=False` argument of mean_squared_error is deprecated/removed in
# recent scikit-learn releases.
y_pred_test = final_model.predict(X_test_enc)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_test = r2_score(y_test, y_pred_test)

print(f"[TEST] RMSE: {rmse_test:,.2f}")
print(f"[TEST] R²:   {r2_test:.3f}")

Final n_estimators (median of CV): 1962
14196    103000.0
8267     382100.0
17445    172600.0
14265     93400.0
2271      96500.0
Name: median_house_value, dtype: float64
[TEST] RMSE: 44,726.86
[TEST] R²:   0.847
