In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
import numpy as np

In [2]:
def load_housing_csv(p):
    if Path(p).exists():
        print(f"[INFO] Loaded: {p}")
        return pd.read_csv(p)

    # 폴백: /kaggle/input 전체에서 housing.csv 탐색
    hits = list(Path("/kaggle/input").rglob(p))
    if hits:
        chosen = str(hits[0])
        print(f"[INFO] Loaded (auto-found): {chosen}")
        return pd.read_csv(chosen)

    raise FileNotFoundError(
        "요청하신 데이터셋을 /kaggle/input 경로에서 찾지 못했습니다. "
        "'Add data'에서 데이터셋 추가 후 정확한 경로를 candidates에 넣어주세요."
    )

In [3]:
df = load_housing_csv("/kaggle/input/california-housing-prices/housing.csv")  # 또는 네 파일명

[INFO] Loaded: /kaggle/input/california-housing-prices/housing.csv


In [4]:
if "ocean_proximity" in df.columns:
    df["ocean_proximity"] = df["ocean_proximity"].astype("category")
print(df.shape)
df.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
REQUIRED_COLS = [
    "longitude", "latitude", "housing_median_age",
    "total_rooms", "total_bedrooms", "population",
    "households", "median_income", "median_house_value",
    "ocean_proximity"
]
missing = [c for c in REQUIRED_COLS if c not in df.columns]
assert not missing, f"다음 컬럼이 없습니다: {missing}"

In [6]:
def make_features(_df: pd.DataFrame) -> pd.DataFrame:
    d = _df.copy()
    eps = 1e-6
    d["rooms_per_household"]      = d["total_rooms"] / (d["households"] + eps)
    d["bedrooms_per_room"]        = d["total_bedrooms"] / (d["total_rooms"] + eps)
    d["population_per_household"] = d["population"] / (d["households"] + eps)
    d["lat_lon"]                  = d["latitude"] * d["longitude"]
    return d

In [7]:
print(df.shape)
df.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [8]:
data = df.dropna(subset=["median_house_value"]).reset_index(drop=True)
print(data.shape)
data.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [9]:
data = make_features(data)

print(data.shape)
data.head()

(20640, 14)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,lat_lon
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556,-4630.0724
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842,-4627.2492
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226,-4626.784
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945,-4627.1625
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467,-4627.1625


In [10]:
# from sklearn.cluster import KMeans

# coords = data[["longitude", "latitude"]].values

# k = 5  # 군집 개수 (보통 5~10 사이 실험)
# kmeans = KMeans(n_clusters=k, random_state=42)
# data["geo_cluster"] = kmeans.fit_predict(coords)

# # ===== 새로운 feature 추가 결과 =====
# data.groupby("geo_cluster").head(5)

# data["geo_cluster"] = data["geo_cluster"].astype("category")

In [11]:
data = pd.get_dummies(data, columns=['ocean_proximity'], drop_first=False)

In [12]:
X = data.drop('median_house_value', axis=1)
y = data['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household,lat_lon,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,5.017656,0.200576,3.691814,-3828.0513,False,False,False,False,True
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,4.473545,0.232703,1.738095,-3990.2632,False,False,False,False,True
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,5.645833,0.174486,2.723214,-4175.8368,False,False,False,False,True
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,4.002817,0.258269,3.994366,-3828.3259,False,False,False,False,True
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,6.268421,0.18094,2.3,-4406.244,False,True,False,False,False


In [13]:
num_cols = [c for c in X_train.columns if str(X_train[c].dtype) != "category" and X_train[c].dtype != "object"]
cat_cols = [c for c in X_train.columns if c not in num_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ],
    remainder="passthrough"
)

In [14]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_rmses = []
best_n_estimators = []

for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train), 1):
    X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]
    
    preprocessor.fit(X_tr)
    X_tr_enc = preprocessor.transform(X_tr)
    X_va_enc = preprocessor.transform(X_va)
    # X_tr_enc = X_tr
    # X_va_enc = X_va
    
    # 모델
    xgb = XGBRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42 + fold,
        n_jobs=-1,
        enable_categorical=True,
        tree_method="hist"
    )

    # early_stopping: 검증셋은 같은 인코더로 변환된 X_va_enc 사용
    xgb.fit(
        X_tr_enc, y_tr,
        eval_set=[(X_va_enc, y_va)],
        early_stopping_rounds=100,
        verbose=False
    )

    # 예측 및 RMSE
    y_pred_va = xgb.predict(X_va_enc)
    rmse = mean_squared_error(y_va, y_pred_va, squared=False)
    fold_rmses.append(rmse)

    # best_iteration 안전 추출
    best_iter = getattr(xgb, "best_iteration", None)
    if best_iter is None:
        best_iter = getattr(xgb, "best_ntree_limit", xgb.n_estimators)
    best_n_estimators.append(int(best_iter))

    print(f"[Fold {fold}] RMSE: {rmse:,.2f} | best_n_estimators ≈ {best_iter}")

print("\nCV RMSE (mean ± std): "
      f"{np.mean(fold_rmses):,.2f} ± {np.std(fold_rmses):,.2f}")
print("Per-fold best_n_estimators:", best_n_estimators)



[Fold 1] RMSE: 44,990.66 | best_n_estimators ≈ 1999




[Fold 2] RMSE: 43,185.74 | best_n_estimators ≈ 1679




[Fold 3] RMSE: 43,146.70 | best_n_estimators ≈ 1999




[Fold 4] RMSE: 44,738.17 | best_n_estimators ≈ 1983




[Fold 5] RMSE: 42,632.20 | best_n_estimators ≈ 1991

CV RMSE (mean ± std): 43,738.69 ± 943.07
Per-fold best_n_estimators: [1999, 1679, 1999, 1983, 1991]


In [15]:
# CV 결과에서 합리적인 트리 수 선택 (중앙값)
final_n_estimators = int(np.median(best_n_estimators)) if len(best_n_estimators) > 0 else 800
print("Final n_estimators (median of CV):", final_n_estimators)

# 전체 TRAIN으로 전처리기를 fit → 같은 변환을 TEST에도 적용
final_preprocessor = ColumnTransformer(
    [("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
    remainder="passthrough"
)
final_preprocessor.fit(X_train)

y_train_raw = y_train.copy()
y_test_raw  = y_test.copy()

# 로그 변환 (log1p = log(1 + x), 0값에도 안전)
y_train = np.log1p(y_train_raw)
y_test  = np.log1p(y_test_raw)

X_train_enc = final_preprocessor.transform(X_train)
X_test_enc  = final_preprocessor.transform(X_test)
print(y_train.head())

    
final_model = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    enable_categorical=True,
    tree_method="hist"
)

# 전체 TRAIN으로 재학습 
final_model.fit(
    X_train_enc, y_train
)

# 독립 TEST 평가
y_pred_test = final_model.predict(X_test_enc)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
r2_test = r2_score(y_test, y_pred_test)

print(f"[TEST] RMSE: {rmse_test:,.2f}")
print(f"[TEST] R²:   {r2_test:.3f}")

Final n_estimators (median of CV): 1991
14196    11.542494
8267     12.853440
17445    12.058738
14265    11.444657
2271     11.477309
Name: median_house_value, dtype: float64
[TEST] RMSE: 0.21
[TEST] R²:   0.867
