In [1]:
# Standard library, numeric and data-frame dependencies.
import os
import numpy as np
import pandas as pd

# scikit-learn: splitting utilities and the Ridge regressor.
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import Ridge

import sys
# Put ../src (resolved against the current working directory) at the front of
# the module search path so the hp_preprocessing import below can be resolved.
sys.path.insert(0, os.path.abspath("../src"))

# Project-local helpers; their implementations live outside this notebook.
from hp_preprocessing import fit_preprocessors, transform_features, rmsle_from_log


In [4]:
# Read the training and test CSV files from ../data into data frames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

# Print the (rows, columns) shape of each frame, then preview ten training rows.
for shape_label, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(shape_label, frame.shape)
train_df.head(10)


Train shape: (1460, 81)
Test shape: (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [5]:
# Column names of the prediction target and of the row identifier.
TARGET = "SalePrice"
ID_COL = "Id"

# Features are every column except the target; the target is modelled as log1p(price).
X_raw = train_df.drop(columns=[TARGET]).copy()
y_log = np.log1p(train_df[TARGET].copy())

# Keep 20% of the rows aside for validation, with a fixed seed.
X_tr_raw, X_va_raw, y_tr, y_va = train_test_split(
    X_raw,
    y_log,
    test_size=0.2,
    random_state=42,
)

# The preprocessors are fitted on the training part only and then applied,
# in order, to the training part and the validation part.
pre = fit_preprocessors(X_tr_raw, id_col=ID_COL)
X_tr, X_va = (transform_features(part, pre) for part in (X_tr_raw, X_va_raw))

# Ridge regression on the log-scale target; fit() returns the estimator itself.
model = Ridge(alpha=20.0, random_state=42).fit(X_tr, y_tr)
pred_va = model.predict(X_va)

# Report the validation error and the sizes of the two column lists stored in `pre`.
holdout_rmsle = rmsle_from_log(y_va, pred_va)
print("Holdout RMSLE:", round(holdout_rmsle, 5))
n_numeric, n_categorical = len(pre["num_cols"]), len(pre["cat_cols"])
print("Numeric features:", n_numeric, "| Categorical features:", n_categorical)


Holdout RMSLE: 0.13761
Numeric features: 35 | Categorical features: 44


In [6]:
# 5-fold cross-validation of the preprocessing + Ridge pipeline.
# Everything created per fold carries a `fold_` prefix so that running this cell
# leaves the holdout cell's X_tr_raw / X_va_raw / y_tr / y_va / pre / X_tr / X_va
# untouched; reusing those names here would silently replace them with the
# last fold's values.
# NOTE(review): the recorded outputs show fold 1 and the holdout split with the
# same RMSLE (0.13761); presumably both seed-42 splits select the same rows — confirm.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmsles = []

# Discard the original index labels; rows are picked by position via .iloc below.
X_all = X_raw.reset_index(drop=True)
y_all = y_log.reset_index(drop=True)

for fold, (tr_idx, va_idx) in enumerate(kf.split(X_all), start=1):
    fold_X_tr_raw = X_all.iloc[tr_idx].copy()
    fold_X_va_raw = X_all.iloc[va_idx].copy()
    fold_y_tr = y_all.iloc[tr_idx].copy()
    fold_y_va = y_all.iloc[va_idx].copy()

    # Preprocessing is fitted on this fold's training rows only, then applied
    # to both the training and the validation rows.
    fold_pre = fit_preprocessors(fold_X_tr_raw, id_col=ID_COL)
    fold_X_tr = transform_features(fold_X_tr_raw, fold_pre)
    fold_X_va = transform_features(fold_X_va_raw, fold_pre)

    fold_model = Ridge(alpha=20.0, random_state=42)
    fold_model.fit(fold_X_tr, fold_y_tr)
    fold_pred = fold_model.predict(fold_X_va)

    fold_rmsle = rmsle_from_log(fold_y_va, fold_pred)
    rmsles.append(fold_rmsle)
    print(f"Fold {fold} RMSLE: {fold_rmsle:.5f}")

# Summary over folds; np.std is called with its default arguments.
print(f"CV RMSLE (mean ± std): {np.mean(rmsles):.5f} ± {np.std(rmsles):.5f}")


Fold 1 RMSLE: 0.13761
Fold 2 RMSLE: 0.12600
Fold 3 RMSLE: 0.22425
Fold 4 RMSLE: 0.12529
Fold 5 RMSLE: 0.11433
CV RMSLE (mean ± std): 0.14550 ± 0.04006


In [7]:
# Refit the preprocessors and the model on every labelled row, then predict the test set.
pre = fit_preprocessors(X_raw, id_col=ID_COL)
X_train = transform_features(X_raw, pre)
X_test  = transform_features(test_df.copy(), pre)

final_model = Ridge(alpha=20.0, random_state=42)
final_model.fit(X_train, y_log)

# The model was fitted on log1p(price); expm1 maps predictions back to the price scale.
pred_test = np.expm1(final_model.predict(X_test))

# Sanity checks before anything is written to disk.
assert len(pred_test) == len(test_df), "expected one prediction per test row"
assert np.isfinite(pred_test).all(), "predictions contain NaN or infinite values"

# Column names come from the notebook-level constants (ID_COL = "Id",
# TARGET = "SalePrice") so they cannot drift from the columns used for training.
submission = pd.DataFrame({ID_COL: test_df[ID_COL], TARGET: pred_test})

# One path literal drives the directory creation, the write and the message.
submission_path = "../outputs/submission.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"Saved: {submission_path}")
submission.head(10)


Saved: ../outputs/submission.csv


Unnamed: 0,Id,SalePrice
0,1461,114009.553955
1,1462,145042.607078
2,1463,169776.493433
3,1464,192516.283253
4,1465,197290.254637
5,1466,168038.552729
6,1467,185873.618976
7,1468,161154.129022
8,1469,189696.864723
9,1470,122262.144404
