In [11]:
import os, sys
import numpy as np
import pandas as pd

from config_local import local_config

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor

In [13]:
# Train/test feature tables come from the *_PROCESS6_CSV paths in local_config
# (presumably the output of an earlier preprocessing step -- confirm).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        local_config.TRAIN_PROCESS6_CSV,
        local_config.TEST_PROCESS6_CSV,
    )
)

# The raw test file is indexed by "Id"; that index supplies the Id column of
# the submission written at the end of the notebook.
testRaw = pd.read_csv(local_config.TEST_CSV, index_col="Id")

In [14]:
# Split the training table into the feature matrix and the target column.
# Work on a copy so `train` itself keeps the "logSP" column.
X = train.copy()
y = X.pop("logSP")  # target (log SalePrice); X now holds every other column

In [15]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# With an integer random_state every call to kf.split(X) yields the same
# folds, so materialise them once instead of re-splitting per candidate.
folds = list(kf.split(X))

# ============================================
# Hyperparameter search space for CatBoost
# ============================================
learning_rates = [0.03]
depths         = [6]
l2_leaf_regs   = [3.0, 5.0]
iterations     = [2000]

# Flat list of every (lr, depth, l2, iters) combination, in the same order the
# nested loops would visit them.
param_grid = [
    (lr, depth, l2, iters)
    for lr in learning_rates
    for depth in depths
    for l2 in l2_leaf_regs
    for iters in iterations
]

best_params = None
best_cv_rmse = float("inf")
# Mean early-stopping iteration (across folds) of the winning candidate.
# NOTE: each fold's model is truncated at its best iteration on the validation
# fold (use_best_model=True) and then scored on that same fold, so the CV RMSE
# is optimistic, and a final model trained for the full `iterations` is not the
# model that was scored. This value is recorded so that choice can be made
# deliberately; best_params itself is left unchanged.
best_mean_iteration = None

print("Searching best CatBoost hyperparameters with 5-Fold CV...\n")

for lr, depth, l2, iters in param_grid:
    params = {
        "loss_function": "RMSE",   # we model logSP, so RMSE on log
        "learning_rate": lr,
        "depth": depth,
        "l2_leaf_reg": l2,
        "iterations": iters,
        "random_seed": 42,
        "verbose": 0,       # set >0 if you want training logs
        "thread_count": -1,
        "task_type": "GPU", # <<< GPU
        "devices": "0",     # <<< first GPU
    }

    fold_rmses = []
    fold_best_iters = []

    print(f"Testing params: lr={lr}, depth={depth}, l2={l2}, iters={iters}")

    for fold, (train_idx, val_idx) in enumerate(folds, start=1):
        print(f"    Starting fold {fold}...")
        X_tr = X.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_tr = y.iloc[train_idx]
        y_val = y.iloc[val_idx]

        model = CatBoostRegressor(**params)

        model.fit(
            X_tr, y_tr,
            eval_set=(X_val, y_val),
            use_best_model=True,
            verbose=500
        )

        # Predictions in log space (target is logSP)
        y_pred_log = model.predict(X_val)

        # RMSE computed directly on the log-scale target (no exp here)
        rmse = mean_squared_error(y_val, y_pred_log) ** 0.5
        fold_rmses.append(rmse)
        fold_best_iters.append(model.get_best_iteration())

        print(f"  Fold {fold} log-RMSE: {rmse:.5f}")

    mean_rmse = np.mean(fold_rmses)
    print(f"--> Mean CV log-RMSE for these params: {mean_rmse:.5f}\n")

    # get_best_iteration() may return None when no best iteration is
    # available; ignore such folds when averaging.
    known_best_iters = [it for it in fold_best_iters if it is not None]
    mean_best_iter = (
        int(round(np.mean(known_best_iters))) if known_best_iters else None
    )
    print(f"    Best iteration per fold: {fold_best_iters} (mean: {mean_best_iter})\n")

    if mean_rmse < best_cv_rmse:
        best_cv_rmse = mean_rmse
        best_params = params.copy()
        best_mean_iteration = mean_best_iter

print("\n=====================================")
print("Best CatBoost params found:")
print(best_params)
print(f"Best mean CV log-RMSE: {best_cv_rmse:.5f}")
print(f"Mean best iteration for these params: {best_mean_iteration}")
print("=====================================\n")


Searching best CatBoost hyperparameters with 5-Fold CV...

Testing params: lr=0.03, depth=6, l2=3.0, iters=2000
    Starting fold 1...
0:	learn: 0.3881723	test: 0.4014183	best: 0.4014183 (0)	total: 289ms	remaining: 9m 38s
50:	learn: 0.1797223	test: 0.1963092	best: 0.1963092 (50)	total: 13.7s	remaining: 8m 42s
100:	learn: 0.1296179	test: 0.1519045	best: 0.1519045 (100)	total: 24.6s	remaining: 7m 42s
150:	learn: 0.1131348	test: 0.1368452	best: 0.1368452 (150)	total: 36.1s	remaining: 7m 22s
200:	learn: 0.1051077	test: 0.1296359	best: 0.1296356 (199)	total: 47.3s	remaining: 7m 3s
250:	learn: 0.1002499	test: 0.1252559	best: 0.1252559 (250)	total: 1m 4s	remaining: 7m 29s
300:	learn: 0.0975264	test: 0.1230070	best: 0.1230070 (300)	total: 1m 15s	remaining: 7m 8s
350:	learn: 0.0955106	test: 0.1213460	best: 0.1213460 (350)	total: 1m 28s	remaining: 6m 56s
400:	learn: 0.0937882	test: 0.1202359	best: 0.1202359 (400)	total: 1m 40s	remaining: 6m 42s
450:	learn: 0.0925532	test: 0.1195196	best: 0.11951

In [17]:
# Display the parameter dict selected by the CV search cell above in the notebook.
best_params

{'loss_function': 'RMSE',
 'learning_rate': 0.03,
 'depth': 6,
 'l2_leaf_reg': 3.0,
 'iterations': 2000,
 'random_seed': 42,
 'verbose': 0,
 'thread_count': -1,
 'task_type': 'GPU',
 'devices': '0'}

In [None]:
# ============================================
# Final CatBoost model on ALL training data
# ============================================

from catboost import CatBoostRegressor

# best_params already includes task_type="GPU", devices="0"
final_cat = CatBoostRegressor(**best_params)

print("Training final CatBoost model on all data with best hyperparameters...")
final_cat.fit(X, y, verbose=200)

# Feed the model exactly the columns it was trained on, in training order.
# A column missing from `test` raises KeyError here instead of surfacing as a
# harder-to-read error (or silent misalignment) inside predict().
test_features = test[X.columns]

# Predict on Kaggle test (log space)
test_pred_log_cat = final_cat.predict(test_features)

# Back to original SalePrice
# NOTE(review): expm1 is the inverse of log1p; this assumes logSP was built as
# log1p(SalePrice) rather than log(SalePrice) -- confirm against preprocessing.
test_pred_real_cat = np.expm1(test_pred_log_cat)

# Ids come from the raw test file while features come from the processed one;
# they are paired purely by row position, so at minimum the counts must match.
if len(testRaw.index) != len(test_pred_real_cat):
    raise ValueError(
        f"Row count mismatch: {len(testRaw.index)} ids in raw test file vs "
        f"{len(test_pred_real_cat)} predictions from processed test file"
    )

submission_cat = pd.DataFrame({
    "Id": testRaw.index,
    "SalePrice": test_pred_real_cat
})

# Create the output directory if needed so to_csv cannot fail on a missing folder.
os.makedirs(local_config.SUBMISSIONS_DIR, exist_ok=True)
out_path_cat = os.path.join(local_config.SUBMISSIONS_DIR, "catboost_Model.csv")
submission_cat.to_csv(out_path_cat, index=False)

print(f"CatBoost submission saved to: {out_path_cat}")


Training final CatBoost model on all data with best hyperparameters...
0:	learn: 0.3908628	total: 13.4ms	remaining: 26.8s
200:	learn: 0.1044755	total: 2.57s	remaining: 23s
400:	learn: 0.0946803	total: 5.03s	remaining: 20.1s
600:	learn: 0.0897851	total: 7.46s	remaining: 17.4s
800:	learn: 0.0865154	total: 9.84s	remaining: 14.7s
1000:	learn: 0.0826969	total: 12.2s	remaining: 12.1s
1200:	learn: 0.0796999	total: 14.5s	remaining: 9.64s
1400:	learn: 0.0778909	total: 17s	remaining: 7.25s
1600:	learn: 0.0755194	total: 19.3s	remaining: 4.81s
1800:	learn: 0.0742099	total: 21.7s	remaining: 2.39s
1999:	learn: 0.0736174	total: 24s	remaining: 0us
CatBoost submission saved to: D:\Project\Kaggle\house-prices-starter\data\submissions\catboost_Model.csv
