In [4]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
import os

In [5]:
# Hyperparameter values that get unpacked into the CatBoost regressor below.
best_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=8,
    l2_leaf_reg=1,
)

In [6]:
# 1. Read the train and validation splits and stack them row-wise, so the
#    final model is fitted on both; the combined frame gets a fresh 0..n-1 index.
train_df = pd.read_csv("./train_test_splits/train.csv")
val_df = pd.read_csv("./train_test_splits/validate.csv")
full_train_df = pd.concat((train_df, val_df), axis=0, ignore_index=True)

In [7]:
# 2. Name the prediction targets; every other column except "date" is a feature.
target_cols = ["temperature", "rainfall", "wind_speed", "precipitation"]
feature_cols = [
    col
    for col in full_train_df.columns
    if col not in (*target_cols, "date")
]

# Feature columns handed to CatBoost as categorical.
cat_features = ["location_id", "day_of_week"]

In [8]:
# Bundle the feature matrix and the multi-column label into a CatBoost Pool,
# identifying the categorical columns by name.
train_pool = Pool(
    full_train_df.loc[:, feature_cols],
    full_train_df.loc[:, target_cols],
    cat_features=cat_features,
)

In [9]:
# 3. Configure the multi-target CatBoost regressor with the best
#    hyperparameters and fit it on the combined train + validation pool.
model = CatBoostRegressor(
    **best_params,
    loss_function="MultiRMSE",
    random_seed=42,
    verbose=200,  # log the learn metric every 200 iterations
    # Keep every trained iteration: fit() receives no eval set from which a
    # "best" iteration could be selected.
    use_best_model=False,
)

model.fit(train_pool)

0:	learn: 15.3993682	total: 443ms	remaining: 7m 22s
200:	learn: 4.9336070	total: 1m 28s	remaining: 5m 53s
400:	learn: 4.1939263	total: 2m 35s	remaining: 3m 51s
600:	learn: 3.7288821	total: 3m 42s	remaining: 2m 27s
800:	learn: 3.4008093	total: 4m 48s	remaining: 1m 11s
999:	learn: 3.1423005	total: 5m 57s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x239c87e6ba0>

In [10]:
# 4. Save the final model
model_path = "../../models/catboost_final_model.cbm"

# Make sure the destination directory exists before writing, so the save
# cannot fail on a missing folder and force the long training run to be repeated.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save_model(model_path)

print("Final model trained on train + validation data.")
print(f"Saved to: {model_path}")

Final model trained on train + validation data.
Saved to: ../../models/catboost_final_model.cbm
