In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# submission functionality
# ------------------------
import os, sys
sys.path.append(os.path.abspath("..")) # so src/ is on the path

import importlib
import submission_utils
importlib.reload(submission_utils) # force reload latest code

from submission_utils import save_submission
# ------------------------
import xgboost as xgb

from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

In [18]:
# Load data
train = pd.read_csv("../../data/cattle_data_train.csv")
test = pd.read_csv("../../data/cattle_data_test.csv")

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

target = "Milk_Yield_L"
id_col = "Cattle_ID"


def preprocess_pipeline(df, encode_flag=True, target_col="Milk_Yield_L", n_clusters=10):
    milk_features = df.copy()
    
    # -----------------------
    # 1) Drop impossible targets (train only)
    # -----------------------
    if target_col in milk_features.columns:
        milk_features = milk_features[milk_features[target_col] >= 0]
        y = milk_features[target_col]
        milk_features = milk_features.drop(columns=[target_col])
    else:
        y = None

    # -----------------------
    # 2) Basic cleaning
    # -----------------------
    if "Breed" in milk_features.columns:
        milk_features["Breed"] = milk_features["Breed"].str.strip()
        milk_features["Breed"] = milk_features["Breed"].replace({"Holstien": "Holstein"})

    if "Housing_Score" in milk_features.columns:
        milk_features["Housing_Score"] = milk_features["Housing_Score"].fillna(
            milk_features["Housing_Score"].median()
        )

    if "Feed_Quantity_kg" in milk_features.columns and "Feed_Type" in milk_features.columns:
        milk_features["Feed_Quantity_kg"] = milk_features.groupby("Feed_Type")["Feed_Quantity_kg"].transform(
            lambda x: x.fillna(x.median())
        )
    


            

    # Fill any remaining numeric NaNs
    numeric_cols = milk_features.select_dtypes(include="number").columns.tolist()
    milk_features[numeric_cols] = milk_features[numeric_cols].fillna(milk_features[numeric_cols].median())

    # -----------------------
    # 3) Date features
    # -----------------------
    if "Date" in milk_features.columns:
        milk_features["Date"] = pd.to_datetime(milk_features["Date"])
        milk_features["year"] = milk_features["Date"].dt.year
        milk_features["month"] = milk_features["Date"].dt.month
        milk_features["day"] = milk_features["Date"].dt.day
        milk_features["dayofweek"] = milk_features["Date"].dt.dayofweek
        milk_features["weekofyear"] = milk_features["Date"].dt.isocalendar().week.astype(int)
        milk_features["quarter"] = milk_features["Date"].dt.quarter
        milk_features["is_weekend"] = milk_features["dayofweek"].isin([5, 6]).astype(int)
        milk_features["date_ordinal"] = milk_features["Date"].map(pd.Timestamp.toordinal)
        milk_features = milk_features.drop(columns=["Date"])
    
    

    # -----------------------
    # 5) Farm clustering
    # -----------------------
    if "Farm_ID" in milk_features.columns:
        # Use only numeric features for clustering (exclude IDs)
        farm_numeric_cols = milk_features.select_dtypes(include="number").columns.tolist()
        farm_numeric_cols = [c for c in farm_numeric_cols if c not in ["Cattle_ID"]]

        # Aggregate per farm
        farm_features = milk_features.groupby("Farm_ID")[farm_numeric_cols].mean()

        # Scale and cluster farms
        scaler = StandardScaler()
        farm_scaled = scaler.fit_transform(farm_features)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        farm_features["Cluster"] = kmeans.fit_predict(farm_scaled)

        # Map back to rows
        milk_features["Farm_Cluster"] = milk_features["Farm_ID"].map(farm_features["Cluster"])


    # -----------------------
    # INSERT: Weight_Efficiency_Z (Breed + Age_Year)
    # -----------------------
    if "Weight_kg" in milk_features.columns and "Age_Months" in milk_features.columns:
        milk_features["Age_Year"] = milk_features["Age_Months"] // 12
        group_cols = ["Breed", "Age_Year"]

        # Calculate cohort stats using transform so it maps back to original rows
        cohort_mean = milk_features.groupby(group_cols)["Weight_kg"].transform("mean")
        cohort_std = milk_features.groupby(group_cols)["Weight_kg"].transform("std")

        # Calculate Z-Score: How heavy is this cow relative to her peers?
        milk_features["Weight_Efficiency_Z"] = (
            (milk_features["Weight_kg"] - cohort_mean) / (cohort_std + 1e-5)
        )

        milk_features = milk_features.drop(columns=["Age_Year"])

    # -----------------------
    # 6) Drop raw IDs
    # -----------------------
    drop_cols = ["Cattle_ID"]
    milk_features = milk_features.drop(columns=[c for c in drop_cols if c in milk_features.columns])

    # -----------------------
    # 7) Optional one-hot encoding (we'll keep it OFF for CatBoost)
    # -----------------------
    if encode_flag:
        cat_cols = milk_features.select_dtypes(include="object").columns.tolist()
        milk_features = pd.get_dummies(milk_features, columns=cat_cols, drop_first=False)

    # Make sure Farm_Cluster exists even if something went weird
    if "Farm_Cluster" not in milk_features.columns:
        milk_features["Farm_Cluster"] = 0

    final_df = milk_features.copy()
    return final_df, y

In [None]:
# # -----------------------------------
# # LightGBM hyperparameter search
# # -----------------------------------
# from sklearn.model_selection import RandomizedSearchCV

# lgb_base = lgb.LGBMRegressor(
#     objective="regression",
#     random_state=42
# )

# lgb_param_dist = {
#     "num_leaves": [31, 63, 127, 255],
#     "max_depth": [-1, 6, 8, 10],
#     "learning_rate": [0.01, 0.03, 0.05, 0.08],
#     "n_estimators": [300, 600, 900, 1200],
#     "subsample": [0.7, 0.8, 0.9],
#     "colsample_bytree": [0.7, 0.8, 0.9],
#     "min_child_samples": [10, 20, 50, 100]
# }

# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# lgb_search = RandomizedSearchCV(
#     estimator=lgb_base,
#     param_distributions=lgb_param_dist,
#     n_iter=25,                         # reduce if too slow
#     scoring="neg_mean_squared_error",  # we'll sqrt it to get RMSE
#     cv=kf,
#     verbose=1,
#     n_jobs=-1,
#     random_state=42,
# )

# print("Starting LightGBM hyperparameter search...")
# lgb_search.fit(X, y)

# best_lgb_params = lgb_search.best_params_
# best_lgb_rmse = np.sqrt(-lgb_search.best_score_)

# print("\nBest LightGBM params:")
# print(best_lgb_params)
# print(f"Best LightGBM CV RMSE: {best_lgb_rmse:.4f}")

In [20]:
# Build train and test matrices from the new pipeline
X, y = preprocess_pipeline(train, encode_flag=False, target_col=target)
test_df, _ = preprocess_pipeline(test, encode_flag=False, target_col=None)

print("Train shape:", X.shape)
print("Test shape:", test_df.shape)

Train shape: (209926, 43)
Test shape: (40000, 43)


In [21]:
# ==============================================================================
# REPLACEMENT: Farm_Performance Target Encoding (The Missing Signal)
# ==============================================================================
from sklearn.model_selection import KFold

# FIX: Reset indices (Keep this from previous attempt, it worked)
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)
test_df = test_df.reset_index(drop=True) 

def add_target_encoding_features(train_df, test_df, y_train, target_col_name="Milk_Yield_L", n_splits=5):
    # CHANGE: Encode Farm_ID instead of Bio features
    # This captures "Managerial Quality" - the specific effect of this farm on yield
    encodings = {
        "Farm_Performance": ["Farm_ID"]
    }
    
    train_encoded = train_df.copy()
    test_encoded = test_df.copy()
    
    # Combine X and y temporarily for the training split
    train_temp = train_encoded.copy()
    train_temp[target_col_name] = y_train.values

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for new_col, group_cols in encodings.items():
        # --- A) TRAINING SET: K-Fold Encoding (Prevents Leakage) ---
        train_encoded[new_col] = 0.0
        
        for tr_ind, val_ind in kf.split(train_temp):
            X_tr, X_val = train_temp.iloc[tr_ind], train_temp.iloc[val_ind]
            
            # Calculate mean yield per Farm on the 'training' part
            means = X_tr.groupby(group_cols)[target_col_name].mean()
            
            # Map to the 'validation' part
            # Since it's just one column (Farm_ID), simple map works
            train_encoded.loc[val_ind, new_col] = X_val[group_cols[0]].map(means).fillna(X_tr[target_col_name].mean())

        # --- B) TEST SET: Global Mean Encoding ---
        # Compute global farm stats from full training data
        global_means = train_temp.groupby(group_cols)[target_col_name].mean()
        global_avg = train_temp[target_col_name].mean()
        
        # Map to test set
        test_encoded[new_col] = test_encoded[group_cols[0]].map(global_means).fillna(global_avg)

    # OPTIONAL: Drop the raw Farm_ID now that we have the encoded score
    # This keeps the model clean
    train_encoded = train_encoded.drop(columns=["Farm_ID"], errors='ignore')
    test_encoded = test_encoded.drop(columns=["Farm_ID"], errors='ignore')

    return train_encoded, test_encoded

# Execute
X, test_df = add_target_encoding_features(X, test_df, y)
print("Added feature: Farm_Performance")

# ... Continue to CatBoost training ...

Added feature: Farm_Performance


In [22]:
# Categorical columns for CatBoost (from the processed X)
cat_features = X.select_dtypes(include="object").columns.tolist()
print("CatBoost categorical features:", cat_features)

CatBoost categorical features: ['Breed', 'Climate_Zone', 'Management_System', 'Lactation_Stage', 'Feed_Type']


In [23]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
import numpy as np

# 5-Fold Cross-Validation for CatBoost
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cat_fold_rmse = []
cat_models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n----- CatBoost Fold {fold+1} -----")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostRegressor(
        loss_function="RMSE",
        n_estimators=1000,
        learning_rate=0.05,
        depth=6,          # you can try depth=6 like your teammate too
        subsample=0.8,
        random_seed=42,
        verbose=False
    )

    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        cat_features=cat_features,
        verbose=False
    )

    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    cat_fold_rmse.append(rmse)

    print(f"CatBoost Fold {fold+1} RMSE: {rmse:.4f}")
    cat_models.append(model)

print("\n==========================")
print(f"CatBoost Average RMSE: {np.mean(cat_fold_rmse):.4f}")
print("==========================\n")


----- CatBoost Fold 1 -----
CatBoost Fold 1 RMSE: 4.1073

----- CatBoost Fold 2 -----
CatBoost Fold 2 RMSE: 4.1007

----- CatBoost Fold 3 -----
CatBoost Fold 3 RMSE: 4.1208

----- CatBoost Fold 4 -----
CatBoost Fold 4 RMSE: 4.1066

----- CatBoost Fold 5 -----
CatBoost Fold 5 RMSE: 4.0996

CatBoost Average RMSE: 4.1070



In [None]:
# Train final CatBoost model on full data
final_cat_model = CatBoostRegressor(
    loss_function="RMSE",
    n_estimators=1000,
    learning_rate=0.05,
    depth=6,      # again, can try 6 if you want to match teammate
    subsample=0.8,
    random_seed=42,
    verbose=False
)

final_cat_model.fit(
    X,
    y,
    cat_features=cat_features,
    verbose=False
)

# Predict on test_df built from preprocess_pipeline
cat_test_preds = final_cat_model.predict(test_df)

cat_submission = pd.DataFrame({
    id_col: test[id_col],
    target: cat_test_preds
})

# save_submission(cat_submission, run_name="felipe_catboost_farmcluster_lactation")
print("submission_catboost.csv created!")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004004 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4625
[LightGBM] [Info] Number of data points in the train set: 209926, number of used features: 47
[LightGBM] [Info] Start training from score 15.595179
Final CatBoost TRAIN RMSE:  3.9662
Final LightGBM TRAIN RMSE:  3.8379
Ensemble(0.7 CB, 0.3 LGB) TRAIN RMSE: 3.9232
Saved submission -> /Users/felipebenitez/ML-Project/submissions/felipe_catboost_only__2025-11-15__22-57-41.csv


In [117]:

# THIS IS ENSEMBLE CODE



from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
import numpy as np

# 5-Fold Cross-Validation for a CatBoost ensemble
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define 3 diverse CatBoost configs
model_configs = [
    {
        "name": "cat_d8_lr005",
        "params": dict(
            loss_function="RMSE",
            n_estimators=1000,
            learning_rate=0.05,
            depth=8,
            subsample=0.8,
            l2_leaf_reg=3.0,
            verbose=False
        )
    },
    {
        "name": "cat_d6_lr003",
        "params": dict(
            loss_function="RMSE",
            n_estimators=1500,
            learning_rate=0.03,
            depth=6,
            subsample=0.9,
            l2_leaf_reg=5.0,
            verbose=False
        )
    },
    {
        "name": "cat_d10_lr004",
        "params": dict(
            loss_function="RMSE",
            n_estimators=800,
            learning_rate=0.04,
            depth=10,
            subsample=0.7,
            l2_leaf_reg=6.0,
            verbose=False
        )
    },
]

# Track per-model and ensemble RMSEs
per_model_fold_rmse = {cfg["name"]: [] for cfg in model_configs}
ens_fold_rmse = []
ens_fold_weights = []  # store best weights per fold

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n===== CatBoost Ensemble Fold {fold+1} =====")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Store val predictions from each model in this fold
    fold_val_preds = []

    # 1) Train each CatBoost variant
    for i, cfg in enumerate(model_configs):
        name = cfg["name"]
        params = cfg["params"].copy()

        # Slightly different random_seed per fold & model
        params["random_seed"] = 42 + fold * 10 + i

        model = CatBoostRegressor(**params)

        model.fit(
            X_train,
            y_train,
            eval_set=(X_val, y_val),
            cat_features=cat_features,
            verbose=False
        )

        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        per_model_fold_rmse[name].append(rmse)
        fold_val_preds.append(preds)

        print(f"{name} Fold {fold+1} RMSE: {rmse:.4f}")

    # 2) Find best non-negative weights (w1,w2,w3) summing to 1
    #    on this fold using a small grid search
    preds_matrix = np.vstack(fold_val_preds).T  # shape: [n_val, 3]

    alphas = np.linspace(0.0, 1.0, 11)  # 0.0, 0.1, ..., 1.0
    best_rmse = float("inf")
    best_w = None

    for a1 in alphas:
        for a2 in alphas:
            if a1 + a2 > 1.0:
                continue
            a3 = 1.0 - a1 - a2
            w = np.array([a1, a2, a3])
            blend = preds_matrix @ w
            rmse_blend = np.sqrt(mean_squared_error(y_val, blend))
            if rmse_blend < best_rmse:
                best_rmse = rmse_blend
                best_w = w

    ens_fold_rmse.append(best_rmse)
    ens_fold_weights.append(best_w)

    print(
        f"Ensemble Fold {fold+1} RMSE: {best_rmse:.4f} "
        f"(weights: w1={best_w[0]:.2f}, w2={best_w[1]:.2f}, w3={best_w[2]:.2f})"
    )

# 3) Print average RMSEs
print("\n==========================")
for cfg in model_configs:
    name = cfg["name"]
    avg_rmse = np.mean(per_model_fold_rmse[name])
    print(f"{name} Average RMSE: {avg_rmse:.4f}")

ens_avg_rmse = np.mean(ens_fold_rmse)
print(f"Ensemble Average RMSE: {ens_avg_rmse:.4f}")

# Average weights over folds for final test ensemble
ens_fold_weights = np.array(ens_fold_weights)  # shape: [n_folds, 3]
global_weights = ens_fold_weights.mean(axis=0)
print(
    f"Average ensemble weights over folds: "
    f"w1={global_weights[0]:.3f}, w2={global_weights[1]:.3f}, w3={global_weights[2]:.3f}"
)
print("==========================\n")

# 4) Train final 3 CatBoost models on all data and ensemble on test

final_test_preds = []
for i, cfg in enumerate(model_configs):
    name = cfg["name"]
    params = cfg["params"].copy()
    params["random_seed"] = 999 + i  # fixed seeds for final models

    print(f"Training final model: {name}")
    model = CatBoostRegressor(**params)
    model.fit(
        X,
        y,
        cat_features=cat_features,
        verbose=False
    )

    preds_test = model.predict(test_df)
    final_test_preds.append(preds_test)

final_test_preds = np.vstack(final_test_preds).T  # [n_test, 3]

# Use global_weights from CV
final_ensemble_test = final_test_preds @ global_weights  # shape: (n_test,)

# Build submission DataFrame: [Cattle_ID, Milk_Yield_L]
cat_submission = pd.DataFrame({
    id_col: test[id_col].values,     # "Cattle_ID"
    target: final_ensemble_test      # "Milk_Yield_L"
})

save_submission(cat_submission, run_name="felipe_catboost_catensemble")
print("Saved submission_cat_ensemble.csv")


===== CatBoost Ensemble Fold 1 =====
cat_d8_lr005 Fold 1 RMSE: 4.1093
cat_d6_lr003 Fold 1 RMSE: 4.1080
cat_d10_lr004 Fold 1 RMSE: 4.1117
Ensemble Fold 1 RMSE: 4.1075 (weights: w1=0.30, w2=0.60, w3=0.10)

===== CatBoost Ensemble Fold 2 =====
cat_d8_lr005 Fold 2 RMSE: 4.1027
cat_d6_lr003 Fold 2 RMSE: 4.1018
cat_d10_lr004 Fold 2 RMSE: 4.1037
Ensemble Fold 2 RMSE: 4.1007 (weights: w1=0.20, w2=0.50, w3=0.30)

===== CatBoost Ensemble Fold 3 =====
cat_d8_lr005 Fold 3 RMSE: 4.1250
cat_d6_lr003 Fold 3 RMSE: 4.1212
cat_d10_lr004 Fold 3 RMSE: 4.1266
Ensemble Fold 3 RMSE: 4.1212 (weights: w1=0.00, w2=0.90, w3=0.10)

===== CatBoost Ensemble Fold 4 =====
cat_d8_lr005 Fold 4 RMSE: 4.1059
cat_d6_lr003 Fold 4 RMSE: 4.1065
cat_d10_lr004 Fold 4 RMSE: 4.1101
Ensemble Fold 4 RMSE: 4.1051 (weights: w1=0.60, w2=0.40, w3=-0.00)

===== CatBoost Ensemble Fold 5 =====
cat_d8_lr005 Fold 5 RMSE: 4.1009
cat_d6_lr003 Fold 5 RMSE: 4.0992
cat_d10_lr004 Fold 5 RMSE: 4.1059
Ensemble Fold 5 RMSE: 4.0988 (weights: w1=0.3

In [None]:
# 5-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_rmse = []
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n----- Fold {fold+1} -----")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    model.fit(X_train, y_train)

    preds = model.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    fold_rmse.append(rmse)

    print(f"Fold {fold+1} RMSE: {rmse:.4f}")
    models.append(model)

print("\n==========================")
print(f"Average RMSE: {np.mean(fold_rmse):.4f}")
print("==========================\n")

LightGBM categorical features: ['Breed', 'Climate_Zone', 'Management_System', 'Lactation_Stage', 'Feed_Type']

----- LightGBM Fold 1 -----
LightGBM Fold 1 RMSE: 5.3580
  -> best_iteration: 45

----- LightGBM Fold 2 -----
LightGBM Fold 2 RMSE: 5.3416
  -> best_iteration: 7

----- LightGBM Fold 3 -----
LightGBM Fold 3 RMSE: 5.3441
  -> best_iteration: 1

----- LightGBM Fold 4 -----
LightGBM Fold 4 RMSE: 5.3530
  -> best_iteration: 21

----- LightGBM Fold 5 -----
LightGBM Fold 5 RMSE: 5.3188
  -> best_iteration: 32

LightGBM Average RMSE: 5.3431



In [None]:
# --- Reuse already-loaded train/test if they exist ---
# If not, load again
try:
    train
    test
    print("Reusing loaded train/test...")
except NameError:
    print("Loading data fresh...")
    train = pd.read_csv("../../data/cattle_data_train.csv")
    test = pd.read_csv("../../data/cattle_data_test.csv")

target = "Milk_Yield_L"
id_col = "Cattle_ID"

def preprocess(df):
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = df[col].astype("category").cat.codes
    return df

train_prep = preprocess(train.drop(columns=[target]))
test_prep = preprocess(test)

X = train_prep
y = train[target]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_rmse = []
xgb_models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n----- XGBoost Fold {fold+1} -----")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = xgb.XGBRegressor(
        n_estimators=700,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        tree_method="hist",      # fast for large data
        random_state=42
    )
    
    model.fit(X_train, y_train)
    
    preds = model.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    fold_rmse.append(rmse)
    
    print(f"Fold {fold+1} RMSE: {rmse:.4f}")
    xgb_models.append(model)

print("\n==========================")
print(f"XGBoost Average RMSE = {np.mean(fold_rmse):.4f}")
print("==========================\n")

# --- Train final model on all data ---
final_xgb = xgb.XGBRegressor(
    n_estimators=700,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    tree_method="hist",
    random_state=42
)

final_xgb.fit(X, y)

# --- Submission ---
test_preds = final_xgb.predict(test_prep)

submission_xgb = pd.DataFrame({
    id_col: test[id_col],
    target: test_preds
})

# save_submission(submission, run_name="felipe_model")
print("submission_xgb.csv created!")

Reusing loaded train/test...

----- XGBoost Fold 1 -----
Fold 1 RMSE: 4.1888

----- XGBoost Fold 2 -----
Fold 2 RMSE: 4.1517

----- XGBoost Fold 3 -----
Fold 3 RMSE: 4.1568

----- XGBoost Fold 4 -----
Fold 4 RMSE: 4.1814

----- XGBoost Fold 5 -----
Fold 5 RMSE: 4.1694

XGBoost Average RMSE = 4.1696

submission_xgb.csv created!


In [89]:
import pandas as pd
import numpy as np

train = pd.read_csv("../../data/cattle_data_train.csv")

print("="*60)
print("DATASET STRUCTURE ANALYSIS")
print("="*60)

# Date analysis
print("\n1. DATE PATTERNS:")
train['Date'] = pd.to_datetime(train['Date'])
print(f"Date range: {train['Date'].min()} to {train['Date'].max()}")
print(f"Unique dates: {train['Date'].nunique()}")
print(f"Total rows: {len(train)}")
print(f"\nDates per cattle (sample 5):")
print(train.groupby('Cattle_ID')['Date'].nunique().head())

# Farm analysis
print("\n2. FARM PATTERNS:")
print(f"Unique farms: {train['Farm_ID'].nunique()}")
print(f"Avg rows per farm: {len(train) / train['Farm_ID'].nunique():.1f}")
print(f"\nTop 5 farms by count:")
print(train['Farm_ID'].value_counts().head())

# Breed analysis
print("\n3. BREED PATTERNS:")
print(train['Breed'].value_counts())

# Climate analysis
print("\n4. CLIMATE × BREED:")
print(pd.crosstab(train['Climate_Zone'], train['Breed']))

# Correlation with target
print("\n5. TOP CORRELATIONS WITH TARGET:")
numeric_cols = train.select_dtypes(include=[np.number]).columns
corr = train[numeric_cols].corr()['Milk_Yield_L'].sort_values(ascending=False)
print(corr.head(10))

print("\n6. NEGATIVE CORRELATIONS:")
print(corr.tail(10))

DATASET STRUCTURE ANALYSIS

1. DATE PATTERNS:
Date range: 2022-01-01 00:00:00 to 2024-12-30 00:00:00
Unique dates: 1095
Total rows: 210000

Dates per cattle (sample 5):
Cattle_ID
CATTLE_000001    1
CATTLE_000002    1
CATTLE_000003    1
CATTLE_000004    1
CATTLE_000005    1
Name: Date, dtype: int64

2. FARM PATTERNS:
Unique farms: 1000
Avg rows per farm: 210.0

Top 5 farms by count:
Farm_ID
FARM_0842    252
FARM_0724    251
FARM_0405    250
FARM_0571    250
FARM_0937    249
Name: count, dtype: int64

3. BREED PATTERNS:
Breed
Holstein        104775
Jersey           42183
Guernsey         31672
Brown Swiss      31155
Holstien           112
 Brown Swiss        57
Brown Swiss         46
Name: count, dtype: int64

4. CLIMATE × BREED:
Breed           Brown Swiss  Brown Swiss  Brown Swiss   Guernsey  Holstein  \
Climate_Zone                                                                 
Arid                     12         5235             7      5190     17428   
Continental               5 

In [None]:
import os, sys
import importlib

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb      # still imported in case you need it later
import xgboost as xgb       # same
from catboost import CatBoostRegressor

import submission_utils
importlib.reload(submission_utils)
from submission_utils import save_submission

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import optuna  # make sure you have `pip install optuna` first


# ======================================================
# 0) Load data
# ======================================================

train = pd.read_csv("../../data/cattle_data_train.csv")
test = pd.read_csv("../../data/cattle_data_test.csv")

target = "Milk_Yield_L"
id_col = "Cattle_ID"


# ======================================================
# 1) Your existing preprocessing pipeline (UNCHANGED)
# ======================================================

def preprocess_pipeline(df, encode_flag=True, target_col="Milk_Yield_L", n_clusters=10):
    milk_features = df.copy()
    
    # -----------------------
    # 1) Drop impossible targets (train only)
    # -----------------------
    if target_col in milk_features.columns:
        milk_features = milk_features[milk_features[target_col] >= 0]
        y = milk_features[target_col]
        milk_features = milk_features.drop(columns=[target_col])
    else:
        y = None

    # -----------------------
    # 2) Basic cleaning
    # -----------------------
    if "Breed" in milk_features.columns:
        milk_features["Breed"] = milk_features["Breed"].str.strip()
        milk_features["Breed"] = milk_features["Breed"].replace({"Holstien": "Holstein"})

    if "Housing_Score" in milk_features.columns:
        milk_features["Housing_Score"] = milk_features["Housing_Score"].fillna(
            milk_features["Housing_Score"].median()
        )

    if "Feed_Quantity_kg" in milk_features.columns and "Feed_Type" in milk_features.columns:
        milk_features["Feed_Quantity_kg"] = milk_features.groupby("Feed_Type")["Feed_Quantity_kg"].transform(
            lambda x: x.fillna(x.median())
        )
    
    # Fill any remaining numeric NaNs
    numeric_cols = milk_features.select_dtypes(include="number").columns.tolist()
    milk_features[numeric_cols] = milk_features[numeric_cols].fillna(milk_features[numeric_cols].median())

    # -----------------------
    # 3) Date features
    # -----------------------
    if "Date" in milk_features.columns:
        milk_features["Date"] = pd.to_datetime(milk_features["Date"])
        milk_features["year"] = milk_features["Date"].dt.year
        milk_features["month"] = milk_features["Date"].dt.month
        milk_features["day"] = milk_features["Date"].dt.day
        milk_features["dayofweek"] = milk_features["Date"].dt.dayofweek
        milk_features["weekofyear"] = milk_features["Date"].dt.isocalendar().week.astype(int)
        milk_features["quarter"] = milk_features["Date"].dt.quarter
        milk_features["is_weekend"] = milk_features["dayofweek"].isin([5, 6]).astype(int)
        milk_features["date_ordinal"] = milk_features["Date"].map(pd.Timestamp.toordinal)
        milk_features = milk_features.drop(columns=["Date"])
    
    # -----------------------
    # 5) Farm clustering (your current version)
    # -----------------------
    if "Farm_ID" in milk_features.columns:
        # Use only numeric features for clustering (exclude IDs)
        farm_numeric_cols = milk_features.select_dtypes(include="number").columns.tolist()
        farm_numeric_cols = [c for c in farm_numeric_cols if c not in ["Cattle_ID"]]

        # Aggregate per farm
        farm_features = milk_features.groupby("Farm_ID")[farm_numeric_cols].mean()

        # Scale and cluster farms
        scaler = StandardScaler()
        farm_scaled = scaler.fit_transform(farm_features)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        farm_features["Cluster"] = kmeans.fit_predict(farm_scaled)

        # Map back to rows
        milk_features["Farm_Cluster"] = milk_features["Farm_ID"].map(farm_features["Cluster"])

    # -----------------------
    # 6) Drop raw IDs
    # -----------------------
    drop_cols = ["Cattle_ID", "Farm_ID"]
    milk_features = milk_features.drop(columns=[c for c in drop_cols if c in milk_features.columns])

    # -----------------------
    # 7) Optional one-hot encoding (we'll keep it OFF for CatBoost)
    # -----------------------
    if encode_flag:
        cat_cols = milk_features.select_dtypes(include="object").columns.tolist()
        milk_features = pd.get_dummies(milk_features, columns=cat_cols, drop_first=False)

    # Make sure Farm_Cluster exists even if something went weird
    if "Farm_Cluster" not in milk_features.columns:
        milk_features["Farm_Cluster"] = 0

    final_df = milk_features.copy()
    return final_df, y


# ======================================================
# 2) Build X, y, test_df (frozen pipeline)
# ======================================================

X, y = preprocess_pipeline(train, encode_flag=False, target_col=target)
test_df, _ = preprocess_pipeline(test, encode_flag=False, target_col=None)

print("Train shape:", X.shape)
print("Test shape:", test_df.shape)

# Categorical columns for CatBoost (from the processed X)
cat_features = X.select_dtypes(include="object").columns.tolist()
print("CatBoost categorical features:", cat_features)

# KFold for everything
kf = KFold(n_splits=5, shuffle=True, random_state=42)


# ======================================================
# 3) Optuna hyperparameter search (5-fold CV)
# ======================================================

def objective(trial):
    params = {
        "depth": trial.suggest_int("depth", 5, 7),
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.04),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 7.0),
        "subsample": trial.suggest_float("subsample", 0.7, 0.9),
        "random_strength": trial.suggest_float("random_strength", 0.5, 5.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        # you can add more (e.g., border_count) if desired
    }

    fold_rmses = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostRegressor(
            loss_function="RMSE",
            n_estimators=3000,            # big cap, rely on early stopping
            early_stopping_rounds=100,
            random_seed=42,
            thread_count=4,
            verbose=False,                # keep Optuna runs quiet
            **params
        )

        model.fit(
            X_train,
            y_train,
            eval_set=(X_val, y_val),
            cat_features=cat_features,   # using column names
            verbose=False
        )

        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        fold_rmses.append(rmse)

    mean_rmse = float(np.mean(fold_rmses))
    return mean_rmse


# You can bump this if you want; 40–60 is a nice compromise
N_TRIALS = 40  

print("\nStarting Optuna hyperparameter search...")
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=N_TRIALS, n_jobs=2, show_progress_bar=True)

best_params = study.best_params
best_cv_rmse = study.best_value

print("\nOptuna search complete.")
print("Best CV RMSE from Optuna:", best_cv_rmse)
print("Best params:", best_params)


# ======================================================
# 4) Final 5-fold CV ensemble with best_params
# ======================================================

cv_models = []
fold_rmses = []
fold_best_iters = []

print("\nTraining final 5-fold ensemble with best_params...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
    print(f"\n----- Final CV Fold {fold} -----")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # use SAME seed as Optuna for consistency
    model = CatBoostRegressor(
        loss_function="RMSE",
        n_estimators=3000,
        early_stopping_rounds=100,
        random_seed=42,
        thread_count=4,
        verbose=False,
        **best_params
    )

    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        cat_features=cat_features,
        verbose=False
    )

    preds_val = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds_val))
    best_iter = model.get_best_iteration()

    print(f"Fold {fold} RMSE: {rmse:.4f}, best_iter={best_iter}")

    fold_rmses.append(rmse)
    fold_best_iters.append(best_iter)
    cv_models.append(model)

final_cv_rmse = float(np.mean(fold_rmses))
print("\n===============================")
print(f"Final 5-fold ensemble CV RMSE: {final_cv_rmse:.4f}")
print("Fold RMSEs:", fold_rmses)
print("Best iters:", fold_best_iters)

# sanity check vs Optuna's own CV
if final_cv_rmse > best_cv_rmse + 0.001:
    print("⚠️ WARNING: Final CV RMSE is worse than Optuna's best!")
    print(f"Optuna best: {best_cv_rmse:.4f}, Final CV: {final_cv_rmse:.4f}")
    print("This might indicate some instability / differences in folds.")
print("===============================")


# ======================================================
# 5) CV-ensemble predictions on test_df (SAFE baseline)
# ======================================================

cv_test_preds = np.zeros(len(test_df), dtype=float)

for model in cv_models:
    cv_test_preds += model.predict(test_df)

cv_test_preds /= len(cv_models)

# This is your safest, fully CV-validated prediction vector
sub_cv = pd.DataFrame({
    id_col: test[id_col],
    target: cv_test_preds
})

save_submission(sub_cv, run_name="felipe_catboost_optuna_cv_only")
print("Saved CV-only submission: felipe_catboost_optuna_cv_only")


# ======================================================
# 6) Optional: full-data multi-seed ensemble
# ======================================================

avg_best_iter = int(np.mean(fold_best_iters))
print("\nAverage best_iteration across folds:", avg_best_iter)

n_estimators_full = avg_best_iter
print("Using n_estimators for full-data models:", n_estimators_full)

seed_list = [101, 202, 303, 404, 505]
full_seed_models = []
full_seed_preds = np.zeros(len(test_df), dtype=float)

for seed in seed_list:
    print(f"\nTraining full-data model with seed={seed}...")
    model = CatBoostRegressor(
        loss_function="RMSE",
        n_estimators=n_estimators_full,
        random_seed=seed,
        thread_count=4,
        verbose=False,
        **best_params
        # no early_stopping_rounds and no eval_set here
    )

    model.fit(
        X,
        y,
        cat_features=cat_features,
        verbose=False
    )

    full_seed_models.append(model)
    full_seed_preds += model.predict(test_df)

full_seed_preds /= len(seed_list)

sub_full = pd.DataFrame({
    id_col: test[id_col],
    target: full_seed_preds
})

save_submission(sub_full, run_name="felipe_catboost_optuna_full_seed_only")
print("Saved full-data seed ensemble submission: felipe_catboost_optuna_full_seed_only")


# ======================================================
# 7) Blended submissions (CV + full-data)
# ======================================================

# 70% CV ensemble, 30% full-data ensemble
alpha_70 = 0.7
blend_70_30 = alpha_70 * cv_test_preds + (1.0 - alpha_70) * full_seed_preds

sub_blend_70_30 = pd.DataFrame({
    id_col: test[id_col],
    target: blend_70_30
})
save_submission(sub_blend_70_30, run_name="felipe_catboost_optuna_blend_70_30")
print("Saved blend 70/30 submission: felipe_catboost_optuna_blend_70_30")

# 50% / 50% blend
alpha_50 = 0.5
blend_50_50 = alpha_50 * cv_test_preds + (1.0 - alpha_50) * full_seed_preds

sub_blend_50_50 = pd.DataFrame({
    id_col: test[id_col],
    target: blend_50_50
})
save_submission(sub_blend_50_50, run_name="felipe_catboost_optuna_blend_50_50")
print("Saved blend 50/50 submission: felipe_catboost_optuna_blend_50_50")


# ======================================================
# 8) EXTRA: OOF RMSE comparison of strategies
#     (CV-only vs "full-seed style" vs blends)
# ======================================================

print("\n\n=== Running OOF comparison for strategies (this is offline, NOT Kaggle) ===")

def rmse(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# OOF containers (same length as y)
cv_only_oof        = np.zeros(len(X), dtype=float)
full_style_oof     = np.zeros(len(X), dtype=float)
blend_70_30_oof    = np.zeros(len(X), dtype=float)
blend_50_50_oof    = np.zeros(len(X), dtype=float)

# Re-use same KFold splits and same cv_models (order aligned with enumerate start=1)
for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
    print(f"\n[OOF Eval] Fold {fold}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # 1) CV-only predictions (from already trained cv_models)
    cv_model = cv_models[fold - 1]
    cv_preds_val = cv_model.predict(X_val)

    # 2) "Full-seed style" ensemble trained ONLY on X_train (no leakage)
    fold_full_preds = np.zeros(len(val_idx), dtype=float)
    for seed in seed_list:
        model = CatBoostRegressor(
            loss_function="RMSE",
            n_estimators=n_estimators_full,
            random_seed=seed,
            verbose=False,
            thread_count=4,
            **best_params
        )
        model.fit(
            X_train,
            y_train,
            cat_features=cat_features,
            verbose=False
        )
        fold_full_preds += model.predict(X_val)

    fold_full_preds /= len(seed_list)

    # Store OOF preds
    cv_only_oof[val_idx]     = cv_preds_val
    full_style_oof[val_idx]  = fold_full_preds
    blend_70_30_oof[val_idx] = alpha_70 * cv_preds_val + (1.0 - alpha_70) * fold_full_preds
    blend_50_50_oof[val_idx] = alpha_50 * cv_preds_val + (1.0 - alpha_50) * fold_full_preds

# Compute OOF RMSEs for each strategy
rmse_cv_only     = rmse(y, cv_only_oof)
rmse_full_style  = rmse(y, full_style_oof)
rmse_blend_70_30 = rmse(y, blend_70_30_oof)
rmse_blend_50_50 = rmse(y, blend_50_50_oof)

print("\n=== OOF RMSE summary (lower is better) ===")
print(f"CV-only ensemble OOF RMSE:        {rmse_cv_only:.6f}")
print(f"Full-seed-style ensemble OOF RMSE: {rmse_full_style:.6f}")
print(f"Blend 70% CV / 30% full OOF RMSE: {rmse_blend_70_30:.6f}")
print(f"Blend 50% CV / 50% full OOF RMSE: {rmse_blend_50_50:.6f}")
print("Note: CV-only OOF RMSE should closely match 'Final 5-fold ensemble CV RMSE' above.")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-11-17 18:36:04,762] A new study created in memory with name: no-name-1a8eea34-828a-47e9-9eb1-d7daa167ceb6


Train shape: (209926, 41)
Test shape: (40000, 41)
CatBoost categorical features: ['Breed', 'Climate_Zone', 'Management_System', 'Lactation_Stage', 'Feed_Type']

Starting Optuna hyperparameter search...


Best trial: 0. Best value: 4.10664:   2%|▎         | 1/40 [04:02<2:37:25, 242.19s/it]

[I 2025-11-17 18:40:06,955] Trial 0 finished with value: 4.1066350616145835 and parameters: {'depth': 7, 'learning_rate': 0.030033228718532787, 'l2_leaf_reg': 1.7953972301724823, 'subsample': 0.7796965026065017, 'random_strength': 1.0696793682056738, 'bagging_temperature': 0.9530309362394171}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:   5%|▌         | 2/40 [04:12<1:07:03, 105.88s/it]

[I 2025-11-17 18:40:17,426] Trial 1 finished with value: 4.1069651873737705 and parameters: {'depth': 6, 'learning_rate': 0.03246286800397102, 'l2_leaf_reg': 6.934585342403436, 'subsample': 0.8892560134025628, 'random_strength': 1.9009017430415895, 'bagging_temperature': 0.0612661117999308}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:   8%|▊         | 3/40 [08:24<1:46:31, 172.74s/it]

[I 2025-11-17 18:44:29,731] Trial 3 finished with value: 4.107981500145322 and parameters: {'depth': 5, 'learning_rate': 0.03060417129254867, 'l2_leaf_reg': 3.8027511028361998, 'subsample': 0.7628897889666677, 'random_strength': 4.325941286066591, 'bagging_temperature': 0.7786329332665413}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  10%|█         | 4/40 [08:37<1:05:46, 109.63s/it]

[I 2025-11-17 18:44:42,597] Trial 2 finished with value: 4.10740570764789 and parameters: {'depth': 5, 'learning_rate': 0.024493405366674076, 'l2_leaf_reg': 3.6353347752597167, 'subsample': 0.707230473010704, 'random_strength': 2.2892214679264926, 'bagging_temperature': 0.2155123427828266}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  12%|█▎        | 5/40 [13:07<1:37:36, 167.34s/it]

[I 2025-11-17 18:49:12,271] Trial 5 finished with value: 4.107690360205317 and parameters: {'depth': 5, 'learning_rate': 0.028564706151943724, 'l2_leaf_reg': 2.3626314183815476, 'subsample': 0.7763443417159476, 'random_strength': 3.032977362587531, 'bagging_temperature': 0.5507477981809029}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  15%|█▌        | 6/40 [13:13<1:03:38, 112.32s/it]

[I 2025-11-17 18:49:17,783] Trial 4 finished with value: 4.106676906203669 and parameters: {'depth': 6, 'learning_rate': 0.021298257855780277, 'l2_leaf_reg': 6.460136786522908, 'subsample': 0.7349923254618709, 'random_strength': 1.2501129730356348, 'bagging_temperature': 0.5851409559207785}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  18%|█▊        | 7/40 [16:28<1:16:44, 139.54s/it]

[I 2025-11-17 18:52:33,364] Trial 6 finished with value: 4.106900723385433 and parameters: {'depth': 6, 'learning_rate': 0.032643554231493614, 'l2_leaf_reg': 5.1126445494184285, 'subsample': 0.7639782599311191, 'random_strength': 0.7424764599661797, 'bagging_temperature': 0.5274361850636387}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  20%|██        | 8/40 [16:55<55:18, 103.70s/it]  

[I 2025-11-17 18:53:00,319] Trial 7 finished with value: 4.108050508456788 and parameters: {'depth': 6, 'learning_rate': 0.03715447153262421, 'l2_leaf_reg': 3.1202924983964415, 'subsample': 0.8823717249230913, 'random_strength': 2.737813060872654, 'bagging_temperature': 0.832679423449802}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  22%|██▎       | 9/40 [19:43<1:03:53, 123.65s/it]

[I 2025-11-17 18:55:47,850] Trial 8 finished with value: 4.10712894781373 and parameters: {'depth': 7, 'learning_rate': 0.0372063943972149, 'l2_leaf_reg': 3.066826907246639, 'subsample': 0.7433660606551039, 'random_strength': 0.92382483862985, 'bagging_temperature': 0.9435129263785973}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  25%|██▌       | 10/40 [20:40<51:40, 103.34s/it] 

[I 2025-11-17 18:56:45,703] Trial 9 finished with value: 4.108473595370948 and parameters: {'depth': 6, 'learning_rate': 0.03612595478530639, 'l2_leaf_reg': 2.421872652069468, 'subsample': 0.8598219688266027, 'random_strength': 4.706821695030949, 'bagging_temperature': 0.29411014570143235}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  28%|██▊       | 11/40 [23:29<59:35, 123.31s/it]

[I 2025-11-17 18:59:34,295] Trial 10 finished with value: 4.107869715812108 and parameters: {'depth': 5, 'learning_rate': 0.03830312520468619, 'l2_leaf_reg': 5.112357991065945, 'subsample': 0.7935863532464469, 'random_strength': 4.266484446357278, 'bagging_temperature': 0.6264913331873335}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  30%|███       | 12/40 [25:43<59:01, 126.46s/it]

[I 2025-11-17 19:01:47,976] Trial 11 finished with value: 4.1078967863134634 and parameters: {'depth': 7, 'learning_rate': 0.027287662103531236, 'l2_leaf_reg': 1.00895727744874, 'subsample': 0.8167621612992455, 'random_strength': 3.841572475573071, 'bagging_temperature': 0.7293196793346711}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  32%|███▎      | 13/40 [28:54<1:05:42, 146.03s/it]

[I 2025-11-17 19:04:59,014] Trial 12 finished with value: 4.1068503307044795 and parameters: {'depth': 7, 'learning_rate': 0.022000819616776045, 'l2_leaf_reg': 1.3317752666375293, 'subsample': 0.8233925456406492, 'random_strength': 1.5769911781443726, 'bagging_temperature': 0.9836198146410235}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  35%|███▌      | 14/40 [31:21<1:03:28, 146.50s/it]

[I 2025-11-17 19:07:26,605] Trial 13 finished with value: 4.106876422073571 and parameters: {'depth': 7, 'learning_rate': 0.022202272557925288, 'l2_leaf_reg': 5.6294485257075175, 'subsample': 0.7253082034156811, 'random_strength': 1.4731605087905866, 'bagging_temperature': 0.9979819308612154}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  38%|███▊      | 15/40 [33:42<1:00:17, 144.71s/it]

[I 2025-11-17 19:09:47,167] Trial 14 finished with value: 4.106954616492338 and parameters: {'depth': 7, 'learning_rate': 0.025607587308905125, 'l2_leaf_reg': 6.51590579058398, 'subsample': 0.7184142194838564, 'random_strength': 1.32515565201582, 'bagging_temperature': 0.3912123875074608}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 0. Best value: 4.10664:  40%|████      | 16/40 [35:28<53:17, 133.22s/it]  

[I 2025-11-17 19:11:33,713] Trial 15 finished with value: 4.106740958173306 and parameters: {'depth': 6, 'learning_rate': 0.02626299033290152, 'l2_leaf_reg': 6.786081886657344, 'subsample': 0.7002672200682457, 'random_strength': 0.5709183840263852, 'bagging_temperature': 0.3092743770314209}. Best is trial 0 with value: 4.1066350616145835.


Best trial: 16. Best value: 4.1064:  42%|████▎     | 17/40 [38:45<58:22, 152.30s/it]

[I 2025-11-17 19:14:50,393] Trial 16 finished with value: 4.106398510192625 and parameters: {'depth': 6, 'learning_rate': 0.02006010827962057, 'l2_leaf_reg': 4.434343507978291, 'subsample': 0.7483861301411402, 'random_strength': 0.8777706129477674, 'bagging_temperature': 0.6453948564266518}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  45%|████▌     | 18/40 [38:58<40:30, 110.48s/it]

[I 2025-11-17 19:15:03,526] Trial 17 finished with value: 4.107661171901893 and parameters: {'depth': 6, 'learning_rate': 0.03334803917868082, 'l2_leaf_reg': 4.507009586304742, 'subsample': 0.7485421714075146, 'random_strength': 2.1481771797298777, 'bagging_temperature': 0.6734256967656533}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  48%|████▊     | 19/40 [42:46<50:58, 145.62s/it]

[I 2025-11-17 19:18:51,009] Trial 18 finished with value: 4.10756628257958 and parameters: {'depth': 7, 'learning_rate': 0.03371968895642867, 'l2_leaf_reg': 4.519233529009053, 'subsample': 0.7938956325903803, 'random_strength': 2.230417344316034, 'bagging_temperature': 0.6893165924057587}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  50%|█████     | 20/40 [43:36<39:00, 117.02s/it]

[I 2025-11-17 19:19:41,369] Trial 19 finished with value: 4.107437874958039 and parameters: {'depth': 7, 'learning_rate': 0.02981163612740578, 'l2_leaf_reg': 1.877287177795457, 'subsample': 0.8060920787883783, 'random_strength': 2.9633146331715228, 'bagging_temperature': 0.8628879909835876}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  52%|█████▎    | 21/40 [47:20<47:10, 148.96s/it]

[I 2025-11-17 19:23:24,793] Trial 20 finished with value: 4.107400725180578 and parameters: {'depth': 7, 'learning_rate': 0.030018802385249874, 'l2_leaf_reg': 1.6317341049787633, 'subsample': 0.8410941602515147, 'random_strength': 3.3178159118097934, 'bagging_temperature': 0.845108030783518}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  55%|█████▌    | 22/40 [48:33<37:54, 126.35s/it]

[I 2025-11-17 19:24:38,404] Trial 21 finished with value: 4.106536433303795 and parameters: {'depth': 6, 'learning_rate': 0.02340955831094361, 'l2_leaf_reg': 2.815489644128715, 'subsample': 0.8487654758428467, 'random_strength': 0.5052836552908216, 'bagging_temperature': 0.42024609910984945}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  57%|█████▊    | 23/40 [52:44<46:24, 163.78s/it]

[I 2025-11-17 19:28:49,502] Trial 22 finished with value: 4.106669315031203 and parameters: {'depth': 6, 'learning_rate': 0.020533797958888145, 'l2_leaf_reg': 5.840037868006504, 'subsample': 0.7393182725960709, 'random_strength': 1.1678961110629231, 'bagging_temperature': 0.4221993761406658}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  60%|██████    | 24/40 [53:18<33:17, 124.82s/it]

[I 2025-11-17 19:29:23,434] Trial 23 finished with value: 4.106739718599835 and parameters: {'depth': 6, 'learning_rate': 0.023423180609812397, 'l2_leaf_reg': 2.663993896088838, 'subsample': 0.8407637958717706, 'random_strength': 0.5116175060830896, 'bagging_temperature': 0.41124032305142444}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  62%|██████▎   | 25/40 [57:20<39:59, 159.94s/it]

[I 2025-11-17 19:33:25,291] Trial 24 finished with value: 4.1065106290625675 and parameters: {'depth': 6, 'learning_rate': 0.023740225569556593, 'l2_leaf_reg': 2.8288747962637615, 'subsample': 0.846071175407146, 'random_strength': 0.699342275403859, 'bagging_temperature': 0.45617487034191834}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  65%|██████▌   | 26/40 [58:44<32:01, 137.23s/it]

[I 2025-11-17 19:34:49,557] Trial 25 finished with value: 4.106680765873017 and parameters: {'depth': 6, 'learning_rate': 0.020173976783207856, 'l2_leaf_reg': 3.2853277559042393, 'subsample': 0.7777175316042767, 'random_strength': 1.0013652592519824, 'bagging_temperature': 0.1699275571019232}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  68%|██████▊   | 27/40 [1:02:51<36:51, 170.14s/it]

[I 2025-11-17 19:38:56,484] Trial 26 finished with value: 4.107059927548338 and parameters: {'depth': 6, 'learning_rate': 0.02003645553938417, 'l2_leaf_reg': 3.2807894436896374, 'subsample': 0.8631749102780778, 'random_strength': 1.7908882000908406, 'bagging_temperature': 0.46679314866346666}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  70%|███████   | 28/40 [1:03:19<25:29, 127.48s/it]

[I 2025-11-17 19:39:24,424] Trial 27 finished with value: 4.107344713976985 and parameters: {'depth': 5, 'learning_rate': 0.024021642693085724, 'l2_leaf_reg': 4.227111817093591, 'subsample': 0.8727227115802122, 'random_strength': 1.7502742930428385, 'bagging_temperature': 0.519740342805473}. Best is trial 16 with value: 4.106398510192625.


Best trial: 16. Best value: 4.1064:  72%|███████▎  | 29/40 [1:07:54<31:29, 171.76s/it]

[I 2025-11-17 19:43:59,484] Trial 28 finished with value: 4.1071017667363 and parameters: {'depth': 5, 'learning_rate': 0.02324321093252745, 'l2_leaf_reg': 4.204741087809179, 'subsample': 0.8672360438472356, 'random_strength': 0.7853534773712, 'bagging_temperature': 0.3098809048154888}. Best is trial 16 with value: 4.106398510192625.


Best trial: 29. Best value: 4.10639:  75%|███████▌  | 30/40 [1:08:23<21:29, 128.92s/it]

[I 2025-11-17 19:44:28,462] Trial 29 finished with value: 4.106391174299612 and parameters: {'depth': 6, 'learning_rate': 0.022872270426979868, 'l2_leaf_reg': 4.014881862534349, 'subsample': 0.847236734294946, 'random_strength': 0.731107887920747, 'bagging_temperature': 0.4635035306948895}. Best is trial 29 with value: 4.106391174299612.


Best trial: 29. Best value: 4.10639:  78%|███████▊  | 31/40 [1:12:11<23:48, 158.68s/it]

[I 2025-11-17 19:48:16,577] Trial 30 finished with value: 4.106742185275502 and parameters: {'depth': 6, 'learning_rate': 0.025698965440708266, 'l2_leaf_reg': 2.119365465321693, 'subsample': 0.842736469388962, 'random_strength': 0.5156466484870262, 'bagging_temperature': 0.6070860571369767}. Best is trial 29 with value: 4.106391174299612.


Best trial: 29. Best value: 4.10639:  80%|████████  | 32/40 [1:13:23<17:40, 132.52s/it]

[I 2025-11-17 19:49:28,065] Trial 31 finished with value: 4.10691715767228 and parameters: {'depth': 6, 'learning_rate': 0.025326241348709796, 'l2_leaf_reg': 3.6236468599762377, 'subsample': 0.8998197254947831, 'random_strength': 1.0591608371136803, 'bagging_temperature': 0.6096925441164875}. Best is trial 29 with value: 4.106391174299612.


Best trial: 29. Best value: 4.10639:  82%|████████▎ | 33/40 [1:17:29<19:26, 166.66s/it]

[I 2025-11-17 19:53:34,377] Trial 32 finished with value: 4.106845225980416 and parameters: {'depth': 6, 'learning_rate': 0.02253353547453913, 'l2_leaf_reg': 2.7390716617616846, 'subsample': 0.8534606754827547, 'random_strength': 1.0411471456831247, 'bagging_temperature': 0.4664352766106681}. Best is trial 29 with value: 4.106391174299612.


Best trial: 29. Best value: 4.10639:  85%|████████▌ | 34/40 [1:18:29<13:26, 134.50s/it]

[I 2025-11-17 19:54:33,832] Trial 33 finished with value: 4.1065102453425855 and parameters: {'depth': 6, 'learning_rate': 0.022473152362790745, 'l2_leaf_reg': 2.7648812978803323, 'subsample': 0.825516159876847, 'random_strength': 0.8207204129662253, 'bagging_temperature': 0.46535814766909717}. Best is trial 29 with value: 4.106391174299612.


Best trial: 29. Best value: 4.10639:  88%|████████▊ | 35/40 [1:22:58<14:34, 174.99s/it]

[I 2025-11-17 19:59:03,309] Trial 34 finished with value: 4.106499338330347 and parameters: {'depth': 6, 'learning_rate': 0.02144219023403332, 'l2_leaf_reg': 4.672094674220013, 'subsample': 0.8232211434553708, 'random_strength': 0.7663706319680869, 'bagging_temperature': 0.3548656280325466}. Best is trial 29 with value: 4.106391174299612.


Best trial: 29. Best value: 4.10639:  90%|█████████ | 36/40 [1:24:12<09:38, 144.72s/it]

[I 2025-11-17 20:00:17,401] Trial 35 finished with value: 4.10674606513646 and parameters: {'depth': 6, 'learning_rate': 0.021437969491837437, 'l2_leaf_reg': 4.8274965453274294, 'subsample': 0.8272245302831016, 'random_strength': 1.4455312399626012, 'bagging_temperature': 0.3528952583168568}. Best is trial 29 with value: 4.106391174299612.


Best trial: 29. Best value: 4.10639:  92%|█████████▎| 37/40 [1:28:52<09:16, 185.40s/it]

[I 2025-11-17 20:04:57,708] Trial 36 finished with value: 4.106682045624402 and parameters: {'depth': 6, 'learning_rate': 0.021366052730996142, 'l2_leaf_reg': 4.864301781371315, 'subsample': 0.8254602853352773, 'random_strength': 1.4110009768197087, 'bagging_temperature': 0.10366421105715207}. Best is trial 29 with value: 4.106391174299612.


Best trial: 29. Best value: 4.10639:  95%|█████████▌| 38/40 [1:30:19<05:11, 155.76s/it]

[I 2025-11-17 20:06:24,329] Trial 37 finished with value: 4.106685214153209 and parameters: {'depth': 6, 'learning_rate': 0.021224595425353376, 'l2_leaf_reg': 3.7288952126765063, 'subsample': 0.8079377679075879, 'random_strength': 2.0751816513193564, 'bagging_temperature': 0.20718472942570026}. Best is trial 29 with value: 4.106391174299612.


Best trial: 29. Best value: 4.10639:  98%|█████████▊| 39/40 [1:33:43<02:50, 170.24s/it]

[I 2025-11-17 20:09:48,357] Trial 38 finished with value: 4.107314015070822 and parameters: {'depth': 5, 'learning_rate': 0.024778472112600383, 'l2_leaf_reg': 3.805345305329153, 'subsample': 0.8100157870165497, 'random_strength': 1.9927500915285554, 'bagging_temperature': 0.2433839797830411}. Best is trial 29 with value: 4.106391174299612.


Best trial: 29. Best value: 4.10639: 100%|██████████| 40/40 [1:34:29<00:00, 141.73s/it]


[I 2025-11-17 20:10:33,822] Trial 39 finished with value: 4.107113798299714 and parameters: {'depth': 5, 'learning_rate': 0.02733868841994063, 'l2_leaf_reg': 5.343837851160658, 'subsample': 0.7589913249369241, 'random_strength': 0.8732091640464774, 'bagging_temperature': 0.5614393962636348}. Best is trial 29 with value: 4.106391174299612.

Optuna search complete.
Best CV RMSE from Optuna: 4.106391174299612
Best params: {'depth': 6, 'learning_rate': 0.022872270426979868, 'l2_leaf_reg': 4.014881862534349, 'subsample': 0.847236734294946, 'random_strength': 0.731107887920747, 'bagging_temperature': 0.4635035306948895}

Training final 5-fold ensemble with best_params...

----- Final CV Fold 1 -----
Fold 1 RMSE: 4.1070, best_iter=1157

----- Final CV Fold 2 -----
Fold 2 RMSE: 4.1002, best_iter=1127

----- Final CV Fold 3 -----
Fold 3 RMSE: 4.1201, best_iter=1093

----- Final CV Fold 4 -----
Fold 4 RMSE: 4.1063, best_iter=1094

----- Final CV Fold 5 -----
Fold 5 RMSE: 4.0984, best_iter=999

F

In [None]:
import os, sys
import importlib

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb      # still imported in case you need it later
import xgboost as xgb       # same
from catboost import CatBoostRegressor

import submission_utils
importlib.reload(submission_utils)
from submission_utils import save_submission

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import optuna  # make sure you have `pip install optuna` first


# ======================================================
# 0) Load data
# ======================================================

train = pd.read_csv("../../data/cattle_data_train.csv")
test = pd.read_csv("../../data/cattle_data_test.csv")

target = "Milk_Yield_L"
id_col = "Cattle_ID"


# ======================================================
# 1) Your existing preprocessing pipeline (UNCHANGED)
# ======================================================

def preprocess_pipeline(df, encode_flag=True, target_col="Milk_Yield_L", n_clusters=10):
    milk_features = df.copy()
    
    # -----------------------
    # 1) Drop impossible targets (train only)
    # -----------------------
    if target_col in milk_features.columns:
        milk_features = milk_features[milk_features[target_col] >= 0]
        y = milk_features[target_col]
        milk_features = milk_features.drop(columns=[target_col])
    else:
        y = None

    # -----------------------
    # 2) Basic cleaning
    # -----------------------
    if "Breed" in milk_features.columns:
        milk_features["Breed"] = milk_features["Breed"].str.strip()
        milk_features["Breed"] = milk_features["Breed"].replace({"Holstien": "Holstein"})

    if "Housing_Score" in milk_features.columns:
        milk_features["Housing_Score"] = milk_features["Housing_Score"].fillna(
            milk_features["Housing_Score"].median()
        )

    if "Feed_Quantity_kg" in milk_features.columns and "Feed_Type" in milk_features.columns:
        milk_features["Feed_Quantity_kg"] = milk_features.groupby("Feed_Type")["Feed_Quantity_kg"].transform(
            lambda x: x.fillna(x.median())
        )
    


            

    # Fill any remaining numeric NaNs
    numeric_cols = milk_features.select_dtypes(include="number").columns.tolist()
    milk_features[numeric_cols] = milk_features[numeric_cols].fillna(milk_features[numeric_cols].median())

    # -----------------------
    # 3) Date features
    # -----------------------
    if "Date" in milk_features.columns:
        milk_features["Date"] = pd.to_datetime(milk_features["Date"])
        milk_features["year"] = milk_features["Date"].dt.year
        milk_features["month"] = milk_features["Date"].dt.month
        milk_features["day"] = milk_features["Date"].dt.day
        milk_features["dayofweek"] = milk_features["Date"].dt.dayofweek
        milk_features["weekofyear"] = milk_features["Date"].dt.isocalendar().week.astype(int)
        milk_features["quarter"] = milk_features["Date"].dt.quarter
        milk_features["is_weekend"] = milk_features["dayofweek"].isin([5, 6]).astype(int)
        milk_features["date_ordinal"] = milk_features["Date"].map(pd.Timestamp.toordinal)
        milk_features = milk_features.drop(columns=["Date"])
    
    

    # -----------------------
    # 5) Farm clustering
    # -----------------------
    if "Farm_ID" in milk_features.columns:
        # Use only numeric features for clustering (exclude IDs)
        farm_numeric_cols = milk_features.select_dtypes(include="number").columns.tolist()
        farm_numeric_cols = [c for c in farm_numeric_cols if c not in ["Cattle_ID"]]

        # Aggregate per farm
        farm_features = milk_features.groupby("Farm_ID")[farm_numeric_cols].mean()

        # Scale and cluster farms
        scaler = StandardScaler()
        farm_scaled = scaler.fit_transform(farm_features)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        farm_features["Cluster"] = kmeans.fit_predict(farm_scaled)

        # Map back to rows
        milk_features["Farm_Cluster"] = milk_features["Farm_ID"].map(farm_features["Cluster"])


    # -----------------------
    # INSERT: Weight_Efficiency_Z (Breed + Age_Year)
    # -----------------------
    if "Weight_kg" in milk_features.columns and "Age_Months" in milk_features.columns:
        milk_features["Age_Year"] = milk_features["Age_Months"] // 12
        group_cols = ["Breed", "Age_Year"]

        # Calculate cohort stats using transform so it maps back to original rows
        cohort_mean = milk_features.groupby(group_cols)["Weight_kg"].transform("mean")
        cohort_std = milk_features.groupby(group_cols)["Weight_kg"].transform("std")

        # Calculate Z-Score: How heavy is this cow relative to her peers?
        milk_features["Weight_Efficiency_Z"] = (
            (milk_features["Weight_kg"] - cohort_mean) / (cohort_std + 1e-5)
        )

        milk_features = milk_features.drop(columns=["Age_Year"])

    # -----------------------
    # 6) Drop raw IDs
    # -----------------------
    drop_cols = ["Cattle_ID"]
    milk_features = milk_features.drop(columns=[c for c in drop_cols if c in milk_features.columns])

    # -----------------------
    # 7) Optional one-hot encoding (we'll keep it OFF for CatBoost)
    # -----------------------
    if encode_flag:
        cat_cols = milk_features.select_dtypes(include="object").columns.tolist()
        milk_features = pd.get_dummies(milk_features, columns=cat_cols, drop_first=False)

    # Make sure Farm_Cluster exists even if something went weird
    if "Farm_Cluster" not in milk_features.columns:
        milk_features["Farm_Cluster"] = 0

    final_df = milk_features.copy()
    return final_df, y


# ======================================================
# 2) Build X, y, test_df (frozen pipeline)
# ======================================================

X, y = preprocess_pipeline(train, encode_flag=False, target_col=target)
test_df, _ = preprocess_pipeline(test, encode_flag=False, target_col=None)

print("Train shape:", X.shape)
print("Test shape:", test_df.shape)



# ==============================================================================
# REPLACEMENT: Farm_Performance Target Encoding (The Missing Signal)
# ==============================================================================
from sklearn.model_selection import KFold

# FIX: Reset indices (Keep this from previous attempt, it worked)
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)
test_df = test_df.reset_index(drop=True) 

def add_target_encoding_features(train_df, test_df, y_train, target_col_name="Milk_Yield_L", n_splits=5):
    # CHANGE: Encode Farm_ID instead of Bio features
    # This captures "Managerial Quality" - the specific effect of this farm on yield
    encodings = {
        "Farm_Performance": ["Farm_ID"]
    }
    
    train_encoded = train_df.copy()
    test_encoded = test_df.copy()
    
    # Combine X and y temporarily for the training split
    train_temp = train_encoded.copy()
    train_temp[target_col_name] = y_train.values

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for new_col, group_cols in encodings.items():
        # --- A) TRAINING SET: K-Fold Encoding (Prevents Leakage) ---
        train_encoded[new_col] = 0.0
        
        for tr_ind, val_ind in kf.split(train_temp):
            X_tr, X_val = train_temp.iloc[tr_ind], train_temp.iloc[val_ind]
            
            # Calculate mean yield per Farm on the 'training' part
            means = X_tr.groupby(group_cols)[target_col_name].mean()
            
            # Map to the 'validation' part
            # Since it's just one column (Farm_ID), simple map works
            train_encoded.loc[val_ind, new_col] = X_val[group_cols[0]].map(means).fillna(X_tr[target_col_name].mean())

        # --- B) TEST SET: Global Mean Encoding ---
        # Compute global farm stats from full training data
        global_means = train_temp.groupby(group_cols)[target_col_name].mean()
        global_avg = train_temp[target_col_name].mean()
        
        # Map to test set
        test_encoded[new_col] = test_encoded[group_cols[0]].map(global_means).fillna(global_avg)

    # OPTIONAL: Drop the raw Farm_ID now that we have the encoded score
    # This keeps the model clean
    train_encoded = train_encoded.drop(columns=["Farm_ID"], errors='ignore')
    test_encoded = test_encoded.drop(columns=["Farm_ID"], errors='ignore')

    return train_encoded, test_encoded

# Execute
X, test_df = add_target_encoding_features(X, test_df, y)
print("Added feature: Farm_Performance")

# ... Continue to CatBoost training ...



# Categorical columns for CatBoost (from the processed X)
cat_features = X.select_dtypes(include="object").columns.tolist()
print("CatBoost categorical features:", cat_features)

# KFold for everything
kf = KFold(n_splits=5, shuffle=True, random_state=42)


# ======================================================
# 3) Optuna hyperparameter search (5-fold CV)
# ======================================================

def objective(trial):
    params = {
        "depth": trial.suggest_int("depth", 5, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.04),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 7.0),
        "subsample": trial.suggest_float("subsample", 0.7, 0.9),
        "random_strength": trial.suggest_float("random_strength", 0.5, 5.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        # you can add more (e.g., border_count) if desired
    }

    fold_rmses = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostRegressor(
            loss_function="RMSE",
            n_estimators=3000,            # big cap, rely on early stopping
            early_stopping_rounds=100,
            random_seed=42,
            thread_count=5,
            verbose=False,                # keep Optuna runs quiet
            **params
        )

        model.fit(
            X_train,
            y_train,
            eval_set=(X_val, y_val),
            cat_features=cat_features,   # using column names
            verbose=False
        )

        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        fold_rmses.append(rmse)

    mean_rmse = float(np.mean(fold_rmses))
    return mean_rmse


# You can bump this if you want; 40–60 is a nice compromise
N_TRIALS = 40  

print("\nStarting Optuna hyperparameter search...")
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=N_TRIALS, n_jobs=4, show_progress_bar=True)

best_params = study.best_params
best_cv_rmse = study.best_value

print("\nOptuna search complete.")
print("Best CV RMSE from Optuna:", best_cv_rmse)
print("Best params:", best_params)


# ======================================================
# 4) Final 5-fold CV ensemble with best_params
# ======================================================

cv_models = []
fold_rmses = []
fold_best_iters = []

print("\nTraining final 5-fold ensemble with best_params...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
    print(f"\n----- Final CV Fold {fold} -----")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # use SAME seed as Optuna for consistency
    model = CatBoostRegressor(
        loss_function="RMSE",
        n_estimators=3000,
        early_stopping_rounds=100,
        random_seed=42,
        thread_count=5,
        verbose=False,
        task_type="GPU",
        devices="0",   # or "0:1" if he wants multi-GPU (rarely needed)
        **best_params
    )

    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        cat_features=cat_features,
        verbose=False
    )

    preds_val = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds_val))
    best_iter = model.get_best_iteration()

    print(f"Fold {fold} RMSE: {rmse:.4f}, best_iter={best_iter}")

    fold_rmses.append(rmse)
    fold_best_iters.append(best_iter)
    cv_models.append(model)

final_cv_rmse = float(np.mean(fold_rmses))
print("\n===============================")
print(f"Final 5-fold ensemble CV RMSE: {final_cv_rmse:.4f}")
print("Fold RMSEs:", fold_rmses)
print("Best iters:", fold_best_iters)

# sanity check vs Optuna's own CV
if final_cv_rmse > best_cv_rmse + 0.001:
    print("⚠️ WARNING: Final CV RMSE is worse than Optuna's best!")
    print(f"Optuna best: {best_cv_rmse:.4f}, Final CV: {final_cv_rmse:.4f}")
    print("This might indicate some instability / differences in folds.")
print("===============================")


# ======================================================
# 5) CV-ensemble predictions on test_df (SAFE baseline)
# ======================================================

cv_test_preds = np.zeros(len(test_df), dtype=float)

for model in cv_models:
    cv_test_preds += model.predict(test_df)

cv_test_preds /= len(cv_models)

# This is your safest, fully CV-validated prediction vector
sub_cv = pd.DataFrame({
    id_col: test[id_col],
    target: cv_test_preds
})

save_submission(sub_cv, run_name="felipe_catboost_optuna_cv_only")
print("Saved CV-only submission: felipe_catboost_optuna_cv_only")


# ======================================================
# 6) Optional: full-data multi-seed ensemble
# ======================================================

avg_best_iter = int(np.mean(fold_best_iters))
print("\nAverage best_iteration across folds:", avg_best_iter)

n_estimators_full = avg_best_iter
print("Using n_estimators for full-data models:", n_estimators_full)

seed_list = [101, 202, 303, 404, 505]
full_seed_models = []
full_seed_preds = np.zeros(len(test_df), dtype=float)

for seed in seed_list:
    print(f"\nTraining full-data model with seed={seed}...")
    model = CatBoostRegressor(
        loss_function="RMSE",
        n_estimators=n_estimators_full,
        random_seed=seed,
        thread_count=5,
        verbose=False,
        task_type="GPU",
        devices="0",   # or "0:1" if he wants multi-GPU (rarely needed)
        **best_params
        # no early_stopping_rounds and no eval_set here
    )

    model.fit(
        X,
        y,
        cat_features=cat_features,
        verbose=False
    )

    full_seed_models.append(model)
    full_seed_preds += model.predict(test_df)

full_seed_preds /= len(seed_list)

sub_full = pd.DataFrame({
    id_col: test[id_col],
    target: full_seed_preds
})

save_submission(sub_full, run_name="felipe_catboost_optuna_full_seed_only")
print("Saved full-data seed ensemble submission: felipe_catboost_optuna_full_seed_only")


def rmse(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# ======================================================
# 7) Blended submissions (CV + full-data)
# ======================================================

cv_only_oof    = np.zeros(len(X), dtype=float)
full_style_oof = np.zeros(len(X), dtype=float)

for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
    print(f"\n[OOF Eval] Fold {fold}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # CV-only preds
    cv_model = cv_models[fold - 1]
    cv_preds_val = cv_model.predict(X_val)

    # Full-seed-style preds (train on fold train only)
    fold_full_preds = np.zeros(len(val_idx), dtype=float)
    for seed in seed_list:
        model = CatBoostRegressor(
            loss_function="RMSE",
            n_estimators=n_estimators_full,
            random_seed=seed,
            verbose=False,
            thread_count=5,
            task_type="GPU",
            devices="0",   # or "0:1" if he wants multi-GPU (rarely needed)
            **best_params
        )
        model.fit(X_train, y_train, cat_features=cat_features, verbose=False)
        fold_full_preds += model.predict(X_val)

    fold_full_preds /= len(seed_list)

    cv_only_oof[val_idx]    = cv_preds_val
    full_style_oof[val_idx] = fold_full_preds

rmse_cv_only    = rmse(y, cv_only_oof)
rmse_full_style = rmse(y, full_style_oof)

print("\n=== OOF RMSE (base strategies) ===")
print(f"CV-only OOF RMSE:        {rmse_cv_only:.6f}")
print(f"Full-seed-style OOF RMSE:{rmse_full_style:.6f}")

# Search best alpha
alphas = np.linspace(0, 1, 101)
best_alpha, best_rmse = None, 1e9

for a in alphas:
    blend_oof = a * cv_only_oof + (1 - a) * full_style_oof
    r = rmse(y, blend_oof)
    if r < best_rmse:
        best_rmse, best_alpha = r, a

print(f"\nBest alpha from OOF search: {best_alpha:.2f}")
print(f"Best blended OOF RMSE:      {best_rmse:.6f}")

# Final blended test preds using best_alpha
final_blend_test = best_alpha * cv_test_preds + (1 - best_alpha) * full_seed_preds

sub_blend_best = pd.DataFrame({
    id_col: test[id_col],
    target: final_blend_test
})
save_submission(sub_blend_best, run_name=f"felipe_catboost_optuna_blend_best_{best_alpha:.2f}")
print("Saved best-alpha blended submission.")

print("\n=== OOF RMSE comparison ===")
print(f"CV-only:             {rmse_cv_only:.6f}")
print(f"Full-seed-style:     {rmse_full_style:.6f}")
print(f"Best blend (alpha={best_alpha:.2f}): {best_rmse:.6f}")
