In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Load data
train = pd.read_csv("cattle_data_train.csv")
test = pd.read_csv("cattle_data_test.csv")

# Target + ID
target = "Milk_Yield_L"
id_col = "Cattle_ID"

# Basic preprocessing
# (turn categories -> strings -> numeric encodings)
def preprocess(df):
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = df[col].astype("category").cat.codes
    return df

train_prep = preprocess(train.drop(columns=[target]))
test_prep = preprocess(test)

y = train[target]
X = train_prep

# 5-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_rmse = []
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n----- Fold {fold+1} -----")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    model.fit(X_train, y_train)

    preds = model.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    fold_rmse.append(rmse)

    print(f"Fold {fold+1} RMSE: {rmse:.4f}")
    models.append(model)

print("\n==========================")
print(f"Average RMSE: {np.mean(fold_rmse):.4f}")
print("==========================\n")

# -----------------------
# Train final model on full data
# -----------------------
final_model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
final_model.fit(X, y)

# -----------------------
# Create submission
# -----------------------
test_preds = final_model.predict(test_prep)

submission = pd.DataFrame({
    id_col: test[id_col],
    target: test_preds
})

submission.to_csv("submission.csv", index=False)
print("submission.csv created!")


----- Fold 1 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005833 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3988
[LightGBM] [Info] Number of data points in the train set: 168000, number of used features: 35
[LightGBM] [Info] Start training from score 15.587505
Fold 1 RMSE: 4.1518

----- Fold 2 -----
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3987
[LightGBM] [Info] Number of data points in the train set: 168000, number of used features: 35
[LightGBM] [Info] Start training from score 15.580781
Fold 2 RMSE: 4.1184

----- Fold 3 -----
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008844 seconds.
You can set `force_col_wise=true` to remove 

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# --- Reuse already-loaded train/test if they exist ---
# If not, load again
try:
    train
    test
    print("Reusing loaded train/test...")
except NameError:
    print("Loading data fresh...")
    train = pd.read_csv("cattle_data_train.csv")
    test = pd.read_csv("cattle_data_test.csv")

target = "Milk_Yield_L"
id_col = "Cattle_ID"

def preprocess(df):
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = df[col].astype("category").cat.codes
    return df

train_prep = preprocess(train.drop(columns=[target]))
test_prep = preprocess(test)

X = train_prep
y = train[target]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_rmse = []
xgb_models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n----- XGBoost Fold {fold+1} -----")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = xgb.XGBRegressor(
        n_estimators=700,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        tree_method="hist",      # fast for large data
        random_state=42
    )
    
    model.fit(X_train, y_train)
    
    preds = model.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    fold_rmse.append(rmse)
    
    print(f"Fold {fold+1} RMSE: {rmse:.4f}")
    xgb_models.append(model)

print("\n==========================")
print(f"XGBoost Average RMSE = {np.mean(fold_rmse):.4f}")
print("==========================\n")

# --- Train final model on all data ---
final_xgb = xgb.XGBRegressor(
    n_estimators=700,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    tree_method="hist",
    random_state=42
)

final_xgb.fit(X, y)

# --- Submission ---
test_preds = final_xgb.predict(test_prep)

submission_xgb = pd.DataFrame({
    id_col: test[id_col],
    target: test_preds
})

submission_xgb.to_csv("submission_xgb.csv", index=False)
print("submission_xgb.csv created!")

Reusing loaded train/test...

----- XGBoost Fold 1 -----
Fold 1 RMSE: 4.1855

----- XGBoost Fold 2 -----
Fold 2 RMSE: 4.1582

----- XGBoost Fold 3 -----
Fold 3 RMSE: 4.1531

----- XGBoost Fold 4 -----
Fold 4 RMSE: 4.1820

----- XGBoost Fold 5 -----
Fold 5 RMSE: 4.1717

XGBoost Average RMSE = 4.1701

submission_xgb.csv created!


In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# 5-Fold Cross-Validation for CatBoost with improved hyperparameters
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cat_fold_rmse = []
cat_models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n----- CatBoost Fold {fold+1} -----")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostRegressor(
        loss_function="RMSE",
        # increased for early stopping
        n_estimators=2000, 
        # lower learning rate (was 0.05)
        learning_rate=0.03,          
        # reduced depth (was 8) - key for reducing overfit
        depth=6,    
        # more aggressive subsampling (was 0.8)
        subsample=0.7,       
        # feature sampling per level         
        colsample_bylevel=0.8,        
         # L2 regularization (was default 3)
        l2_leaf_reg=5,              
        # Randomness for splits (was default 1)
        random_strength=1.5,
        # bayesian bootstrap intensity
        bagging_temperature=0.5,      
        random_seed=42,
        # stop if no improvement
        early_stopping_rounds=100,
        verbose=False
    )

    model.fit(
        X_train, y_train, 
        eval_set=(X_val, y_val),
         # use best iteration, not last
        use_best_model=True,
        verbose=False
    )

    preds = model.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    cat_fold_rmse.append(rmse)

    print(f"CatBoost Fold {fold+1} RMSE: {rmse:.4f}")
    print(f"Best iteration: {model.best_iteration_}")
    cat_models.append(model)

print("\n==========================")
print(f"CatBoost Average RMSE: {np.mean(cat_fold_rmse):.4f}")
print(f"CatBoost Std RMSE: {np.std(cat_fold_rmse):.4f}")
print("==========================\n")

# use ensemble averaging from fold models instead of retraining
# generalizes better than a single model on full data
cat_test_preds = np.zeros(len(test_prep))
for model in cat_models:
    cat_test_preds += model.predict(test_prep) / len(cat_models)

cat_submission = pd.DataFrame({
    id_col: test[id_col],
    target: cat_test_preds
})

cat_submission.to_csv("submission_catboost_improved.csv", index=False)
print("submission_catboost_improved.csv created!")

# feature importance analysis
feature_importance = final_cat_model.get_feature_importance()
feature_names = X.columns
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(importance_df.head(10))


----- CatBoost Fold 1 -----
CatBoost Fold 1 RMSE: 4.1354
Best iteration: 920

----- CatBoost Fold 2 -----
CatBoost Fold 2 RMSE: 4.0990
Best iteration: 998

----- CatBoost Fold 3 -----
CatBoost Fold 3 RMSE: 4.1054
Best iteration: 1088

----- CatBoost Fold 4 -----
CatBoost Fold 4 RMSE: 4.1321
Best iteration: 1149

----- CatBoost Fold 5 -----
CatBoost Fold 5 RMSE: 4.1240
Best iteration: 1123

CatBoost Average RMSE: 4.1192
CatBoost Std RMSE: 0.0145

submission_catboost_improved.csv created!

Top 10 Most Important Features:
             feature  importance
4         Age_Months   19.068072
31              Date   13.175092
5          Weight_kg   12.846055
6             Parity    9.965733
33  Feed_Quantity_lb    5.545830
10  Feed_Quantity_kg    5.077596
12    Water_Intake_L    4.886213
8       Days_in_Milk    3.982507
34          Mastitis    3.146792
7    Lactation_Stage    2.113956
