In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# submission functionality
# ------------------------
import os, sys
sys.path.append(os.path.abspath("..")) # so src/ is on the path

import importlib
import submission_utils
importlib.reload(submission_utils) # force reload latest code

from submission_utils import save_submission
# ------------------------
import xgboost as xgb

from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

# # Load data
# train = pd.read_csv("../../data/cattle_data_train.csv")
# test = pd.read_csv("../../data/cattle_data_test.csv")

# # Target + ID
# target = "Milk_Yield_L"
# id_col = "Cattle_ID"

# # Basic preprocessing
# # (turn categories -> strings -> numeric encodings)
# def preprocess(df):
#     df = df.copy()
#     for col in df.columns:
#         if df[col].dtype == "object":
#             df[col] = df[col].astype("category").cat.codes
#     return df

# train_prep = preprocess(train.drop(columns=[target]))
# test_prep = preprocess(test)

# y = train[target]
# X = train_prep

# # 5-Fold Cross-Validation
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# fold_rmse = []
# models = []

# for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
#     print(f"\n----- Fold {fold+1} -----")

#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#     model = lgb.LGBMRegressor(
#         n_estimators=500,
#         learning_rate=0.05,
#         max_depth=-1,
#         num_leaves=64,
#         subsample=0.8,
#         colsample_bytree=0.8,
#         random_state=42
#     )

#     model.fit(X_train, y_train)

#     preds = model.predict(X_val)
#     mse = mean_squared_error(y_val, preds)
#     rmse = np.sqrt(mse)
#     fold_rmse.append(rmse)

#     print(f"Fold {fold+1} RMSE: {rmse:.4f}")
#     models.append(model)

# print("\n==========================")
# print(f"Average RMSE: {np.mean(fold_rmse):.4f}")
# print("==========================\n")

# # -----------------------
# # Train final model on full data
# # -----------------------
# final_model = lgb.LGBMRegressor(
#     n_estimators=500,
#     learning_rate=0.05,
#     max_depth=-1,
#     num_leaves=64,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     random_state=42
# )
# final_model.fit(X, y)

# # -----------------------
# # Create submission
# # -----------------------
# test_preds = final_model.predict(test_prep)

# submission = pd.DataFrame({
#     id_col: test[id_col],
#     target: test_preds
# })

# save_submission(submission, run_name="felipe_model")
# print("submission.csv created!")

In [70]:
# Load data
train = pd.read_csv("../../data/cattle_data_train.csv")
# drop 74 negative milk_yield_l rows. KEEP!!!
train = train[train["Milk_Yield_L"] >= 0].copy()
test = pd.read_csv("../../data/cattle_data_test.csv")

In [71]:
# Target + ID
target = "Milk_Yield_L"
id_col = "Cattle_ID"

# Basic preprocessing
# (turn categories -> strings -> numeric encodings)
def preprocess(df):
    milk_test = df.copy()

    # LACTATION CURVE
    # Peak lactation detection (biological peak at 50-100 days)
    milk_test['is_peak_lactation'] = (
        (milk_test['Days_in_Milk'] >= 40) & 
        (milk_test['Days_in_Milk'] <= 100)
    ).astype(int)
    
    milk_test['is_early_lactation'] = (milk_test['Days_in_Milk'] < 40).astype(int)
    milk_test['is_late_lactation'] = (milk_test['Days_in_Milk'] > 250).astype(int)
    
    # Polynomial terms (capture non-linear lactation curve)
    milk_test['dim_squared'] = milk_test['Days_in_Milk'] ** 2
    milk_test['dim_cubed'] = milk_test['Days_in_Milk'] ** 3
    milk_test['dim_log'] = np.log1p(milk_test['Days_in_Milk'])
    
    # Interaction with parity (older cows have different curves)
    milk_test['dim_parity'] = milk_test['Days_in_Milk'] * milk_test['Parity']


    # BASE FEATURES
    milk_test["Date"] = pd.to_datetime(milk_test["Date"])
    milk_test['year'] = milk_test['Date'].dt.year
    milk_test['month'] = milk_test['Date'].dt.month
    milk_test['day'] = milk_test['Date'].dt.day
    milk_test['dayofweek'] = milk_test['Date'].dt.dayofweek
    milk_test['weekofyear'] = milk_test['Date'].dt.isocalendar().week.astype(int)
    milk_test['quarter'] = milk_test['Date'].dt.quarter
    milk_test['is_weekend'] = milk_test['dayofweek'].isin([5, 6]).astype(int)
    milk_test['date_ordinal'] = milk_test['Date'].map(pd.Timestamp.toordinal)
    milk_test['Breed'] = milk_test['Breed'].str.strip()
    milk_test['Breed'] = milk_test['Breed'].replace({'Holstien': 'Holstein'})
    milk_test['Feed_Quantity_kg'] = milk_test.groupby('Feed_Type')['Feed_Quantity_kg'].transform(lambda x: x.fillna(x.median()))
    milk_test['Housing_Score'] = milk_test['Housing_Score'].transform(lambda x: x.fillna(x.median()))
    milk_test = milk_test.drop("Feed_Quantity_lb", axis=1)
    milk_test = milk_test.drop(id_col, axis=1)
    milk_test = milk_test.drop("Date", axis=1)
    for col in milk_test.columns:
        if milk_test[col].dtype == "object":
            milk_test[col] = milk_test[col].astype("category").cat.codes
    return milk_test

train_prep = preprocess(train.drop(columns=[target]))
test_prep = preprocess(test)

y = train[target]
X = train_prep

# Tell CatBoost which columns are categorical
# original object columns from the raw train
orig_cat_cols = train.select_dtypes(include=["object"]).columns.tolist()

# we dropped these from X, so exclude them
orig_cat_cols = [c for c in orig_cat_cols if c not in [id_col, "Date"]]

# keep only those that are actually in X after preprocess
cat_feature_cols = [c for c in orig_cat_cols if c in X.columns]

# CatBoost wants indices, not names
cat_features = [X.columns.get_loc(c) for c in cat_feature_cols]

print("CatBoost categorical features:", cat_feature_cols)

CatBoost categorical features: ['Breed', 'Climate_Zone', 'Management_System', 'Lactation_Stage', 'Feed_Type', 'Farm_ID']


In [None]:
# # -----------------------------------
# # LightGBM hyperparameter search
# # -----------------------------------
# from sklearn.model_selection import RandomizedSearchCV

# lgb_base = lgb.LGBMRegressor(
#     objective="regression",
#     random_state=42
# )

# lgb_param_dist = {
#     "num_leaves": [31, 63, 127, 255],
#     "max_depth": [-1, 6, 8, 10],
#     "learning_rate": [0.01, 0.03, 0.05, 0.08],
#     "n_estimators": [300, 600, 900, 1200],
#     "subsample": [0.7, 0.8, 0.9],
#     "colsample_bytree": [0.7, 0.8, 0.9],
#     "min_child_samples": [10, 20, 50, 100]
# }

# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# lgb_search = RandomizedSearchCV(
#     estimator=lgb_base,
#     param_distributions=lgb_param_dist,
#     n_iter=25,                         # reduce if too slow
#     scoring="neg_mean_squared_error",  # we'll sqrt it to get RMSE
#     cv=kf,
#     verbose=1,
#     n_jobs=-1,
#     random_state=42,
# )

# print("Starting LightGBM hyperparameter search...")
# lgb_search.fit(X, y)

# best_lgb_params = lgb_search.best_params_
# best_lgb_rmse = np.sqrt(-lgb_search.best_score_)

# print("\nBest LightGBM params:")
# print(best_lgb_params)
# print(f"Best LightGBM CV RMSE: {best_lgb_rmse:.4f}")

In [25]:
# Adjust path if needed
train = pd.read_csv("../../data/cattle_data_train.csv")

print("Shape:", train.shape)
print("\nDtypes:\n", train.dtypes)

# -----------------------------
# Missing values per column
# -----------------------------
print("\n=== Missing values per column ===")
missing = train.isna().sum()
print(missing[missing > 0].sort_values(ascending=False))

# -----------------------------
# Numeric summary stats
# -----------------------------
numeric_cols = train.select_dtypes(include=[np.number]).columns
print("\n=== Numeric summary (head) ===")
print(train[numeric_cols].describe().T.head(15))  # first 15 numeric features

# -----------------------------
# Categorical overview
# -----------------------------
cat_cols = train.select_dtypes(include=["object"]).columns
print("\n=== Categorical columns and num unique values ===")
for c in cat_cols:
    print(f"{c}: {train[c].nunique()} unique")

# Example: inspect top values for a few key categoricals
for c in ["Breed", "Climate_Zone", "Management_System", "Feed_Type", "Farm_ID"]:
    if c in train.columns:
        print(f"\nValue counts for {c}:")
        print(train[c].value_counts().head(10))

# -----------------------------
# Negative values in numeric columns
# -----------------------------
numeric = train[numeric_cols]
neg_mask = numeric < 0
neg_cols = neg_mask.any()
print("\n=== Columns with at least one negative value ===")
print(neg_cols[neg_cols])

neg_counts = neg_mask.sum()
print("\n=== Negative counts per column ===")
print(neg_counts[neg_counts > 0])

# -----------------------------
# Correlation with target (numeric only)
# -----------------------------
if "Milk_Yield_L" in train.columns:
    corr = numeric.corr()["Milk_Yield_L"].sort_values(ascending=False)
    print("\n=== Correlation with Milk_Yield_L (numeric features) ===")
    print(corr.head(15))   # top positive
    print("\nMost negative correlations:")
    print(corr.tail(15))

Shape: (210000, 36)

Dtypes:
 Cattle_ID                   object
Breed                       object
Climate_Zone                object
Management_System           object
Age_Months                   int64
Weight_kg                  float64
Parity                       int64
Lactation_Stage             object
Days_in_Milk                 int64
Feed_Type                   object
Feed_Quantity_kg           float64
Feeding_Frequency            int64
Water_Intake_L             float64
Walking_Distance_km        float64
Grazing_Duration_hrs       float64
Rumination_Time_hrs        float64
Resting_Hours              float64
Ambient_Temperature_C      float64
Humidity_percent           float64
Housing_Score              float64
FMD_Vaccine                  int64
Brucellosis_Vaccine          int64
HS_Vaccine                   int64
BQ_Vaccine                   int64
Anthrax_Vaccine              int64
IBR_Vaccine                  int64
BVD_Vaccine                  int64
Rabies_Vaccine           

In [19]:
for i in train.columns:
    # print(train[i].dtype)
    print(train[i].name, ":", train[i].dtype)


Cattle_ID : object
Breed : object
Climate_Zone : object
Management_System : object
Age_Months : int64
Weight_kg : float64
Parity : int64
Lactation_Stage : object
Days_in_Milk : int64
Feed_Type : object
Feed_Quantity_kg : float64
Feeding_Frequency : int64
Water_Intake_L : float64
Walking_Distance_km : float64
Grazing_Duration_hrs : float64
Rumination_Time_hrs : float64
Resting_Hours : float64
Ambient_Temperature_C : float64
Humidity_percent : float64
Housing_Score : float64
FMD_Vaccine : int64
Brucellosis_Vaccine : int64
HS_Vaccine : int64
BQ_Vaccine : int64
Anthrax_Vaccine : int64
IBR_Vaccine : int64
BVD_Vaccine : int64
Rabies_Vaccine : int64
Previous_Week_Avg_Yield : float64
Body_Condition_Score : float64
Milking_Interval_hrs : int64
Date : object
Farm_ID : object
Feed_Quantity_lb : float64
Mastitis : int64
Milk_Yield_L : float64


In [69]:
# 5-Fold Cross-Validation for CatBoost
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cat_fold_rmse = []
cat_models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n----- CatBoost Fold {fold+1} -----")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostRegressor(
        loss_function="RMSE",
        n_estimators=1000,
        learning_rate=0.05,
        depth=8,
        subsample=0.8,
        random_seed=42,
        verbose=False  # turn off big CatBoost logs
    )

    # model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)
    
    # changed this to tell it what features are categorical
    model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
    verbose=False
)

    preds = model.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    cat_fold_rmse.append(rmse)

    print(f"CatBoost Fold {fold+1} RMSE: {rmse:.4f}")
    cat_models.append(model)

print("\n==========================")
print(f"CatBoost Average RMSE: {np.mean(cat_fold_rmse):.4f}")
print("==========================\n")



----- CatBoost Fold 1 -----
CatBoost Fold 1 RMSE: 4.1088

----- CatBoost Fold 2 -----
CatBoost Fold 2 RMSE: 4.1004

----- CatBoost Fold 3 -----
CatBoost Fold 3 RMSE: 4.1239

----- CatBoost Fold 4 -----
CatBoost Fold 4 RMSE: 4.1074

----- CatBoost Fold 5 -----
CatBoost Fold 5 RMSE: 4.1023

CatBoost Average RMSE: 4.1085



In [None]:
# Train final CatBoost model on full data
final_cat_model = CatBoostRegressor(
    loss_function="RMSE",
    n_estimators=1000,
    learning_rate=0.05,
    depth=8,
    subsample=0.8,
    random_seed=42,
    verbose=False
)
# when you dont use true categorical features
# final_cat_model.fit(X, y, verbose=False)

# if using true categorical features (which we are now)
final_cat_model.fit(
    X,
    y,
    cat_features=cat_features,
    verbose=False
)

# Train final LightGBM model on full data
final_lgb_model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

final_lgb_model.fit(X, y)

# Evaluate on TRAIN set (just to see behavior, not for Kaggle)
cat_train_preds = final_cat_model.predict(X)
lgb_train_preds = final_lgb_model.predict(X)

cat_train_rmse = np.sqrt(mean_squared_error(y, cat_train_preds))
lgb_train_rmse = np.sqrt(mean_squared_error(y, lgb_train_preds))

ensemble_train_preds = 0.7 * cat_train_preds + 0.3 * lgb_train_preds
ensemble_train_rmse = np.sqrt(mean_squared_error(y, ensemble_train_preds))

print(f"Final CatBoost TRAIN RMSE:  {cat_train_rmse:.4f}")
print(f"Final LightGBM TRAIN RMSE:  {lgb_train_rmse:.4f}")
print(f"Ensemble(0.7 CB, 0.3 LGB) TRAIN RMSE: {ensemble_train_rmse:.4f}")

# Make TEST predictions for Kaggle
cat_test_preds = final_cat_model.predict(test_prep)
lgb_test_preds = final_lgb_model.predict(test_prep)

# Simple weighted ensemble (CatBoost did better in CV)
ensemble_test_preds = 0.7 * cat_test_preds + 0.3 * lgb_test_preds

# Build DataFrames
cat_submission = pd.DataFrame({
    id_col: test[id_col],
    target: cat_test_preds
})

lgb_submission = pd.DataFrame({
    id_col: test[id_col],
    target: lgb_test_preds
})

ensemble_submission = pd.DataFrame({
    id_col: test[id_col],
    target: ensemble_test_preds
})

# Choose which to save/submit:
save_submission(cat_submission, run_name="felipe_catboost_only")
# save_submission(lgb_submission, run_name="lightgbm_only")
# Recommended:
# save_submission(ensemble_submission, run_name="cat_lgb_ensemble")



# # Make submission
# cat_test_preds = final_cat_model.predict(test_prep)

# cat_submission = pd.DataFrame({
#     id_col: test[id_col],
#     target: cat_test_preds
# })

# # save_submission(cat_submission, run_name="sankarsh_model")
# print("submission_catboost.csv created!")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004004 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4625
[LightGBM] [Info] Number of data points in the train set: 209926, number of used features: 47
[LightGBM] [Info] Start training from score 15.595179
Final CatBoost TRAIN RMSE:  3.9662
Final LightGBM TRAIN RMSE:  3.8379
Ensemble(0.7 CB, 0.3 LGB) TRAIN RMSE: 3.9232
Saved submission -> /Users/felipebenitez/ML-Project/submissions/felipe_catboost_only__2025-11-15__22-57-41.csv


In [68]:
# 5-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_rmse = []
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n----- Fold {fold+1} -----")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    model.fit(X_train, y_train)

    preds = model.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    fold_rmse.append(rmse)

    print(f"Fold {fold+1} RMSE: {rmse:.4f}")
    models.append(model)

print("\n==========================")
print(f"Average RMSE: {np.mean(fold_rmse):.4f}")
print("==========================\n")


----- Fold 1 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4621
[LightGBM] [Info] Number of data points in the train set: 167940, number of used features: 47
[LightGBM] [Info] Start training from score 15.593146
Fold 1 RMSE: 4.1256

----- Fold 2 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004633 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4625
[LightGBM] [Info] Number of data points in the train set: 167941, number of used features: 47
[LightGBM] [Info] Start training from score 15.593859
Fold 2 RMSE: 4.1163

----- Fold 3 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing wa

In [None]:
# # --- Reuse already-loaded train/test if they exist ---
# # If not, load again
# try:
#     train
#     test
#     print("Reusing loaded train/test...")
# except NameError:
#     print("Loading data fresh...")
#     train = pd.read_csv("../../data/cattle_data_train.csv")
#     test = pd.read_csv("../../data/cattle_data_test.csv")

# target = "Milk_Yield_L"
# id_col = "Cattle_ID"

# def preprocess(df):
#     df = df.copy()
#     for col in df.columns:
#         if df[col].dtype == "object":
#             df[col] = df[col].astype("category").cat.codes
#     return df

# train_prep = preprocess(train.drop(columns=[target]))
# test_prep = preprocess(test)

# X = train_prep
# y = train[target]

# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# fold_rmse = []
# xgb_models = []

# for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
#     print(f"\n----- XGBoost Fold {fold+1} -----")
    
#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
#     model = xgb.XGBRegressor(
#         n_estimators=700,
#         learning_rate=0.05,
#         max_depth=8,
#         subsample=0.8,
#         colsample_bytree=0.8,
#         objective="reg:squarederror",
#         tree_method="hist",      # fast for large data
#         random_state=42
#     )
    
#     model.fit(X_train, y_train)
    
#     preds = model.predict(X_val)
#     mse = mean_squared_error(y_val, preds)
#     rmse = np.sqrt(mse)
#     fold_rmse.append(rmse)
    
#     print(f"Fold {fold+1} RMSE: {rmse:.4f}")
#     xgb_models.append(model)

# print("\n==========================")
# print(f"XGBoost Average RMSE = {np.mean(fold_rmse):.4f}")
# print("==========================\n")

# # --- Train final model on all data ---
# final_xgb = xgb.XGBRegressor(
#     n_estimators=700,
#     learning_rate=0.05,
#     max_depth=8,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     objective="reg:squarederror",
#     tree_method="hist",
#     random_state=42
# )

# final_xgb.fit(X, y)

# # --- Submission ---
# test_preds = final_xgb.predict(test_prep)

# submission_xgb = pd.DataFrame({
#     id_col: test[id_col],
#     target: test_preds
# })

# # save_submission(submission, run_name="felipe_model")
# print("submission_xgb.csv created!")

Reusing loaded train/test...

----- XGBoost Fold 1 -----
Fold 1 RMSE: 4.1888

----- XGBoost Fold 2 -----
Fold 2 RMSE: 4.1517

----- XGBoost Fold 3 -----
Fold 3 RMSE: 4.1568

----- XGBoost Fold 4 -----
Fold 4 RMSE: 4.1814

----- XGBoost Fold 5 -----
Fold 5 RMSE: 4.1694

XGBoost Average RMSE = 4.1696

submission_xgb.csv created!


In [None]:
# # If you haven't installed it yet:
# # pip install catboost

# from catboost import CatBoostRegressor
# from sklearn.model_selection import KFold
# from sklearn.metrics import mean_squared_error
# import numpy as np
# import pandas as pd

# # 5-Fold Cross-Validation for CatBoost
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# cat_fold_rmse = []
# cat_models = []

# for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
#     print(f"\n----- CatBoost Fold {fold+1} -----")

#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#     model = CatBoostRegressor(
#         loss_function="RMSE",
#         n_estimators=1000,
#         learning_rate=0.05,
#         depth=8,
#         subsample=0.8,
#         random_seed=42,
#         verbose=False  # turn off big CatBoost logs
#     )

#     model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

#     preds = model.predict(X_val)
#     mse = mean_squared_error(y_val, preds)
#     rmse = np.sqrt(mse)
#     cat_fold_rmse.append(rmse)

#     print(f"CatBoost Fold {fold+1} RMSE: {rmse:.4f}")
#     cat_models.append(model)

# print("\n==========================")
# print(f"CatBoost Average RMSE: {np.mean(cat_fold_rmse):.4f}")
# print("==========================\n")

# # Train final CatBoost model on full data
# final_cat_model = CatBoostRegressor(
#     loss_function="RMSE",
#     n_estimators=1000,
#     learning_rate=0.05,
#     depth=8,
#     subsample=0.8,
#     random_seed=42,
#     verbose=False
# )
# final_cat_model.fit(X, y, verbose=False)

# # Make submission
# cat_test_preds = final_cat_model.predict(test_prep)

# cat_submission = pd.DataFrame({
#     id_col: test[id_col],
#     target: cat_test_preds
# })

# cat_submission.to_csv("submission_catboost.csv", index=False)
# print("submission_catboost.csv created!")


----- CatBoost Fold 1 -----
CatBoost Fold 1 RMSE: 4.1362

----- CatBoost Fold 2 -----
CatBoost Fold 2 RMSE: 4.1009

----- CatBoost Fold 3 -----
CatBoost Fold 3 RMSE: 4.1066

----- CatBoost Fold 4 -----
CatBoost Fold 4 RMSE: 4.1333

----- CatBoost Fold 5 -----
CatBoost Fold 5 RMSE: 4.1248

CatBoost Average RMSE: 4.1204

submission_catboost.csv created!
