# Tuned XGBoost Notebook (Target ~4.11 RMSE)
This notebook loads data, cleans it, tunes XGBoost, prints validation RMSE, and prepares submission.

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from xgboost import XGBRegressor

train = pd.read_csv("cattle_data_train.csv")
test = pd.read_csv("cattle_data_test.csv")


In [10]:
# Cleaning
target_col="Milk_Yield_L"
train=train.drop_duplicates().dropna(subset=[target_col])

num_cols=train.select_dtypes(include=[np.number]).columns
cat_cols=train.select_dtypes(include=["object"]).columns

for col in num_cols:
    med=train[col].median()
    train[col]=train[col].fillna(med)
    if col in test.columns: test[col]=test[col].fillna(med)

for col in cat_cols:
    mode=train[col].mode().iloc[0]
    train[col]=train[col].fillna(mode)
    if col in test.columns: test[col]=test[col].fillna(mode)


In [11]:
# Features
y=train[target_col]
X=train.drop(columns=[target_col,"Cattle_ID","Farm_ID","Date"],errors="ignore")
X_test=test.drop(columns=["Cattle_ID","Farm_ID","Date"],errors="ignore")

X=pd.get_dummies(X)
X_test=pd.get_dummies(X_test)
X,X_test=X.align(X_test,join="left",axis=1,fill_value=0)


In [12]:
# Split
X_tr,X_val,y_tr,y_val=train_test_split(X,y,test_size=0.2,random_state=42)


In [13]:
# Tuning XGBoost
kf=KFold(n_splits=5,shuffle=True,random_state=0)

def neg_rmse(y_true,y_pred):
    return -np.sqrt(mean_squared_error(y_true,y_pred))

xgb=XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",
    random_state=0,
    n_jobs=-1
)

param_dist={
    "n_estimators":[300,500,800],
    "max_depth":[3,4,6],
    "learning_rate":[0.02,0.03,0.05],
    "subsample":[0.7,0.8,0.9],
    "colsample_bytree":[0.7,0.8,0.9],
    "min_child_weight":[1,3,5],
    "reg_lambda":[1.0,3.0,5.0]
}

search=RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=25,
    scoring=make_scorer(neg_rmse),
    cv=kf,
    n_jobs=-1,
    verbose=1,
    random_state=0
)

search.fit(X,y)
best_xgb=search.best_estimator_
print("Best params:",search.best_params_)
print("Best CV RMSE:",-search.best_score_)


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best params: {'subsample': 0.7, 'reg_lambda': 3.0, 'n_estimators': 800, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.03, 'colsample_bytree': 0.7}
Best CV RMSE: 4.245020038070919


In [14]:
# Validation RMSE
best_xgb.fit(X_tr,y_tr)
val_preds=best_xgb.predict(X_val)
rmse=np.sqrt(mean_squared_error(y_val,val_preds))
print("Validation RMSE (tuned):",f"{rmse:.2f}")


Validation RMSE (tuned): 4.26


In [15]:
# Submission
best_xgb.fit(X,y)
test_preds=best_xgb.predict(X_test)

submission=pd.DataFrame({
    "Cattle_ID":test["Cattle_ID"],
    "Milk_Yield_L":test_preds
})
submission.to_csv("submission.csv",index=False)
submission.head()


Unnamed: 0,Cattle_ID,Milk_Yield_L
0,1,18.692791
1,2,10.230684
2,3,22.906891
3,4,14.553174
4,5,17.814556
