# Milk Yield Prediction 

This notebook follows the workflow used in class: load the data, preprocess it (median fill for numeric columns, an explicit `Unknown` category for missing categorical values), one-hot encode, split into train/validation sets, compare models (Linear Regression, Decision Tree, Random Forest), pick the best by validation RMSE, retrain it on the full training data, and generate `submission.csv`.

**Files expected:**
- `cattle_data_train.csv`
- `cattle_data_test.csv`
- optional: `/mnt/data/sample_submission.csv` for reference

**Output:**
- `submission.csv` ready to upload


## 1. Imports and Data Load

In [39]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Input CSV locations, resolved relative to the current working directory.
train_path = "cattle_data_train.csv"
test_path  = "cattle_data_test.csv"

# Read both files through the same call so they are parsed identically.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Quick look at the first rows of the training frame.
train.head()

Unnamed: 0,Cattle_ID,Breed,Climate_Zone,Management_System,Age_Months,Weight_kg,Parity,Lactation_Stage,Days_in_Milk,Feed_Type,...,BVD_Vaccine,Rabies_Vaccine,Previous_Week_Avg_Yield,Body_Condition_Score,Milking_Interval_hrs,Date,Farm_ID,Feed_Quantity_lb,Mastitis,Milk_Yield_L
0,CATTLE_133713,Holstein,Tropical,Intensive,114,544.8,4,Mid,62,Concentrates,...,0,1,6.31,3.0,12,2024-01-15,FARM_0301,36.8235,1,12.192634
1,CATTLE_027003,Holstein,Arid,Mixed,136,298.9,4,Mid,213,Crop_Residues,...,0,0,17.16,4.0,12,2023-10-31,FARM_0219,,0,14.717031
2,CATTLE_122459,Holstein,Tropical,Semi_Intensive,64,336.6,4,Late,16,Hay,...,1,0,4.07,3.5,12,2024-05-20,FARM_0802,16.0965,0,14.006142
3,CATTLE_213419,Jersey,Mediterranean,Intensive,58,370.5,1,Early,339,Crop_Residues,...,0,0,10.23,3.0,24,2024-07-22,FARM_0034,40.7925,0,24.324325
4,CATTLE_106260,Guernsey,Subtropical,Intensive,84,641.5,6,Early,125,Mixed_Feed,...,1,1,20.68,3.0,12,2023-01-03,FARM_0695,33.7365,1,12.023074


In [40]:
# Regression target, copied out of the training frame.
target_col = "Milk_Yield_L"
y = train[target_col].copy()

# Columns excluded from the feature set, kept only if the training frame has them.
cols_to_drop = [
    col
    for col in ("Cattle_ID", "Farm_ID", "Date", "Feed_Quantity_lb")
    if col in train.columns
]

# errors="ignore" already skips any listed column a frame does not contain.
X = train.drop(columns=[*cols_to_drop, target_col], errors="ignore")
X_test = test.drop(columns=cols_to_drop, errors="ignore")

# Keep the test identifiers for the submission file.
test_ids = test["Cattle_ID"].copy()
X.shape, X_test.shape, y.shape

((210000, 31), (40000, 31), (210000,))

In [41]:
def preprocess(df: pd.DataFrame, num_fill=None) -> pd.DataFrame:
    """Impute missing values and one-hot encode a feature frame.

    The input is copied first, so the caller's frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Feature columns to clean and encode.
    num_fill : dict or pd.Series, optional
        Fill values for numeric columns, keyed by column name (for example
        medians computed on the training frame), so several frames can be
        imputed with the same statistics. A numeric column that is missing
        from ``num_fill`` or whose entry is NaN -- and every numeric column
        when ``num_fill`` is ``None`` (the default) -- is filled with the
        median of ``df`` itself.

    Returns
    -------
    pd.DataFrame
        The frame after numeric NaNs are filled, missing values in the
        remaining columns are replaced by the literal ``"Unknown"``, and the
        result is passed through ``pd.get_dummies(..., drop_first=False)``.
    """
    df = df.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    cat_cols = [c for c in df.columns if c not in num_cols]

    # numeric: use the supplied statistic when there is one, else this frame's median
    for c in num_cols:
        if not df[c].isna().any():
            continue
        fill = num_fill[c] if (num_fill is not None and c in num_fill) else None
        if fill is None or pd.isna(fill):
            fill = df[c].median()
        df[c] = df[c].fillna(fill)

    # categorical: 'Unknown' fill
    for c in cat_cols:
        if df[c].isna().any():
            df[c] = df[c].fillna("Unknown")

    return pd.get_dummies(df, drop_first=False)

# Fill numeric gaps in the test features with the TRAINING medians before
# encoding, so train and test are imputed with the same statistics instead of
# each frame using its own median. Only columns that are numeric in the test
# frame and have a defined training median are filled here; anything left
# over is handled inside preprocess().
train_medians = X.median(numeric_only=True)
test_numeric_cols = X_test.select_dtypes(include=[np.number]).columns
shared_medians = train_medians.reindex(test_numeric_cols).dropna()

X_proc = preprocess(X)
X_test_proc = preprocess(X_test.fillna(shared_medians))

# align columns so test matches train (dummy columns absent from test become 0,
# test-only columns are dropped)
X_proc, X_test_proc = X_proc.align(X_test_proc, join="left", axis=1, fill_value=0)
X_proc.shape, X_test_proc.shape

((210000, 55), (40000, 55))

In [42]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_tr, X_val, y_tr, y_val = train_test_split(X_proc, y, random_state=42, test_size=0.2)

def rmse(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y_true`` as a Python float."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

# Candidate regressors, keyed by the label used when reporting their scores.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree(max_depth=10)": DecisionTreeRegressor(
        random_state=42,
        max_depth=10,
    ),
    "RandomForest(200)": RandomForestRegressor(
        n_jobs=-1,
        random_state=42,
        n_estimators=200,
    ),
}

# Fit each candidate on the training split and record its validation RMSE.
rmse_scores = {}
for name, model in models.items():
    preds = model.fit(X_tr, y_tr).predict(X_val)
    rmse_scores[name] = rmse(y_val, preds)

rmse_scores

{'LinearRegression': 4.3186005153086615,
 'DecisionTree(max_depth=10)': 4.534595018373689,
 'RandomForest(200)': 4.356773877350133}

In [43]:
# Pick the label with the lowest validation RMSE (first one wins on ties).
best_name = min(rmse_scores.items(), key=lambda item: item[1])[0]
best_model = models[best_name]

# Refit the winner on every training row, then score the test features.
test_preds = best_model.fit(X_proc, y).predict(X_test_proc)

best_name

'LinearRegression'

In [44]:
out_path = "submission.csv"

# Two-column submission: test identifiers alongside their predicted yields.
submission_columns = {"Cattle_ID": test_ids, "Milk_Yield_L": test_preds}
submission = pd.DataFrame(submission_columns)

# Write without the index so the file holds only the two columns above.
submission.to_csv(out_path, index=False)
submission.head(), out_path

(   Cattle_ID  Milk_Yield_L
 0          1     18.378923
 1          2     10.241831
 2          3     21.182128
 3          4     15.574096
 4          5     17.377651,
 'submission.csv')