In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame
import sklearn
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

# submission functionality
# ------------------------
import os, sys
sys.path.append(os.path.abspath("..")) # so src/ is on the path

import importlib
import submission_utils
importlib.reload(submission_utils) # force reload latest code

from submission_utils import save_submission
# ------------------------

In [15]:
# Load dataset
train = pd.read_csv("../../data/cattle_data_train.csv")
test = pd.read_csv("../../data/cattle_data_test.csv")

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

target = "Milk_Yield_L"
id_col = "Cattle_ID"

numeric_cols = None
farm_features = None

def preprocess_pipeline(df, encode_flag = True, target_col='Milk_Yield_L', n_clusters=10, pca_variance=0.9, PCA_Flag=False):
    milk_features = df.copy()
    if target_col in milk_features.columns:
        milk_features = milk_features[milk_features[target_col] >= 0]
        y = milk_features[target_col]
        milk_features = milk_features.drop(columns=[target_col])
    else:
        y = None

    if 'Breed' in milk_features.columns:
        milk_features['Breed'] = milk_features['Breed'].str.strip()
        milk_features['Breed'] = milk_features['Breed'].replace({'Holstien': 'Holstein'})

    if 'Housing_Score' in milk_features.columns:
        milk_features['Housing_Score'] = milk_features['Housing_Score'].fillna(
            milk_features['Housing_Score'].median()
        )

    if 'Feed_Quantity_kg' in milk_features.columns and 'Feed_Type' in milk_features.columns:
        milk_features['Feed_Quantity_kg'] = milk_features.groupby('Feed_Type')['Feed_Quantity_kg'].transform(
            lambda x: x.fillna(x.median())
        )

    numeric_cols = milk_features.select_dtypes(include='number').columns.tolist()
    milk_features[numeric_cols] = milk_features[numeric_cols].fillna(milk_features[numeric_cols].median())

    if 'Date' in milk_features.columns:
        milk_features['Date'] = pd.to_datetime(milk_features['Date'])
        milk_features['year'] = milk_features['Date'].dt.year
        milk_features['month'] = milk_features['Date'].dt.month
        milk_features['day'] = milk_features['Date'].dt.day
        milk_features['dayofweek'] = milk_features['Date'].dt.dayofweek
        milk_features['weekofyear'] = milk_features['Date'].dt.isocalendar().week.astype(int)
        milk_features['quarter'] = milk_features['Date'].dt.quarter
        milk_features['is_weekend'] = milk_features['dayofweek'].isin([5,6]).astype(int)
        milk_features['date_ordinal'] = milk_features['Date'].map(pd.Timestamp.toordinal)
        milk_features = milk_features.drop(columns=['Date'])

    if 'Farm_ID' in milk_features.columns:
        # Use only numeric features for clustering (exclude IDs and target)
        farm_numeric_cols = milk_features.select_dtypes(include='number').columns.tolist()
        farm_numeric_cols = [c for c in farm_numeric_cols if c != 'Cattle_ID']
        # Aggregate per farm
        farm_features = milk_features.groupby('Farm_ID')[farm_numeric_cols].mean()
        # Scale and cluster
        scaler = StandardScaler()
        farm_scaled = scaler.fit_transform(farm_features)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        farm_features['Cluster'] = kmeans.fit_predict(farm_scaled)
        # Map cluster back
        milk_features['Farm_Cluster'] = milk_features['Farm_ID'].map(farm_features['Cluster'])
    
    # ---------------------------
    # Drop IDs
    drop_cols = ['Cattle_ID', 'Farm_ID']
    milk_features = milk_features.drop(columns=[c for c in drop_cols if c in milk_features.columns])
    
    # ---------------------------
    # One-hot encode categorical features
    if encode_flag:
        cat_cols = milk_features.select_dtypes(include='object').columns.tolist()
        milk_features = pd.get_dummies(milk_features, columns=cat_cols, drop_first=False)
        
    # Ensure 'Farm_Cluster' exists
    if 'Farm_Cluster' not in milk_features.columns:
        milk_features['Farm_Cluster'] = 0  # fallback if cluster wasn't added

    if not PCA_Flag :
        # final_df is all remaining features
        final_df = milk_features.copy()

        return final_df, y
    else :
        # -------- Apply PCA on numeric features only --------
        pca_ready_cols = milk_features.select_dtypes(include='number').columns.tolist()
        # farm cluster stays
        if 'Farm_Cluster' in pca_ready_cols:
            pca_ready_cols.remove('Farm_Cluster')
        scaler_final = StandardScaler()
        scaled_numeric = scaler_final.fit_transform(milk_features[pca_ready_cols])
        pca = PCA(n_components=pca_variance)
        pca_transformed = pca.fit_transform(scaled_numeric)
        # Convert PCA output into dataframe
        pca_cols = [f"PCA_{i+1}" for i in range(pca_transformed.shape[1])]
        pca_df = pd.DataFrame(pca_transformed, columns=pca_cols, index=milk_features.index)
        # Merge back non-PCA features (categorical + Farm_Cluster)
        non_pca_cols = [c for c in milk_features.columns if c not in pca_ready_cols]
        final_df = pd.concat([milk_features[non_pca_cols], pca_df], axis=1)

X, y = preprocess_pipeline(train, False)
test_df, test_labels = preprocess_pipeline(test, False)

In [24]:
categorical_cols = X.select_dtypes(include='object').columns.tolist()
print(categorical_cols)

['Breed', 'Climate_Zone', 'Management_System', 'Lactation_Stage', 'Feed_Type']


In [25]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from catboost import CatBoostRegressor

# Define model parameters
model_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 6,
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'verbose': 0
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    model = CatBoostRegressor(**model_params)
    model.fit(
        X_train, 
        y_train, 
        eval_set=(X_val, y_val),
        cat_features=categorical_cols,
        use_best_model=True
    )
    
    preds = model.predict(X_val)
    rmse_scores.append(np.sqrt(mean_squared_error(y_val, preds)))
    r2_scores.append(r2_score(y_val, preds))

print(f"Average RMSE: {np.mean(rmse_scores):.2f} ± {np.std(rmse_scores):.2f}")
print(f"Average R2: {np.mean(r2_scores):.3f} ± {np.std(r2_scores):.3f}")

Average RMSE: 4.11 ± 0.01
Average R2: 0.409 ± 0.003


In [26]:
print(rmse_scores)
print(r2_scores)

[np.float64(4.107988072027877), np.float64(4.100485237679691), np.float64(4.121729028209943), np.float64(4.10623953981049), np.float64(4.099511025047362)]
[0.4122475645100112, 0.4107476841533413, 0.4051357713495739, 0.4115768591316096, 0.40603190664943867]


In [29]:
final_cat_model = CatBoostRegressor(
    loss_function="RMSE",
    n_estimators=1000,
    learning_rate=0.05,
    depth=6,
    subsample=0.8,
    random_seed=42,
    verbose=False
)

final_cat_model.fit(X, y, cat_features=categorical_cols)

# Make submission
cat_test_preds = final_cat_model.predict(test_df)

cat_submission = pd.DataFrame({
    id_col: test[id_col],
    target: cat_test_preds
})

In [30]:
save_submission(cat_submission, run_name="gora_model")

Saved submission -> c:\Users\gorab\Documents\ML\ML-Project\submissions\gora_model__2025-11-16__18-44-17.csv
