In [3]:
import os
os.chdir('C:\\Users\\kaush\\OneDrive\\Desktop\\UT\\CS 363M\\ML Project\\src\\experiments')
print(os.getcwd())

C:\Users\kaush\OneDrive\Desktop\UT\CS 363M\ML Project\src\experiments


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
import xgboost as xgb

# submission functionality
# ------------------------
import os, sys
sys.path.append(os.path.abspath("..")) # so src/ is on the path

import importlib
import submission_utils
importlib.reload(submission_utils) # force reload latest code

from submission_utils import save_submission
# ------------------------


In [6]:
# Load data
train = pd.read_csv("../../data/cattle_data_train.csv")
test = pd.read_csv("../../data/cattle_data_test.csv")

In [7]:
for i in train.columns:
    # print(train[i].dtype)
    print(train[i].name, ":", train[i].dtype)


Cattle_ID : object
Breed : object
Climate_Zone : object
Management_System : object
Age_Months : int64
Weight_kg : float64
Parity : int64
Lactation_Stage : object
Days_in_Milk : int64
Feed_Type : object
Feed_Quantity_kg : float64
Feeding_Frequency : int64
Water_Intake_L : float64
Walking_Distance_km : float64
Grazing_Duration_hrs : float64
Rumination_Time_hrs : float64
Resting_Hours : float64
Ambient_Temperature_C : float64
Humidity_percent : float64
Housing_Score : float64
FMD_Vaccine : int64
Brucellosis_Vaccine : int64
HS_Vaccine : int64
BQ_Vaccine : int64
Anthrax_Vaccine : int64
IBR_Vaccine : int64
BVD_Vaccine : int64
Rabies_Vaccine : int64
Previous_Week_Avg_Yield : float64
Body_Condition_Score : float64
Milking_Interval_hrs : int64
Date : object
Farm_ID : object
Feed_Quantity_lb : float64
Mastitis : int64
Milk_Yield_L : float64


In [8]:
# Target + ID
target = "Milk_Yield_L"
id_col = "Cattle_ID"

# Basic preprocessing
# (turn categories -> strings -> numeric encodings)
def preprocess(df):
    milk_test = df.copy()
    milk_test["Date"] = pd.to_datetime(milk_test["Date"])
    milk_test['year'] = milk_test['Date'].dt.year
    milk_test['month'] = milk_test['Date'].dt.month
    milk_test['day'] = milk_test['Date'].dt.day
    milk_test['dayofweek'] = milk_test['Date'].dt.dayofweek
    milk_test['weekofyear'] = milk_test['Date'].dt.isocalendar().week.astype(int)
    milk_test['quarter'] = milk_test['Date'].dt.quarter
    milk_test['is_weekend'] = milk_test['dayofweek'].isin([5, 6]).astype(int)
    milk_test['date_ordinal'] = milk_test['Date'].map(pd.Timestamp.toordinal)
    milk_test['Breed'] = milk_test['Breed'].str.strip()
    milk_test['Breed'] = milk_test['Breed'].replace({'Holstien': 'Holstein'})
    milk_test['Feed_Quantity_kg'] = milk_test.groupby('Feed_Type')['Feed_Quantity_kg'].transform(lambda x: x.fillna(x.median()))
    milk_test['Housing_Score'] = milk_test['Housing_Score'].transform(lambda x: x.fillna(x.median()))
    milk_test = milk_test.drop("Feed_Quantity_lb", axis=1)
    milk_test = milk_test.drop(id_col, axis=1)
    milk_test = milk_test.drop("Date", axis=1)
    for col in milk_test.columns:
        if milk_test[col].dtype == "object":
            milk_test[col] = milk_test[col].astype("category").cat.codes
    return milk_test

train_prep = preprocess(train.drop(columns=[target]))
test_prep = preprocess(test)

y = train[target]
X = train_prep

In [5]:
# 5-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_rmse = []
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n----- Fold {fold+1} -----")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    model.fit(X_train, y_train)

    preds = model.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    fold_rmse.append(rmse)

    print(f"Fold {fold+1} RMSE: {rmse:.4f}")
    models.append(model)

print("\n==========================")
print(f"Average RMSE: {np.mean(fold_rmse):.4f}")
print("==========================\n")


----- Fold 1 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3623
[LightGBM] [Info] Number of data points in the train set: 168000, number of used features: 40
[LightGBM] [Info] Start training from score 15.587505
Fold 1 RMSE: 4.1447

----- Fold 2 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004199 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3622
[LightGBM] [Info] Number of data points in the train set: 168000, number of used features: 40
[LightGBM] [Info] Start training from score 15.580781
Fold 2 RMSE: 4.1141

----- Fold 3 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing wa

In [10]:
# 5-Fold Cross-Validation for CatBoost
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cat_fold_rmse = []
cat_models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n----- CatBoost Fold {fold+1} -----")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostRegressor(
        loss_function="RMSE",
        n_estimators=1000,
        learning_rate=0.05,
        depth=8,
        subsample=0.8,
        random_seed=42,
        verbose=False  # turn off big CatBoost logs
    )

    model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

    preds = model.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    cat_fold_rmse.append(rmse)

    print(f"CatBoost Fold {fold+1} RMSE: {rmse:.4f}")
    cat_models.append(model)

print("\n==========================")
print(f"CatBoost Average RMSE: {np.mean(cat_fold_rmse):.4f}")
print("==========================\n")



----- CatBoost Fold 1 -----
CatBoost Fold 1 RMSE: 4.1297

----- CatBoost Fold 2 -----
CatBoost Fold 2 RMSE: 4.0945

----- CatBoost Fold 3 -----
CatBoost Fold 3 RMSE: 4.0998

----- CatBoost Fold 4 -----
CatBoost Fold 4 RMSE: 4.1249

----- CatBoost Fold 5 -----
CatBoost Fold 5 RMSE: 4.1191

CatBoost Average RMSE: 4.1136



In [7]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_rmse = []
xgb_models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n----- XGBoost Fold {fold+1} -----")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = xgb.XGBRegressor(
        n_estimators=700,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        tree_method="hist",      # fast for large data
        random_state=42
    )
    
    model.fit(X_train, y_train)
    
    preds = model.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    fold_rmse.append(rmse)
    
    print(f"Fold {fold+1} RMSE: {rmse:.4f}")
    xgb_models.append(model)

print("\n==========================")
print(f"XGBoost Average RMSE = {np.mean(fold_rmse):.4f}")
print("==========================\n")


----- XGBoost Fold 1 -----
Fold 1 RMSE: 4.1811

----- XGBoost Fold 2 -----
Fold 2 RMSE: 4.1489

----- XGBoost Fold 3 -----
Fold 3 RMSE: 4.1563

----- XGBoost Fold 4 -----
Fold 4 RMSE: 4.1785

----- XGBoost Fold 5 -----
Fold 5 RMSE: 4.1650

XGBoost Average RMSE = 4.1659



In [None]:
# Train final CatBoost model on full data
final_cat_model = CatBoostRegressor(
    loss_function="RMSE",
    n_estimators=1000,
    learning_rate=0.05,
    depth=8,
    subsample=0.8,
    random_seed=42,
    verbose=False
)
final_cat_model.fit(X, y, verbose=False)

# Make submission
cat_test_preds = final_cat_model.predict(test_prep)

cat_submission = pd.DataFrame({
    id_col: test[id_col],
    target: cat_test_preds
})

# save_submission(submission, run_name="sankarsh_model")
print("submission_catboost.csv created!")

submission_catboost.csv created!


In [9]:
# -----------------------
# Train final model on full data
# -----------------------
final_model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
final_model.fit(X, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3622
[LightGBM] [Info] Number of data points in the train set: 210000, number of used features: 40
[LightGBM] [Info] Start training from score 15.589156


0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [11]:
# -----------------------
# Create submission
# -----------------------
test_preds = final_model.predict(test_prep)

submission = pd.DataFrame({
    id_col: test[id_col],
    target: test_preds
})

# save_submission(submission, run_name="sankarsh_model")
print("submission.csv created!")

submission.csv created!


In [9]:
df = train
# numerical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Q1, Q3, IQR per numerical column
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1

# boolean mask of outliers per column
outlier_mask = (df[num_cols] < (Q1 - 1.5 * IQR)) | (df[num_cols] > (Q3 + 1.5 * IQR))

# count of outliers per column (Series)
outlier_counts = outlier_mask.sum()

print(outlier_counts)


Age_Months                     0
Weight_kg                      0
Parity                         0
Days_in_Milk                   0
Feed_Quantity_kg             674
Feeding_Frequency              0
Water_Intake_L              1485
Walking_Distance_km          748
Grazing_Duration_hrs           0
Rumination_Time_hrs         2406
Resting_Hours                  0
Ambient_Temperature_C        196
Humidity_percent               0
Housing_Score                  0
FMD_Vaccine                    0
Brucellosis_Vaccine            0
HS_Vaccine                     0
BQ_Vaccine                     0
Anthrax_Vaccine                0
IBR_Vaccine                    0
BVD_Vaccine                    0
Rabies_Vaccine                 0
Previous_Week_Avg_Yield     2406
Body_Condition_Score           0
Milking_Interval_hrs       62833
Feed_Quantity_lb             698
Mastitis                   20995
Milk_Yield_L                2677
dtype: int64


In [13]:

# Select numerical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Dictionary to hold outlier boolean mask per column
outlier_masks = {}

for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outlier_masks[col] = (df[col] < lower_bound) | (df[col] > upper_bound)

# Combine all masks to find rows with outliers in any column
combined_outliers = pd.concat(outlier_masks, axis=1).any(axis=1)

# Number of records with any outlier
num_any_outlier_records = combined_outliers.sum()
print(f"Number of records with at least one outlier: {num_any_outlier_records}")

# To check which records have outliers across multiple columns,
# sum the boolean masks row-wise
outlier_counts_per_record = pd.concat(outlier_masks, axis=1).sum(axis=1)

# Example: records with outliers in at least 2 columns
for i in range(9):
    records_multiple_outliers = df[outlier_counts_per_record == i]
    print(f"Number of records with outliers in {i} columns: {len(records_multiple_outliers)}")
# Optional: to view which columns are outliers for a record
# outlier_flags_df = pd.concat(outlier_masks, axis=1)
# print(outlier_flags_df.loc[records_multiple_outliers.index])


Number of records with at least one outlier: 82665
Number of records with outliers in 0 columns: 127335
Number of records with outliers in 1 columns: 71695
Number of records with outliers in 2 columns: 9623
Number of records with outliers in 3 columns: 1216
Number of records with outliers in 4 columns: 127
Number of records with outliers in 5 columns: 3
Number of records with outliers in 6 columns: 1
Number of records with outliers in 7 columns: 0
Number of records with outliers in 8 columns: 0
