## Part 1: Data Loading and Exploration

### 1.1 Import Libraries

In [None]:
import numpy as np
import pandas as pd
import warnings
from pathlib import Path
from datetime import datetime

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor

import xgboost as xgb
import lightgbm as lgb

# One fixed seed, reused by NumPy here and by the estimators defined later,
# keeps repeated runs comparable.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Notebook ergonomics: hide library warnings and never truncate the column
# list when a DataFrame is displayed.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None

### 1.2 Load Data

In [None]:
# Column names the rest of the notebook keys on.
ID_COL = 'Cattle_ID'
TARGET_COL = 'Milk_Yield_L'

# Input and output locations, relative to the working directory.
DATA_DIR = Path('../data/raw')
SUBMISSION_DIR = Path('../submissions')
SUBMISSION_DIR.mkdir(parents=True, exist_ok=True)  # no-op when it already exists

train, test = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ('cattle_data_train.csv', 'cattle_data_test.csv')
)

print(f"Training data shape: {train.shape}")
print(f"Test data shape: {test.shape}")

### 1.3 Initial Data Exploration

In [None]:
# Quick look at the training data: schema, sample rows, target distribution
# and the columns that contain missing values.
print("\n=== Training Data Info ===")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would add a stray "None" line after the report.
train.info()

print("\n=== First Few Rows ===")
display(train.head())

print("\n=== Target Variable Statistics ===")
print(train[TARGET_COL].describe())

print("\n=== Missing Values ===")
missing_counts = train.isnull().sum()  # counted once, then filtered
print(missing_counts[missing_counts > 0])

### 1.4 Separate Features and Target

In [None]:
# Test side: keep the identifiers for the submission file, drop them from the
# model inputs.
test_ids = test[ID_COL]
X_test = test.drop(columns=[ID_COL])

# Training side: the target is pulled out and neither it nor the identifier
# stays among the features.
y = train[TARGET_COL]
X_full = train.drop(columns=[ID_COL, TARGET_COL])

print(f"\nTarget stats - Mean: {y.mean():.2f}, Std: {y.std():.2f}")
print(f"Target range: [{y.min():.2f}, {y.max():.2f}]")

## Part 2: Feature Engineering

### 2.1 Date Feature Engineering

Extract temporal and cyclical features from date information to capture seasonality effects on milk production.

In [None]:
def extract_date_features(df, date_col):
    """Expand a date column into calendar and cyclical features.

    Works on a copy of ``df``; the caller's frame is left untouched.

    Args:
        df: DataFrame containing ``date_col``.
        date_col: Name of the column to parse with ``pd.to_datetime``.

    Returns:
        A new DataFrame without ``date_col`` and with these columns appended:
        ``date_month``, ``date_quarter``, ``date_day_of_week``,
        ``date_day_of_year``, ``date_week_of_year``, ``date_is_weekend``,
        sin/cos encodings of month, quarter and ISO week, and ``date_season``
        (1 for Dec-Feb, 2 for Mar-May, 3 for Jun-Aug, 0 for Sep-Nov).
    """
    df = df.copy()
    dates = pd.to_datetime(df[date_col])

    df['date_month'] = dates.dt.month
    df['date_quarter'] = dates.dt.quarter
    df['date_day_of_week'] = dates.dt.dayofweek
    df['date_day_of_year'] = dates.dt.dayofyear

    # isocalendar() returns the nullable UInt32 extension dtype. Left as is,
    # the week column and everything derived from it stay extension-typed and
    # are skipped by dtype filters that look for plain int64/float64. Convert
    # to an ordinary NumPy dtype: int64 when every date parsed, float64 (NaN)
    # when some are missing.
    iso_week = dates.dt.isocalendar().week
    plain_dtype = 'float64' if iso_week.isna().any() else 'int64'
    df['date_week_of_year'] = iso_week.astype(plain_dtype)

    # Monday is 0, so 5 and 6 are Saturday and Sunday.
    df['date_is_weekend'] = (df['date_day_of_week'] >= 5).astype(int)

    # Cyclical encodings keep period boundaries adjacent (December next to
    # January, Q4 next to Q1, last week next to first week).
    df['date_month_sin'] = np.sin(2 * np.pi * df['date_month'] / 12)
    df['date_month_cos'] = np.cos(2 * np.pi * df['date_month'] / 12)
    df['date_quarter_sin'] = np.sin(2 * np.pi * df['date_quarter'] / 4)
    df['date_quarter_cos'] = np.cos(2 * np.pi * df['date_quarter'] / 4)
    df['date_week_sin'] = np.sin(2 * np.pi * df['date_week_of_year'] / 52)
    df['date_week_cos'] = np.cos(2 * np.pi * df['date_week_of_year'] / 52)

    # Shift December to the front so each block of three months shares a code.
    df['date_season'] = ((df['date_month'] % 12 + 3) // 3) % 4

    return df.drop(columns=[date_col])

# Run the identical date expansion on the training and test features.
X_full, X_test = (
    extract_date_features(frame, 'Date') for frame in (X_full, X_test)
)

date_features = [name for name in X_full.columns if name.startswith('date_')]
print(f"Created {len(date_features)} date features: {date_features}")

### 2.2 Farm Statistical Features

Create safe statistical features from Farm_ID without target leakage.

In [None]:
# Count-based descriptors of each farm, all learned from the training rows
# and never from the target. Test farms that do not occur in training receive
# the training median of the respective feature.
if 'Farm_ID' in X_full.columns:
    farm_counts = X_full['Farm_ID'].value_counts()
    n_train_rows = len(X_full)

    # farm_size: number of training rows recorded for the farm.
    farm_sizes = farm_counts.to_dict()
    X_full['farm_size'] = X_full['Farm_ID'].map(farm_sizes)
    size_fallback = X_full['farm_size'].median()
    X_test['farm_size'] = X_test['Farm_ID'].map(farm_sizes).fillna(size_fallback)

    # farm_frequency: farm_size as a share of the training set.
    X_full['farm_frequency'] = X_full['farm_size'] / n_train_rows
    X_test['farm_frequency'] = X_test['farm_size'] / n_train_rows

    # farm_rank: dense rank by row count, 1 being the farm with most rows.
    farm_rank = farm_counts.rank(method='dense', ascending=False).to_dict()
    X_full['farm_rank'] = X_full['Farm_ID'].map(farm_rank)
    rank_fallback = X_full['farm_rank'].median()
    X_test['farm_rank'] = X_test['Farm_ID'].map(farm_rank).fillna(rank_fallback)

    # farm_diversity: number of distinct cattle IDs seen per farm.
    farm_diversity = train.groupby('Farm_ID')[ID_COL].nunique().to_dict()
    X_full['farm_diversity'] = X_full['Farm_ID'].map(farm_diversity)
    diversity_fallback = X_full['farm_diversity'].median()
    X_test['farm_diversity'] = (
        X_test['Farm_ID'].map(farm_diversity).fillna(diversity_fallback)
    )

    # farm_encoded: integer label, fitted on train and test together so that
    # every test farm has a code.
    all_farms = pd.concat([X_full['Farm_ID'], X_test['Farm_ID']])
    le_farm = LabelEncoder().fit(all_farms.astype(str))
    X_full['farm_encoded'] = le_farm.transform(X_full['Farm_ID'].astype(str))
    X_test['farm_encoded'] = le_farm.transform(X_test['Farm_ID'].astype(str))

    # The raw identifier has served its purpose.
    X_full = X_full.drop(columns='Farm_ID')
    X_test = X_test.drop(columns='Farm_ID')

    print("Created 5 farm features: farm_size, farm_frequency, farm_rank, farm_diversity, farm_encoded")

### 2.3 Domain-Driven Interaction Features

Create interaction features based on dairy science knowledge.

In [None]:
# Each entry pairs the raw columns a group of features needs with the recipes
# that build them. A group is skipped entirely when any required column is
# missing from the training features; otherwise every recipe is applied to
# the training and test frames alike.
interaction_groups = [
    (('Age_Months', 'Weight_kg'), {
        'age_x_weight': lambda d: d['Age_Months'] * d['Weight_kg'],
        # The small constant guards against division by zero.
        'age_weight_ratio': lambda d: d['Age_Months'] / (d['Weight_kg'] + 1e-5),
    }),
    (('Parity', 'Age_Months'), {
        'parity_x_age': lambda d: d['Parity'] * d['Age_Months'],
    }),
    (('Temperature_Celsius', 'Humidity_Percent'), {
        'heat_stress': lambda d: d['Temperature_Celsius'] * d['Humidity_Percent'] / 100,
        'temp_squared': lambda d: d['Temperature_Celsius'] ** 2,
    }),
    (('Feed_Quantity_kg', 'Weight_kg'), {
        'feed_per_weight': lambda d: d['Feed_Quantity_kg'] / (d['Weight_kg'] + 1e-5),
    }),
    (('Feed_Protein_Percent', 'Feed_Quantity_kg'), {
        'protein_intake': lambda d: d['Feed_Protein_Percent'] * d['Feed_Quantity_kg'] / 100,
    }),
    (('Feed_Energy_MJ', 'Weight_kg'), {
        'energy_per_weight': lambda d: d['Feed_Energy_MJ'] / (d['Weight_kg'] + 1e-5),
    }),
    (('Somatic_Cell_Count',), {
        'scc_log': lambda d: np.log1p(d['Somatic_Cell_Count']),
    }),
    (('Parity', 'date_month'), {
        'lactation_curve': lambda d: d['Parity'] * np.exp(-0.05 * d['date_month']),
    }),
    (('Weight_kg', 'Age_Months', 'Feed_Quantity_kg'), {
        'body_condition': lambda d: d['Weight_kg'] / (d['Age_Months'] + 1) * d['Feed_Quantity_kg'],
    }),
]

interactions_created = 0
for required_cols, recipes in interaction_groups:
    if not all(col in X_full.columns for col in required_cols):
        continue
    for new_col, build in recipes.items():
        X_full[new_col] = build(X_full)
        X_test[new_col] = build(X_test)
    interactions_created += len(recipes)

print(f"Created {interactions_created} interaction features")

### 2.4 Polynomial Features

Add squared terms for key numerical features to capture non-linear relationships.

In [None]:
# Candidate columns for a quadratic term; only those actually present in the
# training features are squared, in both frames.
key_squared = ['Age_Months', 'Weight_kg', 'Parity', 'Feed_Quantity_kg']
present_feats = [feat for feat in key_squared if feat in X_full.columns]

for feat in present_feats:
    sq_name = f'{feat}_sq'
    X_full[sq_name] = X_full[feat] ** 2
    X_test[sq_name] = X_test[feat] ** 2

squared_count = len(present_feats)

print(f"Added {squared_count} squared features")

## Part 3: Data Preprocessing

### 3.1 Encode Categorical Features

In [None]:
# Partition the columns by dtype. 'number' matches every numeric dtype,
# including the int32 and nullable-integer columns that pandas date accessors
# can return. Listing only int64/float64 would leave such columns out of
# numeric_features, and therefore out of the imputation that uses that list.
numeric_features = X_full.select_dtypes(include='number').columns.tolist()
categorical_features = X_full.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Label-encode each categorical column. The encoder is fitted on train and
# test together so that every test value has a code. astype(str) turns a
# missing value into the literal 'nan', which then gets a label of its own.
for col in categorical_features:
    le = LabelEncoder()
    combined = pd.concat([X_full[col].astype(str), X_test[col].astype(str)])
    le.fit(combined)
    X_full[col] = le.transform(X_full[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))
    # Once encoded the column is numeric and is handled like the others.
    numeric_features.append(col)

print(f"Encoded {len(categorical_features)} categorical features")

### 3.2 Handle Missing Values

In [None]:
print("Missing values before imputation:")
null_counts = X_full.isnull().sum()
print(null_counts[null_counts > 0])

# Medians are learned from the training features only and then applied to
# both frames, restricted to the columns listed in numeric_features.
imputer = SimpleImputer(strategy='median').fit(X_full[numeric_features])
X_full[numeric_features] = imputer.transform(X_full[numeric_features])
X_test[numeric_features] = imputer.transform(X_test[numeric_features])

print("\n Missing values imputed with median strategy")

### 3.3 Feature Scaling

In [None]:
# Standardise with statistics learned from the training features only; the
# scaler returns plain arrays, so column names and index are restored.
scaler = StandardScaler().fit(X_full)
X_full_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)
    for frame in (X_full, X_test)
)

print(f"Scaled {X_full_scaled.shape[1]} features")
print(f"\nFinal feature count: {X_full_scaled.shape[1]}")

## Part 4: Model Training and Evaluation

### 4.1 Cross-Validation Setup

In [None]:
# One shuffled 5-fold splitter, seeded, shared by every model evaluation so
# the scores are computed on the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def evaluate_model(model, X, y, name):
    """Cross-validate ``model`` and report its RMSE.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        name: Label printed in front of the score.

    Returns:
        The mean RMSE across the folds of ``kf`` (lower is better).
    """
    scores = cross_val_score(
        model, X, y,
        cv=kf,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1
    )
    # The scorer yields negated RMSE values; flip the sign of the mean. The
    # standard deviation is unaffected by the sign.
    rmse = -scores.mean()
    std = scores.std()
    # The plus-minus sign had been stored as mojibake and printed as two
    # unrelated characters.
    print(f"{name:20s} CV RMSE: {rmse:.4f} (±{std:.4f})")
    return rmse

print("Cross-validation setup: 5-fold with shuffle")

### 4.2 Train Base Models

#### 4.2.1 XGBoost (V7 Configuration)

In [None]:
print("Training XGBoost with V7's optimized parameters...")
print("Key params: lr=0.015, subsample=0.68, reg_alpha=1.5, reg_lambda=3.5")

# Hyperparameters gathered in one mapping and unpacked into the estimator.
xgb_params = {
    'n_estimators': 800,
    'max_depth': 5,
    'learning_rate': 0.015,     # slow learning rate, paired with many trees
    'subsample': 0.68,          # row subsampling per tree
    'colsample_bytree': 0.68,   # column subsampling per tree
    'reg_alpha': 1.5,           # L1 penalty
    'reg_lambda': 3.5,          # L2 penalty
    'min_child_weight': 6,
    'gamma': 0.12,              # minimum loss reduction required to split
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
    'tree_method': 'hist',
}
xgb_model = xgb.XGBRegressor(**xgb_params)

xgb_cv = evaluate_model(model=xgb_model, X=X_full_scaled, y=y, name="XGBoost")

#### 4.2.2 LightGBM (V7 Configuration)

In [None]:
print("\nTraining LightGBM with V7's optimized parameters...")

# Gradient-boosted trees configured to mirror the XGBoost settings.
lgb_model = lgb.LGBMRegressor(
    n_estimators=800,
    max_depth=5,
    learning_rate=0.015,
    num_leaves=26,
    subsample=0.68,
    # LightGBM ignores `subsample` while subsample_freq is at its default of
    # 0; resampling every iteration makes the 0.68 row fraction take effect.
    subsample_freq=1,
    colsample_bytree=0.68,
    reg_alpha=1.5,
    reg_lambda=3.5,
    min_child_weight=6,
    min_child_samples=22,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=-1
)

lgb_cv = evaluate_model(lgb_model, X_full_scaled, y, "LightGBM")

#### 4.2.3 ExtraTrees (For Ensemble Diversity)

In [None]:
print("\nTraining ExtraTrees for ensemble diversity...")

# Randomised-tree ensemble; settings collected in a mapping and unpacked.
et_params = {
    'n_estimators': 350,
    'max_depth': 14,
    'min_samples_split': 8,
    'min_samples_leaf': 3,
    'max_features': 'sqrt',
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
}
et_model = ExtraTreesRegressor(**et_params)

et_cv = evaluate_model(model=et_model, X=X_full_scaled, y=y, name="ExtraTrees")

### 4.3 Stacking Ensemble

Combine base models using a meta-learner (Ridge regression with strong regularization).

In [None]:
print("\nBuilding Stacking Ensemble...")
print("Meta-learner: Ridge regression with alpha=15 (V7's optimal value)")

# The three base estimators feed a heavily regularised Ridge meta-learner.
meta_learner = Ridge(alpha=15.0)
base_learners = [('xgb', xgb_model), ('lgb', lgb_model), ('et', et_model)]

stacking_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
)

stack_cv = evaluate_model(model=stacking_model, X=X_full_scaled, y=y, name="Stacking")

### 4.4 Model Selection and Summary

In [None]:
# Registry of every candidate: display name -> (estimator, CV RMSE).
models = {
    'XGBoost': (xgb_model, xgb_cv),
    'LightGBM': (lgb_model, lgb_cv),
    'ExtraTrees': (et_model, et_cv),
    'Stacking': (stacking_model, stack_cv)
}

print("\n" + "="*60)
print("MODEL PERFORMANCE SUMMARY")
print("="*60)

# Leaderboard, best (lowest RMSE) first. The estimator is not needed here.
for name, (_, cv) in sorted(models.items(), key=lambda item: item[1][1]):
    print(f"{name:20s}: {cv:.4f} RMSE")

# The winner is the entry with the smallest cross-validated RMSE.
best_name = min(models, key=lambda x: models[x][1])
best_model, best_cv = models[best_name]

print("\n" + "="*60)
# The trophy emoji had been stored as mojibake and printed as garbage.
print(f"🏆 BEST MODEL: {best_name}")
print(f"   Cross-Validation RMSE: {best_cv:.4f}")
print("="*60)

## Part 5: Final Prediction and Submission

### 5.1 Train on Full Dataset

In [None]:
# Fit the selected estimator on every training row. Up to this point each
# candidate was only passed to evaluate_model for cross-validated scoring.
print(f"\nTraining {best_name} on full training dataset...")
best_model.fit(X_full_scaled, y)
print("Training complete")

### 5.2 Generate Predictions

In [None]:
# Predict on the scaled test features with the estimator fitted above.
print("Generating predictions on test set...")
predictions = best_model.predict(X_test_scaled)

# Summary statistics of the predictions, as a sanity check against the
# target statistics printed when the data was loaded.
print("\nPrediction Statistics:")
print(f"  Mean:   {predictions.mean():.4f}")
print(f"  Median: {np.median(predictions):.4f}")
print(f"  Std:    {predictions.std():.4f}")
print(f"  Min:    {predictions.min():.4f}")
print(f"  Max:    {predictions.max():.4f}")

### 5.3 Create Submission File

In [None]:
# A timestamp in the file name keeps earlier submissions from being
# overwritten.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
submission_filename = f'submission_v9_final_{timestamp}.csv'
submission_path = SUBMISSION_DIR / submission_filename

# Two columns: the test identifiers and the predicted target.
submission = pd.DataFrame({ID_COL: test_ids, TARGET_COL: predictions})
submission.to_csv(submission_path, index=False)

print(f"\n Submission file created: {submission_filename}")
print(f" Path: {submission_path}")

# Preview the head of the file that was just written.
print("\nFirst 10 predictions:")
display(submission.head(10))