In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/synth-ai-nitkkr/sample_submission.csv
/kaggle/input/synth-ai-nitkkr/train.csv
/kaggle/input/synth-ai-nitkkr/test.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict

# Set random seed for reproducibility
np.random.seed(42)

# Load Data
print("Loading data...")
train_data = pd.read_csv('/kaggle/input/synth-ai-nitkkr/train.csv')
test_data = pd.read_csv('/kaggle/input/synth-ai-nitkkr/test.csv')

# Save test IDs for submission
test_ids = test_data["Id"]

# Drop ID Column
train_data.drop(columns=["Id"], inplace=True)
test_data.drop(columns=["Id"], inplace=True)

print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# =======================================
# 1. CONSISTENT DATA PREPROCESSING
# =======================================

# 1.1 Handle percentage columns
percent_cols = ["Bone_Health_Calcium", "VitA_Percentage", "Iron_Concentration", "VitC_Content"]
for col in percent_cols:
    if col in train_data.columns:
        train_data[col] = train_data[col].str.replace('%', '', regex=False).astype(float) / 100
    if col in test_data.columns:
        test_data[col] = test_data[col].str.replace('%', '', regex=False).astype(float) / 100

# 1.2 Handle type inconsistencies between train and test
# Standardize types to float64 where needed
numeric_string_cols = ['Sweetness_Index', 'Stimulant_Caffeine', 'Heart_Risk_Index', 'Health_Rating']
for col in numeric_string_cols:
    if col in train_data.columns:
        train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
    if col in test_data.columns:
        test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

# 1.3 Normalize string columns
for col in train_data.select_dtypes(include=['object']).columns:
    train_data[col] = train_data[col].astype(str).str.strip()
for col in test_data.select_dtypes(include=['object']).columns:
    test_data[col] = test_data[col].astype(str).str.strip()

# 1.4 Identify column types
train_num_cols = train_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
train_cat_cols = train_data.select_dtypes(include=['object']).columns.tolist()

# Filter to common columns
num_cols = [col for col in train_num_cols if col in test_data.columns and col != 'Fat_Content']
cat_cols = [col for col in train_cat_cols if col in test_data.columns]

print(f"Numerical features: {len(num_cols)}")
print(f"Categorical features: {len(cat_cols)}")

# =======================================
# 2. ROBUST FEATURE ENGINEERING
# =======================================
print("Creating robust features...")

# 2.1 Direct Fat Content Predictors
# Saturated lipids are highly predictive of fat content
if 'Saturated_Lipids' in train_data.columns:
    train_data['Sat_Lipids_Squared'] = np.square(train_data['Saturated_Lipids'])
    test_data['Sat_Lipids_Squared'] = np.square(test_data['Saturated_Lipids'])

# 2.2 Fat-to-X ratios
# These ratios capture important nutritional relationships
if 'Saturated_Lipids' in train_data.columns:
    # Fat to Carb ratio
    if 'Carb_Count' in train_data.columns:
        train_data['Fat_to_Carb'] = train_data['Saturated_Lipids'] / (train_data['Carb_Count'] + 1e-8)
        test_data['Fat_to_Carb'] = test_data['Saturated_Lipids'] / (test_data['Carb_Count'] + 1e-8)
        
        # Replace infinities
        train_data['Fat_to_Carb'].replace([np.inf, -np.inf], 0, inplace=True)
        test_data['Fat_to_Carb'].replace([np.inf, -np.inf], 0, inplace=True)
    
    # Fat to Protein ratio
    if 'Protein_Quantity' in train_data.columns:
        train_data['Fat_to_Protein'] = train_data['Saturated_Lipids'] / (train_data['Protein_Quantity'] + 1e-8)
        test_data['Fat_to_Protein'] = test_data['Saturated_Lipids'] / (test_data['Protein_Quantity'] + 1e-8)
        
        # Replace infinities
        train_data['Fat_to_Protein'].replace([np.inf, -np.inf], 0, inplace=True)
        test_data['Fat_to_Protein'].replace([np.inf, -np.inf], 0, inplace=True)

# 2.3 Oil and fat interaction
if 'Saturated_Lipids' in train_data.columns and 'Processed_Oil_Content' in train_data.columns:
    train_data['Fat_Oil_Interaction'] = train_data['Saturated_Lipids'] * train_data['Processed_Oil_Content']
    test_data['Fat_Oil_Interaction'] = test_data['Saturated_Lipids'] * test_data['Processed_Oil_Content']

# 2.4 Robust Target Encoding (cross-validation based to prevent leakage)
# Use K-fold target encoding for drink name and type
if 'Drink_Name' in train_data.columns and 'Fat_Content' in train_data.columns:
    # First, create a mapping of global averages to use as fallback for test set
    global_means = train_data.groupby('Drink_Name')['Fat_Content'].mean().to_dict()
    global_mean = train_data['Fat_Content'].mean()
    
    # Use 5-fold cross-validation to create leak-free target encodings
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    train_data['Drink_Mean_Fat'] = 0
    
    for train_idx, val_idx in kf.split(train_data):
        # For each fold, encode validation set using only training data
        X_fold, X_val = train_data.iloc[train_idx], train_data.iloc[val_idx]
        means = X_fold.groupby('Drink_Name')['Fat_Content'].mean().to_dict()
        
        # Apply encoding to validation set
        X_val_drink_names = X_val['Drink_Name'].values
        val_encodings = np.array([means.get(name, global_mean) for name in X_val_drink_names])
        train_data.loc[val_idx, 'Drink_Mean_Fat'] = val_encodings
    
    # For test set, use global mapping from entire training set
    test_data['Drink_Mean_Fat'] = test_data['Drink_Name'].map(global_means).fillna(global_mean)

# Do the same for Drink_Type
if 'Drink_Type' in train_data.columns and 'Fat_Content' in train_data.columns:
    global_type_means = train_data.groupby('Drink_Type')['Fat_Content'].mean().to_dict()
    
    train_data['Type_Mean_Fat'] = 0
    for train_idx, val_idx in kf.split(train_data):
        X_fold, X_val = train_data.iloc[train_idx], train_data.iloc[val_idx]
        means = X_fold.groupby('Drink_Type')['Fat_Content'].mean().to_dict()
        
        X_val_types = X_val['Drink_Type'].values
        val_encodings = np.array([means.get(type_, global_mean) for type_ in X_val_types])
        train_data.loc[val_idx, 'Type_Mean_Fat'] = val_encodings
    
    test_data['Type_Mean_Fat'] = test_data['Drink_Type'].map(global_type_means).fillna(global_mean)

# 2.5 Add Health-related aggregations
if 'Health_Category' in train_data.columns and 'Fat_Content' in train_data.columns:
    health_means = train_data.groupby('Health_Category')['Fat_Content'].mean().to_dict()
    
    train_data['Health_Mean_Fat'] = 0
    for train_idx, val_idx in kf.split(train_data):
        X_fold, X_val = train_data.iloc[train_idx], train_data.iloc[val_idx]
        means = X_fold.groupby('Health_Category')['Fat_Content'].mean().to_dict()
        
        X_val_health = X_val['Health_Category'].values
        val_encodings = np.array([means.get(health, global_mean) for health in X_val_health])
        train_data.loc[val_idx, 'Health_Mean_Fat'] = val_encodings
    
    test_data['Health_Mean_Fat'] = test_data['Health_Category'].map(health_means).fillna(global_mean)

# =======================================
# 3. MISSING VALUE HANDLING
# =======================================
print("Handling missing values...")

# For numerical columns
for col in num_cols:
    if col in train_data.columns and col in test_data.columns:
        median_val = train_data[col].median()
        train_data[col] = train_data[col].fillna(median_val)
        test_data[col] = test_data[col].fillna(median_val)

# For categorical columns
for col in cat_cols:
    if col in train_data.columns and col in test_data.columns:
        mode_val = train_data[col].mode()[0]
        train_data[col] = train_data[col].fillna(mode_val)
        test_data[col] = test_data[col].fillna(mode_val)

# =======================================
# 4. CATEGORICAL ENCODING
# =======================================
print("Encoding categorical features...")

# Create copies for encoding
encoded_train = train_data.copy()
encoded_test = test_data.copy()

# One-hot encoding for low-cardinality categories
ohe_cols = ['Drink_Type', 'Health_Category']
ohe_cols = [col for col in ohe_cols if col in cat_cols]

for col in ohe_cols:
    # Get unique values from both datasets
    all_categories = set(encoded_train[col].unique()).union(set(encoded_test[col].unique()))
    all_categories.discard('')
    
    print(f"One-hot encoding {col} with {len(all_categories)} categories")
    
    # Create one-hot encoded columns (drop_first=True)
    for i, category in enumerate(sorted(all_categories)):
        if i > 0:  # Skip first category
            col_name = f"{col}_{category}"
            encoded_train[col_name] = (encoded_train[col] == category).astype(int)
            encoded_test[col_name] = (encoded_test[col] == category).astype(int)

# Frequency encoding for high-cardinality features
freq_cols = ['Flavor_Variant']
freq_cols = [col for col in freq_cols if col in cat_cols]

for col in freq_cols:
    freq_map = encoded_train[col].value_counts(normalize=True).to_dict()
    
    encoded_train[f'{col}_freq'] = encoded_train[col].map(freq_map)
    encoded_test[f'{col}_freq'] = encoded_test[col].map(freq_map).fillna(1/len(encoded_train))

# Drop original categorical columns after encoding
for col in cat_cols:
    if col in encoded_train.columns:
        encoded_train.drop(columns=[col], inplace=True)
    if col in encoded_test.columns:
        encoded_test.drop(columns=[col], inplace=True)

# =======================================
# 5. FEATURE ALIGNMENT
# =======================================
print("Aligning features...")

# Ensure both datasets have the same columns
train_cols = encoded_train.columns
test_cols = encoded_test.columns

# Find columns that are in train but not in test
missing_test_cols = [col for col in train_cols if col not in test_cols and col != 'Fat_Content']
for col in missing_test_cols:
    encoded_test[col] = 0

# Find columns that are in test but not in train
missing_train_cols = [col for col in test_cols if col not in train_cols]
for col in missing_train_cols:
    encoded_train[col] = 0

# Final features
all_features = sorted([col for col in encoded_train.columns if col != 'Fat_Content'])

# Create final datasets
X_train = encoded_train[all_features]
y_train = encoded_train['Fat_Content']
X_test = encoded_test[all_features]

# Fix any remaining NaN values
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

print(f"Final feature count: {X_train.shape[1]}")

# =======================================
# 6. CROSS-VALIDATED MODEL TRAINING
# =======================================
print("Training models with cross-validation...")

# Parameters optimized for minimizing MAE
params = {
    'n_estimators': 500,
    'learning_rate': 0.01,
    'max_depth': 5,
    'min_child_weight': 2,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'gamma': 0.05,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'objective': 'reg:absoluteerror',  # Directly optimize for MAE
    'random_state': 42
}

# Cross-validation for more reliable MAE estimate
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    XGBRegressor(**params),
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

print(f"Cross-validation MAE: {-cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# =======================================
# 7. ENSEMBLE OF K-FOLD MODELS
# =======================================
print("Creating K-fold model ensemble...")

# Train K separate models on different data splits for a robust ensemble
k_fold_models = []
k_fold_predictions = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train)):
    print(f"Training model for fold {fold+1}/5")
    X_fold_train, y_fold_train = X_train.iloc[train_idx], y_train.iloc[train_idx]
    
    # Train model on this fold
    fold_model = XGBRegressor(**params)
    fold_model.fit(X_fold_train, y_fold_train)
    k_fold_models.append(fold_model)
    
    # Generate predictions for test set
    fold_preds = fold_model.predict(X_test)
    k_fold_predictions += fold_preds / 5  # Average predictions across folds

# =======================================
# 8. TRAIN FINAL MODEL ON ALL DATA
# =======================================
print("Training final model on all data...")

final_model = XGBRegressor(**params)
final_model.fit(X_train, y_train, verbose=True)
final_predictions = final_model.predict(X_test)

# =======================================
# 9. BLEND ENSEMBLE AND FINAL MODEL
# =======================================
print("Blending ensemble and final model predictions...")

# Blend ensemble predictions (60%) and final model predictions (40%)
# This combines the robustness of cross-validation with the power of using all data
blended_predictions = 0.6 * k_fold_predictions + 0.4 * final_predictions

# =======================================
# 10. GENERATE SUBMISSION
# =======================================
# Create submission file
submission = pd.DataFrame({
    "Id": test_ids,
    "Fat_Content": blended_predictions
})

# Save to CSV
submission.to_csv("ensemble_submission.csv", index=False)
print("Predictions saved to ensemble_submission.csv ✅")

# Also save individual model predictions for comparison
submission_ensemble = pd.DataFrame({
    "Id": test_ids,
    "Fat_Content": k_fold_predictions
})
submission_ensemble.to_csv("kfold_ensemble_submission.csv", index=False)

submission_final = pd.DataFrame({
    "Id": test_ids,
    "Fat_Content": final_predictions
})
submission_final.to_csv("submission.csv", index=False)

print("All predictions saved. ")

Loading data...
Train data shape: (4000, 21)
Test data shape: (1000, 20)
Numerical features: 16
Categorical features: 4
Creating robust features...
Handling missing values...
Encoding categorical features...
One-hot encoding Drink_Type with 9 categories
One-hot encoding Health_Category with 4 categories
Aligning features...
Final feature count: 35
Training models with cross-validation...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Fat_to_Carb'].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Fat_to_Carb'].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inter

Cross-validation MAE: 20.7274 ± 1.0377
Creating K-fold model ensemble...
Training model for fold 1/5
Training model for fold 2/5
Training model for fold 3/5
Training model for fold 4/5
Training model for fold 5/5
Training final model on all data...
Blending ensemble and final model predictions...
Predictions saved to ensemble_submission.csv ✅
All predictions saved. 
