# Advanced Calorie Prediction Ensemble Solution

This notebook implements a two-level stacked ensemble for calorie prediction: tree-based Level 1 models feed Level 2 meta-learners, with the goal of reaching a cross-validated RMSLE below 0.05.

## Structure:
1. **Data Loading & Basic Setup**
2. **Feature Engineering Functions**
3. **Target Encoding Functions**
4. **Feature Selection Functions**
5. **Ensemble Model Training**
6. **Main Execution Pipeline**
7. **Results Analysis**

##  Imports and Utility Functions
Import all required libraries and define the RMSLE scoring function.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import RobustScaler, QuantileTransformer, StandardScaler
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import Ridge, ElasticNet, BayesianRidge
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
import warnings
warnings.filterwarnings('ignore')

def rmsle_score(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    Both inputs are floored at 1e-8 before scoring, so zero or negative
    values are never passed to ``mean_squared_log_error``.
    """
    floor = 1e-8
    safe_true, safe_pred = (np.maximum(values, floor) for values in (y_true, y_pred))
    msle = mean_squared_log_error(safe_true, safe_pred)
    return np.sqrt(msle)

##  Data Loading Functions
Functions to load and preprocess the training and test datasets with proper data type conversion.

In [None]:
def load_data():
    """Read ``train.csv`` and ``test.csv`` from the working directory.

    The header row of each file is skipped and replaced by the fixed column
    names below. Every column except ``Sex`` is passed through
    ``pd.to_numeric(errors='coerce')``, so unparseable entries become NaN.

    Returns:
        tuple: ``(train, test)`` DataFrames; only ``train`` carries ``Calories``.
    """
    feature_names = ['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
    target_name = 'Calories'

    train = pd.read_csv('train.csv', names=feature_names + [target_name], skiprows=1)
    test = pd.read_csv('test.csv', names=feature_names, skiprows=1)

    # 'Sex' is the only text column; everything else is coerced to numeric.
    for frame in (train, test):
        for name in frame.columns:
            if name == 'Sex':
                continue
            frame[name] = pd.to_numeric(frame[name], errors='coerce')

    print(f"Training data shape: {train.shape}")
    print(f"Test data shape: {test.shape}")
    return train, test

##  Core Feature Engineering
Advanced physiological feature engineering including BMI, heart rate zones, body surface area, and metabolic calculations.

In [None]:
def ultra_feature_engineering(df, is_test=False):
    """Derive physiological and interaction features from the raw columns.

    Works on a copy, so the caller's frame is left untouched.

    Args:
        df: DataFrame with ``Sex``, ``Age``, ``Height``, ``Weight``,
            ``Duration`` and ``Heart_Rate`` columns. Height is divided by
            100 before squaring (presumably centimetres -> metres; confirm
            against the data source).
        is_test: Accepted for interface compatibility; not used here.

    Returns:
        A new DataFrame with the engineered columns appended.
    """
    df = df.copy()

    # Core physiological features
    df['BMI'] = df['Weight'] / ((df['Height'] / 100) ** 2)
    df['Sex_bin'] = (df['Sex'] == 'female').astype(int)  # 1 = female, 0 = anything else
    df['Max_HR'] = 220 - df['Age']
    df['HR_Intensity'] = df['Heart_Rate'] / df['Max_HR']
    df['HR_Reserve'] = df['Max_HR'] - df['Heart_Rate']

    # Advanced physiological calculations
    # Body surface area uses the Du Bois coefficients (weight^0.425 * height^0.725).
    df['Body_Surface_Area'] = 0.007184 * (df['Weight'] ** 0.425) * (df['Height'] ** 0.725)
    df['Lean_Body_Mass'] = df['Weight'] * (1.10 - 0.0128 * df['BMI'])
    df['Basal_Metabolic_Rate'] = np.where(
        df['Sex_bin'] == 1,  # Female
        655 + (9.6 * df['Weight']) + (1.8 * df['Height']) - (4.7 * df['Age']),
        66 + (13.7 * df['Weight']) + (5 * df['Height']) - (6.8 * df['Age'])  # Male
    )

    # Heart rate zones (more granular).
    # pd.cut yields NaN outside the bin range, and casting NaN to int raises.
    # Intensities above the last edge are therefore clipped into the top zone
    # and any remaining NaN (non-positive or missing intensity) falls back to
    # zone 0, the same convention add_binning_features uses.
    zone_edges = [0, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 1.0]
    zone_codes = pd.cut(
        df['HR_Intensity'].clip(upper=zone_edges[-1]),
        bins=zone_edges,
        labels=False,
    )
    df['HR_Zone_Fine'] = zone_codes.fillna(0).astype(int)

    # Advanced interactions
    df['Weight_Duration'] = df['Weight'] * df['Duration']
    df['HR_Duration'] = df['Heart_Rate'] * df['Duration']
    df['Weight_HR'] = df['Weight'] * df['Heart_Rate']
    df['BMI_Duration'] = df['BMI'] * df['Duration']
    df['BMI_HR'] = df['BMI'] * df['Heart_Rate']
    df['Age_HR'] = df['Age'] * df['Heart_Rate']
    df['BSA_Duration'] = df['Body_Surface_Area'] * df['Duration']
    df['BMR_Duration'] = df['Basal_Metabolic_Rate'] * df['Duration'] / 1440  # Per minute (1440 min/day)
    df['LBM_Duration'] = df['Lean_Body_Mass'] * df['Duration']

    # 3-way and 4-way interactions
    df['Weight_HR_Duration'] = df['Weight'] * df['Heart_Rate'] * df['Duration']
    df['BMI_HR_Duration'] = df['BMI'] * df['Heart_Rate'] * df['Duration']
    df['Age_Weight_HR'] = df['Age'] * df['Weight'] * df['Heart_Rate']
    df['Intensity_Weight_Duration'] = df['HR_Intensity'] * df['Weight'] * df['Duration']
    df['BMR_HR_Duration'] = df['Basal_Metabolic_Rate'] * df['Heart_Rate'] * df['Duration'] / 1440

    return df

##  Specialized Feature Engineering Functions
Temperature, metabolic, polynomial, and ratio feature engineering functions for comprehensive feature creation.

In [None]:
def add_temperature_features(df):
    """Add body-temperature features to ``df`` in place and return it.

    Creates ``Temp_Diff`` (offset from 37.0), its square, cube and absolute
    value, plus an integer ``Temp_Category`` bin code.

    Args:
        df: DataFrame with a ``Body_Temp`` column.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    # Temperature features
    df['Temp_Diff'] = df['Body_Temp'] - 37.0
    df['Temp_Squared'] = df['Temp_Diff'] ** 2
    df['Temp_Cubed'] = df['Temp_Diff'] ** 3
    df['Temp_Abs'] = np.abs(df['Temp_Diff'])

    # pd.cut returns NaN outside the bin range and NaN cannot be cast to int.
    # Readings above the last edge are clipped into the top category and any
    # remaining NaN (missing or non-positive reading) falls back to category
    # 0, the same convention add_binning_features uses.
    temp_edges = [0, 37.5, 38.5, 39.5, 40.5, 42]
    temp_codes = pd.cut(
        df['Body_Temp'].clip(upper=temp_edges[-1]),
        bins=temp_edges,
        labels=False,
    )
    df['Temp_Category'] = temp_codes.fillna(0).astype(int)
    return df

def add_metabolic_features(df):
    """Add MET-based and intensity-based energy estimates to ``df`` in place.

    Reads ``HR_Intensity``, ``Weight``, ``Duration`` and ``BMR_Duration``;
    returns the same DataFrame object with five new columns.
    """
    intensity = df['HR_Intensity']
    weight = df['Weight']
    minutes = df['Duration']

    met = 3.5 + (intensity * 10)  # Metabolic equivalent
    df['MET_Estimate'] = met
    df['Calorie_Rate_MET'] = met * weight * minutes / 60
    df['Calorie_Rate_Karvonen'] = weight * 0.0175 * intensity * minutes
    df['Energy_Expenditure'] = df['BMR_Duration'] * (1 + intensity * 2)
    # Intensity per 30-minute block; a zero duration yields inf here.
    df['Workout_Efficiency'] = intensity / (minutes / 30)
    return df

def add_polynomial_features(df):
    """Add squared and cubed terms for the key columns, in place.

    ``Duration`` and ``HR_Intensity`` additionally get a fourth-power term.
    Returns the same DataFrame object.
    """
    low_orders = (('Squared', 2), ('Cubed', 3))
    with_fourth = ('Duration', 'HR_Intensity')

    for base in ('Duration', 'Weight', 'Heart_Rate', 'BMI', 'HR_Intensity'):
        for suffix, exponent in low_orders:
            df[f'{base}_{suffix}'] = df[base] ** exponent
        if base in with_fourth:
            df[f'{base}_Fourth'] = df[base] ** 4
    return df

def add_ratio_features(df):
    """Add ratio features between body measurements and workout columns, in place.

    Requires the columns produced by the earlier engineering steps
    (``BMI``, ``Body_Surface_Area``, ``Lean_Body_Mass``,
    ``Basal_Metabolic_Rate``, ``Calorie_Rate_MET``, ``HR_Intensity``).
    Returns the same DataFrame object.
    """
    weight = df['Weight']
    age_plus_one = df['Age'] + 1  # +1 keeps the denominator non-zero at age 0

    df['Weight_Height_Ratio'] = weight / df['Height']
    df['Duration_Age_Ratio'] = df['Duration'] / age_plus_one
    df['HR_Weight_Ratio'] = df['Heart_Rate'] / weight
    df['BMI_Age_Ratio'] = df['BMI'] / age_plus_one

    # Body-composition quantities expressed per kilogram of body weight.
    for source, target in (
        ('Body_Surface_Area', 'BSA_Weight_Ratio'),
        ('Lean_Body_Mass', 'LBM_Weight_Ratio'),
        ('Basal_Metabolic_Rate', 'BMR_Weight_Ratio'),
    ):
        df[target] = df[source] / weight

    met_calories = df['Calorie_Rate_MET']
    df['Calorie_per_Minute'] = met_calories / (df['Duration'] + 1)
    df['Calorie_per_KG'] = met_calories / weight
    df['Intensity_per_Age'] = df['HR_Intensity'] / (df['Age'] / 40)  # Normalized by typical age
    return df

##  Categorical and Transformation Features
Binning, categorical grouping, logarithmic transformations, and complete feature engineering pipeline.

In [None]:
def _clipped_bin_codes(values, edges):
    """Return integer bin codes for ``values``, clipping out-of-range entries.

    Values below the first edge (or exactly on it) land in bin 0 and values
    above the last edge land in the top bin. Missing values map to 0.
    """
    edges = np.asarray(edges, dtype=float)
    clipped = values.clip(lower=edges[0], upper=edges[-1])
    codes = pd.cut(clipped, bins=edges, labels=False, include_lowest=True)
    return codes.fillna(0).astype(int)


def add_binning_features(df):
    """Add integer bin codes for the main columns and pairwise bin crosses, in place.

    Without clipping, ``pd.cut`` returns NaN for values beyond the last edge
    and the ``fillna(0)`` fallback would file them under the *lowest* bin
    (e.g. a weight above 130 would share a code with 40-50). Clipping first
    sends them to the top bin instead; values at or below the first edge
    and missing values still map to 0.

    Args:
        df: DataFrame with ``Age``, ``Weight``, ``Duration``, ``Heart_Rate``
            and ``BMI`` columns.

    Returns:
        The same DataFrame object, with the group columns added.
    """
    # Binning features (very granular)
    df['Age_Group'] = _clipped_bin_codes(df['Age'], np.arange(20, 81, 5))
    df['Weight_Group'] = _clipped_bin_codes(df['Weight'], np.arange(40, 131, 10))
    df['Duration_Group'] = _clipped_bin_codes(df['Duration'], np.arange(0, 31, 3))
    df['HR_Group'] = _clipped_bin_codes(df['Heart_Rate'], np.arange(60, 131, 7))
    df['BMI_Group'] = _clipped_bin_codes(df['BMI'], [0, 18.5, 22, 25, 28, 30, 35, 100])

    # Interaction between categorical features.
    # The second factor never exceeds 9, so "first * 10 + second" is unique per pair.
    df['Age_Weight_Group'] = df['Age_Group'] * 10 + df['Weight_Group']
    df['Age_Duration_Group'] = df['Age_Group'] * 10 + df['Duration_Group']
    df['Weight_Duration_Group'] = df['Weight_Group'] * 10 + df['Duration_Group']

    return df

def add_log_sqrt_features(df):
    """Add ``log1p`` and square-root versions of four base columns, in place.

    Returns the same DataFrame object with ``Log_<col>`` and ``Sqrt_<col>``
    added for ``Weight``, ``Duration``, ``Heart_Rate`` and ``BMI``.
    """
    transforms = (('Log', np.log1p), ('Sqrt', np.sqrt))
    for base in ('Weight', 'Duration', 'Heart_Rate', 'BMI'):
        for prefix, func in transforms:
            df[f'{prefix}_{base}'] = func(df[base])
    return df

def complete_feature_engineering(df, is_test=False):
    """Run every feature-engineering step, then clean up inf/NaN values.

    Infinite values are converted to NaN. If any NaN remain, numeric columns
    other than ``id`` are filled: columns whose name contains ``Group`` or
    ``Category`` get 0, all others get their own median.

    Args:
        df: Raw DataFrame as returned by ``load_data``.
        is_test: Forwarded to ``ultra_feature_engineering``.

    Returns:
        A new DataFrame with all engineered columns.
    """
    print(f"Starting feature engineering... Input shape: {df.shape}")

    # Apply all feature engineering steps; the first one copies the frame,
    # the remaining ones add columns to that copy.
    df = ultra_feature_engineering(df, is_test)
    later_steps = (
        add_temperature_features,
        add_metabolic_features,
        add_polynomial_features,
        add_ratio_features,
        add_binning_features,
        add_log_sqrt_features,
    )
    for step in later_steps:
        df = step(df)

    # Handle NaN and infinite values
    print("Checking for NaN and infinite values...")

    # Replace infinite values with NaN first
    df = df.replace([np.inf, -np.inf], np.nan)

    # Count NaN values
    nan_counts = df.isnull().sum()
    total_nans = nan_counts.sum()

    if total_nans > 0:
        print(f"Found {total_nans} NaN values across {(nan_counts > 0).sum()} columns")

        # NOTE(review): the target column is numeric too, so a missing
        # 'Calories' value would be median-filled here - confirm intended.
        for col in df.select_dtypes(include=[np.number]).columns:
            if col == 'id' or not df[col].isnull().any():
                continue  # Skip the ID column and columns without gaps

            is_categorical = 'Group' in col or 'Category' in col
            # Categorical codes fall back to 0, continuous features to the median
            fill_value = 0 if is_categorical else df[col].median()
            df[col] = df[col].fillna(fill_value)

        print("NaN values handled successfully!")

    print(f"Feature engineering complete. Output shape: {df.shape}")
    return df

##  Advanced Target Encoding
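Out-of-fold target encoding with multiple statistics (mean, std, median): each training row is encoded using statistics computed on the folds that exclude it, which limits target leakage and overfitting.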

In [None]:
def create_advanced_target_encoding(train_df, test_df, cat_cols, target_col, cv_folds=5):
    """Add out-of-fold target statistics for each categorical column.

    For every column in ``cat_cols`` three columns are added to both frames:
    ``<col>_target_mean``, ``<col>_target_std`` and ``<col>_target_median``.
    Training rows are encoded with statistics computed on the other folds;
    test rows are encoded with statistics from the full training set.
    Categories without a statistic (unseen in the fitting rows, or a
    single-row group for ``std``) fall back to the corresponding statistic of
    the whole training target.

    All lookups and assignments are positional. Assigning the result of a
    ``merge`` through ``.loc[val_idx]`` would align on index labels instead,
    and because ``merge`` resets the index that fills most rows with NaN and
    gives the rest another row's value.

    Args:
        train_df: Training DataFrame containing ``cat_cols`` and ``target_col``.
        test_df: Test DataFrame containing ``cat_cols``.
        cat_cols: Names of the categorical columns to encode.
        target_col: Name of the target column in ``train_df``.
        cv_folds: Number of KFold splits used for the training encoding.

    Returns:
        tuple: ``(encoded_train, encoded_test)`` copies of the inputs with
        the encoding columns appended.
    """
    encoded_train = train_df.copy()
    encoded_test = test_df.copy()

    stat_names = ['mean', 'std', 'median']

    # The splitter is seeded, so materialising the folds once gives every
    # column the same split that repeated kf.split() calls would produce.
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    folds = list(kf.split(train_df))

    # Global statistics (fallback values); independent of the column.
    target = train_df[target_col]
    fallback = {
        'mean': target.mean(),
        'std': target.std(),
        'median': target.median(),
    }

    print(f"Creating target encoding for {len(cat_cols)} categorical columns...")

    n_train = len(train_df)
    for col in cat_cols:
        # Cross-validation encoding for train, collected in positional buffers
        oof = {stat: np.full(n_train, np.nan) for stat in stat_names}

        for train_idx, val_idx in folds:
            fold_stats = train_df.iloc[train_idx].groupby(col)[target_col].agg(stat_names)

            # Look up each validation row's category; unseen ones become NaN
            val_keys = train_df[col].iloc[val_idx].to_numpy()
            looked_up = fold_stats.reindex(val_keys)
            for stat in stat_names:
                oof[stat][val_idx] = looked_up[stat].fillna(fallback[stat]).to_numpy()

        for stat in stat_names:
            encoded_train[f'{col}_target_{stat}'] = oof[stat]

        # Encoding for test (using full train data)
        full_stats = train_df.groupby(col)[target_col].agg(stat_names)
        test_lookup = full_stats.reindex(test_df[col].to_numpy())
        for stat in stat_names:
            encoded_test[f'{col}_target_{stat}'] = (
                test_lookup[stat].fillna(fallback[stat]).to_numpy()
            )

    print("Target encoding complete!")
    return encoded_train, encoded_test

##  Feature Selection
Univariate feature selection: `SelectKBest` with F-regression scores keeps the k features that are most strongly linearly associated with the target.

In [None]:
def select_best_features(X, y, n_features=100):
    """Select the top features by univariate F-regression score.

    NaN and +/-inf entries are both treated as missing and replaced by the
    median of the finite values in their column (0 if the column has none).
    Imputation never drops a column, so the returned indices always refer to
    the columns of the ``X`` that was passed in.

    Args:
        X: 2-D numeric array of shape (n_samples, n_features).
        y: Target values, one per row of ``X``.
        n_features: Number of features to keep; capped at ``X.shape[1]``.

    Returns:
        tuple: ``(X_selected, selected_features)`` - the reduced (and, if
        needed, imputed) matrix and the integer indices of the kept columns.
    """
    print(f"Selecting top {n_features} features from {X.shape[1]} total features...")

    X = np.asarray(X, dtype=np.float64)

    nan_mask = np.isnan(X)
    inf_mask = np.isinf(X)
    has_nan = nan_mask.any()
    has_inf = inf_mask.any()

    if has_nan:
        print("Warning: NaN values detected in features. Applying imputation...")
    if has_inf:
        print("Warning: Infinite values detected. Replacing with finite values...")

    if has_nan or has_inf:
        # Replacing inf with the largest float would overflow inside the
        # F-test and would turn -inf into a huge positive number, so inf is
        # imputed like any other missing value.
        missing = nan_mask | inf_mask
        X = np.where(missing, np.nan, X)
        medians = np.nanmedian(X, axis=0)
        medians = np.where(np.isnan(medians), 0.0, medians)  # columns with no finite value
        X = np.where(missing, medians, X)

    if has_nan:
        print("NaN values handled with median imputation.")
    if has_inf:
        print("Infinite values handled.")

    # Ensure we don't select more features than available
    n_features = min(n_features, X.shape[1])

    selector = SelectKBest(score_func=f_regression, k=n_features)
    X_selected = selector.fit_transform(X, y)
    selected_features = selector.get_support(indices=True)

    print(f"Feature selection complete. Selected {len(selected_features)} features.")
    return X_selected, selected_features

##  Level 1 Ensemble Models
Definition of diverse Level 1 models including LightGBM, XGBoost, Random Forest, Extra Trees, and Gradient Boosting with optimized hyperparameters.

In [None]:
def get_level1_models():
    """Build the Level 1 estimators, keyed by name.

    Returns:
        dict: name -> unfitted estimator, in a fixed insertion order. Names
        containing ``lgb`` / ``xgb`` are LightGBM / XGBoost models.
    """
    # (name, estimator class, constructor arguments), in output order.
    specs = [
        ('lgb_ultra', LGBMRegressor, dict(
            n_estimators=2000,
            learning_rate=0.01,
            num_leaves=127,
            max_depth=10,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.001,
            reg_lambda=0.001,
            min_child_samples=5,
            min_child_weight=0.1,
            bagging_freq=1,
            # NOTE(review): feature_fraction looks like a LightGBM alias of
            # colsample_bytree, so two different values are passed - confirm
            # which one is meant to apply.
            feature_fraction=0.9,
            random_state=42,
            verbose=-1,
        )),
        ('lgb_deep', LGBMRegressor, dict(
            n_estimators=1500,
            learning_rate=0.015,
            num_leaves=255,
            max_depth=12,
            subsample=0.85,
            colsample_bytree=0.85,
            reg_alpha=0.005,
            reg_lambda=0.005,
            min_child_samples=3,
            min_child_weight=0.05,
            random_state=123,
            verbose=-1,
        )),
        ('xgb_ultra', XGBRegressor, dict(
            n_estimators=1500,
            learning_rate=0.01,
            max_depth=10,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.001,
            reg_lambda=0.001,
            min_child_weight=0.1,
            gamma=0,
            random_state=42,
            verbosity=0,
            early_stopping_rounds=200,
        )),
        ('xgb_deep', XGBRegressor, dict(
            n_estimators=1200,
            learning_rate=0.015,
            max_depth=12,
            subsample=0.85,
            colsample_bytree=0.85,
            reg_alpha=0.005,
            reg_lambda=0.005,
            min_child_weight=0.05,
            gamma=0.1,
            random_state=123,
            verbosity=0,
            early_stopping_rounds=150,
        )),
        ('rf_ultra', RandomForestRegressor, dict(
            n_estimators=800,
            max_depth=15,
            min_samples_split=3,
            min_samples_leaf=1,
            max_features='sqrt',
            random_state=42,
            n_jobs=-1,
        )),
        ('et_ultra', ExtraTreesRegressor, dict(
            n_estimators=600,
            max_depth=12,
            min_samples_split=3,
            min_samples_leaf=1,
            max_features='sqrt',
            random_state=42,
            n_jobs=-1,
        )),
        ('gbr', GradientBoostingRegressor, dict(
            n_estimators=800,
            learning_rate=0.02,
            max_depth=8,
            subsample=0.8,
            random_state=42,
        )),
    ]
    return {name: estimator(**params) for name, estimator, params in specs}

##  Level 2 Meta-Learners and Training Pipeline
Level 2 meta-learners (Bayesian Ridge, Elastic Net, Ridge, LightGBM) and Level 1 model training with cross-validation.

In [None]:
def get_level2_models():
    """Build the Level 2 meta-learners, keyed by name.

    Returns:
        dict: name -> unfitted estimator (three linear models and one small
        LightGBM model), in a fixed insertion order.
    """
    meta_learners = {}

    # Linear meta-learners
    meta_learners['bayesian_ridge'] = BayesianRidge(
        alpha_1=1e-6, alpha_2=1e-6, lambda_1=1e-6, lambda_2=1e-6
    )
    meta_learners['elastic_net'] = ElasticNet(alpha=0.5, l1_ratio=0.3, random_state=42)
    meta_learners['ridge'] = Ridge(alpha=5.0, random_state=42)

    # Shallow gradient-boosted meta-learner
    lgb_meta_params = dict(
        n_estimators=200,
        learning_rate=0.05,
        num_leaves=15,
        max_depth=4,
        random_state=42,
        verbose=-1,
    )
    meta_learners['lgb_meta'] = LGBMRegressor(**lgb_meta_params)

    return meta_learners

def train_level1_models(X, y, X_test, n_splits=5):
    """Train Level 1 models and generate out-of-fold predictions.

    Each model is refitted on every fold. Its validation predictions fill
    one column of the out-of-fold matrix, and its test predictions are
    averaged over the folds. All predictions are floored at 0.1.

    Args:
        X: Training feature array, indexable by integer row arrays.
        y: Training targets, one per row of ``X``.
        X_test: Test feature array with the same columns as ``X``.
        n_splits: Number of cross-validation folds (default 5).

    Returns:
        tuple: ``(level1_train, level1_test)`` of shapes
        ``(len(y), n_models)`` and ``(len(X_test), n_models)``.
    """
    level1_models = get_level1_models()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    y = np.asarray(y)  # positional indexing below must not depend on a pandas index

    # Level 1 predictions
    level1_train = np.zeros((len(y), len(level1_models)))
    level1_test = np.zeros((len(X_test), len(level1_models)))

    print("Training Level 1 ultra models...")

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"Fold {fold+1}/{n_splits}")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        for idx, (name, model) in enumerate(level1_models.items()):
            # NOTE(review): the boosted models early-stop on the same fold
            # they are scored on, so their fold scores are likely optimistic.
            if 'lgb' in name:
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_val, y_val)],
                    eval_metric='rmse',
                    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(0)]
                )
            elif 'xgb' in name:
                # early_stopping_rounds is set on the estimator itself
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_val, y_val)],
                    verbose=False
                )
            else:  # RF, ET, GBR
                model.fit(X_train, y_train)

            # Predictions
            val_pred = np.maximum(model.predict(X_val), 0.1)
            level1_train[val_idx, idx] = val_pred
            # Each fold contributes an equal share of the test prediction
            level1_test[:, idx] += model.predict(X_test) / n_splits

            val_score = rmsle_score(y_val, val_pred)
            print(f"  {name}: {val_score:.5f}")

    # Ensure positive predictions
    level1_test = np.maximum(level1_test, 0.1)

    return level1_train, level1_test

##  Ensemble Training and Weighted Combination
Level 2 model training, a final blend in which each meta-learner is weighted in inverse proportion to its out-of-fold RMSLE, and the wrapper that runs the complete two-level pipeline.

In [None]:
def train_level2_models(level1_train, y, level1_test):
    """Fit each meta-learner on the Level 1 predictions and score it out-of-fold.

    Every meta-learner is cross-validated with 5 seeded folds to obtain an
    RMSLE score, then refitted on all rows to predict the test set.
    Predictions are floored at 0.1.

    Args:
        level1_train: Out-of-fold Level 1 predictions, one column per model.
        y: Training targets aligned with ``level1_train``.
        level1_test: Level 1 test predictions with the same columns.

    Returns:
        tuple: ``(level2_predictions, level2_scores)`` dicts keyed by
        meta-learner name.
    """
    meta_learners = get_level2_models()
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    print("\nTraining Level 2 meta-learners...")

    test_preds_by_model = {}
    oof_score_by_model = {}

    for name, meta in meta_learners.items():
        uses_early_stopping = 'lgb' in name
        oof = np.zeros(len(y))

        for fit_rows, holdout_rows in splitter.split(level1_train):
            fit_X, fit_y = level1_train[fit_rows], y[fit_rows]
            holdout_X, holdout_y = level1_train[holdout_rows], y[holdout_rows]

            if uses_early_stopping:
                meta.fit(
                    fit_X, fit_y,
                    eval_set=[(holdout_X, holdout_y)],
                    eval_metric='rmse',
                    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
                )
            else:
                meta.fit(fit_X, fit_y)

            oof[holdout_rows] = meta.predict(holdout_X)

        oof = np.maximum(oof, 0.1)
        oof_score = rmsle_score(y, oof)
        oof_score_by_model[name] = oof_score

        # Refit on every row before predicting the test set
        meta.fit(level1_train, y)
        test_preds_by_model[name] = np.maximum(meta.predict(level1_test), 0.1)

        print(f"{name}: {oof_score:.5f}")

    return test_preds_by_model, oof_score_by_model

def create_final_ensemble(level2_predictions, level2_scores):
    """Blend the Level 2 predictions, weighting each model by 1 / score.

    Weights are looked up by model name, so the two dicts do not need to
    share the same key order. They are normalised to sum to 1 over the
    models present in ``level2_predictions``.

    Args:
        level2_predictions: dict of model name -> 1-D prediction array.
        level2_scores: dict of model name -> error score (lower is better).

    Returns:
        tuple: ``(final_prediction, best_score)`` - the weighted blend and
        the lowest score in ``level2_scores``.

    Raises:
        ValueError: If ``level2_predictions`` is empty.
        KeyError: If a predicted model has no entry in ``level2_scores``.
    """
    if not level2_predictions:
        raise ValueError("level2_predictions must contain at least one model")

    # Ensemble of level 2 models (weighted by performance); the small
    # epsilon guards against division by a zero score.
    inverse_scores = {
        name: 1.0 / (level2_scores[name] + 1e-8) for name in level2_predictions
    }
    total_inverse = sum(inverse_scores.values())

    first_pred = next(iter(level2_predictions.values()))
    final_prediction = np.zeros(len(first_pred))
    for name, pred in level2_predictions.items():
        weight = inverse_scores[name] / total_inverse
        final_prediction += np.asarray(pred, dtype=float) * weight
        print(f"Final weight {name}: {weight:.3f}")

    best_score = min(level2_scores.values())
    return final_prediction, best_score

def train_ultra_ensemble(X, y, X_test):
    """Run the full stacking pipeline: Level 1, Level 2, then the final blend.

    Returns:
        tuple: ``(final_prediction, best_score)`` as produced by
        ``create_final_ensemble``.
    """
    # Level 1: out-of-fold and test predictions from the base models
    oof_level1, test_level1 = train_level1_models(X, y, X_test)

    # Level 2: meta-learners trained on the Level 1 predictions
    meta_predictions, meta_scores = train_level2_models(oof_level1, y, test_level1)

    # Final blend of the meta-learners
    return create_final_ensemble(meta_predictions, meta_scores)

##  Main Execution Pipeline
Complete orchestration pipeline that coordinates data loading, feature engineering, target encoding, feature selection, scaling, and ensemble training.

In [None]:
def main():
    """Run the whole pipeline from CSV loading to ensemble predictions.

    Steps: load data, engineer features, target-encode the binned columns,
    select features, quantile-transform them and train the stacked ensemble.

    Returns:
        tuple: ``(train, test, predictions, cv_score)`` - the raw frames as
        loaded, the test predictions, and the best Level 2 score.
    """
    banner = "=" * 60
    print(banner)
    print("ULTRA ADVANCED SUB-0.05 SOLUTION")
    print("Target: RMSLE < 0.05")
    print(banner)

    # Load data
    train, test = load_data()

    # Complete feature engineering
    print("\nApplying complete feature engineering...")
    train_fe = complete_feature_engineering(train, is_test=False)
    test_fe = complete_feature_engineering(test, is_test=True)

    # Advanced target encoding
    print("\nApplying advanced target encoding...")
    cat_cols = [
        'Age_Group', 'Weight_Group', 'Duration_Group', 'HR_Group', 'BMI_Group',
        'HR_Zone_Fine', 'Temp_Category', 'Age_Weight_Group', 'Age_Duration_Group',
    ]
    train_fe, test_fe = create_advanced_target_encoding(train_fe, test_fe, cat_cols, 'Calories')

    # Prepare features: everything except the identifier, the target and raw 'Sex'
    excluded = ['id', 'Calories', 'Sex']
    feature_cols = [name for name in train_fe.columns if name not in excluded]

    X = train_fe[feature_cols].values
    y = train_fe['Calories'].values
    X_test = test_fe[feature_cols].values

    print(f"\nTotal features before selection: {len(feature_cols)}")

    # Feature selection; the same column indices are applied to the test matrix
    X_selected, selected_indices = select_best_features(X, y, n_features=120)
    X_test_selected = X_test[:, selected_indices]

    print(f"Selected features: {len(selected_indices)}")

    # Advanced scaling: fitted on train only, then applied to test
    print("\nApplying quantile transformation...")
    transformer = QuantileTransformer(n_quantiles=2000, random_state=42)
    X_scaled = transformer.fit_transform(X_selected)
    X_test_scaled = transformer.transform(X_test_selected)

    # Train ultra ensemble
    print("\nTraining ultra ensemble...")
    predictions, cv_score = train_ultra_ensemble(X_scaled, y, X_test_scaled)

    return train, test, predictions, cv_score

## ▶ Execute the Solution
Run the complete pipeline to generate predictions and evaluate performance.

In [None]:
# Execute the main pipeline.
# main() returns the raw train/test frames as loaded (not the engineered
# ones), the test-set predictions and the best Level 2 score.
train_data, test_data, predictions, cv_score = main()

ULTRA ADVANCED SUB-0.05 SOLUTION
Target: RMSLE < 0.05
Training data shape: (750000, 9)
Test data shape: (250000, 8)

Applying complete feature engineering...
Starting feature engineering... Input shape: (750000, 9)
Checking for NaN and infinite values...
Feature engineering complete. Output shape: (750000, 80)
Starting feature engineering... Input shape: (250000, 8)
Checking for NaN and infinite values...
Feature engineering complete. Output shape: (250000, 79)

Applying advanced target encoding...
Creating target encoding for 9 categorical columns...
Target encoding complete!

Total features before selection: 104
Selecting top 120 features from 104 total features...
NaN values handled with median imputation.
Feature selection complete. Selected 104 features.
Selected features: 104

Applying quantile transformation...

Training ultra ensemble...
Training Level 1 ultra models...
Fold 1/5
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[15

##  Results Analysis and Submission Creation
Create the submission file, display comprehensive results, and analyze model performance against targets.

In [None]:
# Build the submission frame (test ids + predicted calories) and write it out
submission = pd.DataFrame({'id': test_data['id'], 'Calories': predictions})
submission.to_csv("submission.csv", index=False)

# Report the CV score against the previous best (0.06305) and the 0.05 target
rule = "=" * 60
improvement = 0.06305 - cv_score

print(f"\n" + rule)
print("ULTRA ADVANCED RESULTS")
print(rule)
print(f"Final CV Score: {cv_score:.5f}")
print(f"Previous best: 0.06305")
print(f"Target: < 0.05000")

# Pick the verdict line by score band, then print it once
if cv_score < 0.05:
    verdict = f"🏆 TARGET ACHIEVED! Improvement: {improvement:.5f}"
elif cv_score < 0.055:
    verdict = f"🥇 EXCELLENT! Improvement: {improvement:.5f}"
else:
    verdict = f"📈 Progress: {improvement:.5f}"
print(verdict)

print(f"\nPredictions range: {predictions.min():.2f} to {predictions.max():.2f}")
print(f"Mean prediction: {predictions.mean():.2f}")
print(rule)

print(f"\nSubmission file created: submission.csv")
print(f"Submission shape: {submission.shape}")
submission.head()

##  Visualization and Statistical Analysis
Optional detailed analysis with prediction distributions, comparisons with training data, and comprehensive statistical summaries.

In [None]:
# Optional: Additional analysis and visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Prediction distribution analysis: three panels on one figure
train_target = train_data['Calories']
fig = plt.figure(figsize=(15, 5))

# Panel 1: histogram of the predictions alone
ax_pred = fig.add_subplot(1, 3, 1)
ax_pred.hist(predictions, bins=50, alpha=0.7, edgecolor='black')
ax_pred.set_title('Prediction Distribution')
ax_pred.set_xlabel('Predicted Calories')
ax_pred.set_ylabel('Frequency')

# Panel 2: training target and predictions overlaid
ax_compare = fig.add_subplot(1, 3, 2)
ax_compare.hist(train_target, bins=50, alpha=0.7, edgecolor='black', label='Train')
ax_compare.hist(predictions, bins=50, alpha=0.7, edgecolor='black', label='Predictions')
ax_compare.set_title('Train vs Predictions Distribution')
ax_compare.set_xlabel('Calories')
ax_compare.set_ylabel('Frequency')
ax_compare.legend()

# Panel 3: predictions in submission order
ax_index = fig.add_subplot(1, 3, 3)
ax_index.scatter(range(len(predictions)), predictions, alpha=0.6)
ax_index.set_title('Prediction Index vs Value')
ax_index.set_xlabel('Sample Index')
ax_index.set_ylabel('Predicted Calories')

fig.tight_layout()
plt.show()

# Summary statistics of the predictions
print("\nPrediction Summary Statistics:")
print(f"Min: {predictions.min():.2f}")
print(f"Max: {predictions.max():.2f}")
print(f"Mean: {predictions.mean():.2f}")
print(f"Median: {np.median(predictions):.2f}")
print(f"Std: {predictions.std():.2f}")

# Same statistics for the training target, for comparison
print("\nTrain Target Summary Statistics:")
print(f"Min: {train_target.min():.2f}")
print(f"Max: {train_target.max():.2f}")
print(f"Mean: {train_target.mean():.2f}")
print(f"Median: {train_target.median():.2f}")
print(f"Std: {train_target.std():.2f}")