In [57]:
import lightgbm
print(lightgbm.__version__)


4.6.0


In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_log_error
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import itertools
from scipy.optimize import minimize
import time
import warnings
warnings.filterwarnings('ignore')

In [21]:
train = pd.read_csv(r"C:\Users\Aditya P J\Documents\Competition\Kaggle\train.csv")
test = pd.read_csv(r"C:\Users\Aditya P J\Documents\Competition\Kaggle\test.csv")
submission = pd.read_csv(r"C:\Users\Aditya P J\Documents\Competition\Kaggle\sample_submission.csv")

# Define features
numerical_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
categorical_features = ['Sex']

# Print data information
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Label encode categorical features
le = LabelEncoder()
train['Sex'] = le.fit_transform(train['Sex'])
test['Sex'] = le.transform(test['Sex'])

# Transform target
train['Calories_log'] = np.log1p(train['Calories'])

Train shape: (750000, 9)
Test shape: (250000, 8)


In [79]:
# Feature engineering functions
def create_ratio_features(df):
    df = df.copy()
    # BMI and related metrics
    df['BMI'] = df['Weight'] / ((df['Height']/100) ** 2)
    df['BMI_Prime'] = df['BMI'] / 25.0  # Normalized BMI
    
    # Exercise intensity metrics
    df['Heart_Rate_per_kg'] = df['Heart_Rate'] / df['Weight']
    df['Duration_per_HR'] = df['Duration'] / df['Heart_Rate']
    
    # Energy expenditure related
    df['HR_Weight_Ratio'] = df['Heart_Rate'] / df['Weight']
    df['Intensity_Factor'] = df['Heart_Rate'] * df['Duration'] / 60
    df['Temp_HR_Ratio'] = df['Body_Temp'] / df['Heart_Rate']
    
    # Special exercise-specific formulas
    df['Energy_Index'] = df['Heart_Rate'] * df['Duration'] * df['Weight'] / 10000
    df['Calorie_Estimator'] = ((0.2 * df['Heart_Rate']) + (0.1 * df['Weight']) + (0.05 * df['Duration'])) * 5
    
    # NEW: Karvonen formula related features
    df['Max_HR'] = 220 - df['Age']
    df['HR_Reserve'] = df['Max_HR'] - 60  # Assuming resting HR of 60
    df['HR_Reserve_Used'] = (df['Heart_Rate'] - 60) / df['HR_Reserve']
    
    # NEW: Age adjusted features
    df['Age_Adjusted_HR'] = df['Heart_Rate'] / (220 - df['Age'])
    df['Age_Weight_Interaction'] = df['Age'] * df['Weight'] / 100
    
    # NEW: Metabolic features
    df['MET_estimate'] = 3.5 + (df['Heart_Rate'] - 60) * 0.1
    df['Est_VO2'] = (df['Heart_Rate'] / df['Max_HR']) * 100
    
    return df
    
def add_interaction_features(df, features):
    df_new = df.copy()
    # Create all pairwise feature interactions (limited to most meaningful ones)
    important_pairs = [
        ('Duration', 'Heart_Rate'),
        ('Weight', 'Heart_Rate'),
        ('Duration', 'Weight'),
        ('Age', 'Heart_Rate'),
        ('Height', 'Weight'),
        ('Body_Temp', 'Heart_Rate'),
        ('Age', 'Weight'),
        ('Duration', 'Body_Temp')
    ]
    
    for f1, f2 in important_pairs:
        # Multiplication (most important)
        df_new[f"{f1}_x_{f2}"] = df_new[f1] * df_new[f2]
        
        # Addition
        df_new[f"{f1}_plus_{f2}"] = df_new[f1] + df_new[f2]
        
        # Division (both ways)
        df_new[f"{f1}_div_{f2}"] = df_new[f1] / (df_new[f2] + 1e-5)
        df_new[f"{f2}_div_{f1}"] = df_new[f2] / (df_new[f1] + 1e-5)
        
        # NEW: Square root of product
        df_new[f"sqrt_{f1}_x_{f2}"] = np.sqrt(df_new[f1] * df_new[f2] + 1e-5)
        
        # NEW: Log of product
        df_new[f"log_{f1}_x_{f2}"] = np.log1p(df_new[f1] * df_new[f2])
    
    return df_new

def add_statistical_features(df, features):
    df_new = df.copy()
    df_new["row_mean"] = df[features].mean(axis=1)
    df_new["row_std"] = df[features].std(axis=1)
    df_new["row_max"] = df[features].max(axis=1)
    df_new["row_min"] = df[features].min(axis=1)
    df_new["row_range"] = df_new["row_max"] - df_new["row_min"]
    
    # Calculate mean absolute deviation manually
    df_new["row_mad"] = df[features].sub(df[features].mean(axis=1), axis=0).abs().mean(axis=1)
    
    # NEW: Coefficient of variation
    df_new["row_cv"] = df_new["row_std"] / (df_new["row_mean"] + 1e-5)
    
    # NEW: Z-score for each feature
    for feat in features:
        df_new[f"{feat}_zscore"] = (df[feat] - df[feat].mean()) / (df[feat].std() + 1e-5)
    
    return df_new

def add_exercise_features(df):
    df = df.copy()
    
    # Exercise science formulas
    df['MET_estimate'] = 3.5 + (df['Heart_Rate'] - 60) * 0.1
    df['HR_reserve_pct'] = (df['Heart_Rate'] - 60) / ((220 - df['Age']) - 60)
    
    # Advanced intensity metrics
    df['sqrt_Duration'] = np.sqrt(df['Duration'])
    df['log_Weight'] = np.log1p(df['Weight'])
    df['log_Duration'] = np.log1p(df['Duration'])
    
    # Non-linear Heart Rate transformations
    df['HR_squared'] = df['Heart_Rate'] ** 2
    df['HR_cubed'] = df['Heart_Rate'] ** 3
    df['HR_sqrt'] = np.sqrt(df['Heart_Rate'])
    
    # Cardiovascular load metrics
    df['CV_Load'] = df['Heart_Rate'] * df['Duration'] / 1000
    df['CV_Load_Weight_Adjusted'] = df['CV_Load'] / df['Weight']
    
    # NEW: Power features
    df['Power_estimate'] = df['Weight'] * df['Heart_Rate'] * df['Duration'] / 10000
    
    # NEW: Exponential transformations
    df['exp_HR_scaled'] = np.exp(df['Heart_Rate']/100) - 1
    df['exp_Duration_scaled'] = np.exp(df['Duration']/100) - 1
    
    # NEW: Fatigue features
    df['Fatigue_Index'] = df['Duration'] * (df['Heart_Rate'] / (220 - df['Age']))
    
    # NEW: Intensity zones (based on HR zones)
    df['HR_pct_max'] = df['Heart_Rate'] / (220 - df['Age'])
    
    # NEW: Physiological cost index
    df['PCI'] = df['Heart_Rate'] / df['Duration']
    
    return df


def add_target_encoding(X_train, X_test, y, cat_cols=['Sex']):
    X_train_enc = X_train.copy()
    X_test_enc = X_test.copy()
    
    # Use stratified KFold for more robust encoding
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    
    # Process each categorical column
    for col in cat_cols:
        if col in X_train.columns:
            # Initialize temporary array for out-of-fold predictions
            X_train_enc[f'{col}_target_mean'] = np.zeros(len(X_train))
            
            # Global target mean (for handling unseen categories)
            global_mean = y.mean()
            
            # Dictionary to store encodings for test data
            encoding_dict = {}
            
            # Cross-validation loop
            for train_idx, val_idx in kf.split(X_train):
                # Get target means from training fold
                target_means = y.iloc[train_idx].groupby(X_train[col].iloc[train_idx]).mean()
                
                # Apply to validation fold
                for category, mean_value in target_means.items():
                    val_indices = val_idx[X_train[col].iloc[val_idx] == category]
                    X_train_enc.loc[val_indices, f'{col}_target_mean'] = mean_value
                    
                    # Update dictionary for test set
                    if category in encoding_dict:
                        encoding_dict[category] = (encoding_dict[category] + mean_value) / 2
                    else:
                        encoding_dict[category] = mean_value
            
            # Apply encodings to test data
            X_test_enc[f'{col}_target_mean'] = X_test[col].map(encoding_dict).fillna(global_mean)
            
            # NEW: Add variance of target for each category
            target_vars = y.groupby(X_train[col]).var().to_dict()
            X_train_enc[f'{col}_target_var'] = X_train[col].map(target_vars).fillna(y.var())
            X_test_enc[f'{col}_target_var'] = X_test[col].map(target_vars).fillna(y.var())
    
    return X_train_enc, X_test_enc

def drop_highly_correlated(df, threshold=0.995):
    # Find and drop highly correlated features
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > threshold)]
    return df.drop(columns=to_drop), to_drop

# Create polynomial features
def add_polynomial_features(X_train, X_test, features, degree=2):
    poly = PolynomialFeatures(degree=degree, interaction_only=True, include_bias=False)
    
    # Fit on training data
    poly_features_train = poly.fit_transform(X_train[features])
    feature_names = poly.get_feature_names_out(features)
    
    # Create DataFrame with new features
    poly_train = pd.DataFrame(poly_features_train, columns=feature_names)
    
    # Apply to test data
    poly_features_test = poly.transform(X_test[features])
    poly_test = pd.DataFrame(poly_features_test, columns=feature_names)
    
    # Remove original features to avoid duplication
    poly_train = poly_train[[col for col in poly_train.columns if col not in features]]
    poly_test = poly_test[[col for col in poly_test.columns if col not in features]]
    
    # Concatenate with original dataframes
    X_train_poly = pd.concat([X_train.reset_index(drop=True), poly_train.reset_index(drop=True)], axis=1)
    X_test_poly = pd.concat([X_test.reset_index(drop=True), poly_test.reset_index(drop=True)], axis=1)
    
    return X_train_poly, X_test_poly

def add_cluster_features(X_train, X_test, n_clusters=5):
    from sklearn.cluster import KMeans
    
    # Select subset of features for clustering
    cluster_features = ['Age', 'Weight', 'Heart_Rate', 'Duration', 'Body_Temp']
    available_features = [f for f in cluster_features if f in X_train.columns]
    
    if len(available_features) < 3:
        return X_train, X_test
        
    # Standardize data for clustering
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train[available_features])
    test_scaled = scaler.transform(X_test[available_features])
    
    # Fit KMeans
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    train_clusters = kmeans.fit_predict(train_scaled)
    test_clusters = kmeans.predict(test_scaled)
    
    # Add cluster labels as features
    X_train_new = X_train.copy()
    X_test_new = X_test.copy()
    
    X_train_new['cluster'] = train_clusters
    X_test_new['cluster'] = test_clusters
    
    # Add distances to centroids
    distances = kmeans.transform(train_scaled)
    for i in range(n_clusters):
        X_train_new[f'cluster_dist_{i}'] = distances[:, i]
    
    distances = kmeans.transform(test_scaled)
    for i in range(n_clusters):
        X_test_new[f'cluster_dist_{i}'] = distances[:, i]
    
    # Create dummy variables for clusters
    train_dummies = pd.get_dummies(train_clusters, prefix='cluster')
    test_dummies = pd.get_dummies(test_clusters, prefix='cluster')
    
    # Ensure all clusters are represented in test set
    for i in range(n_clusters):
        if f'cluster_{i}' not in test_dummies.columns:
            test_dummies[f'cluster_{i}'] = 0
    
    X_train_new = pd.concat([X_train_new, train_dummies], axis=1)
    X_test_new = pd.concat([X_test_new, test_dummies[train_dummies.columns]], axis=1)
    
    return X_train_new, X_test_new
    
# Feature creation pipeline
def create_features(train_df, test_df):
    # Copy data
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    # 1. Add basic ratio features
    train_df = create_ratio_features(train_df)
    test_df = create_ratio_features(test_df)
    
    # 2. Add exercise-specific features
    train_df = add_exercise_features(train_df)
    test_df = add_exercise_features(test_df)
    
    # 3. Add interaction features
    train_df = add_interaction_features(train_df, numerical_features)
    test_df = add_interaction_features(test_df, numerical_features)
    
    # 4. Add statistical features
    train_df = add_statistical_features(train_df, numerical_features)
    test_df = add_statistical_features(test_df, numerical_features)
    
    # 5. Create polynomial features 
    train_poly, test_poly = add_polynomial_features(
        train_df.drop(columns=['id', 'Calories', 'Calories_log'] if 'Calories_log' in train_df.columns else ['id', 'Calories']), 
        test_df.drop(columns=['id']), 
        numerical_features, 
        degree=2
    )
    
    # 6. Add cluster features
    train_poly, test_poly = add_cluster_features(train_poly, test_poly, n_clusters=8)
    
    # 7. Prepare X and y
    X = train_poly
    y = train_df['Calories_log'] if 'Calories_log' in train_df.columns else np.log1p(train_df['Calories'])
    X_test = test_poly
    
    # 8. Add target encoding
    X, X_test = add_target_encoding(X, X_test, y, cat_cols=['Sex'])
    
    # 9. Drop highly correlated features
    X, dropped_cols = drop_highly_correlated(X, threshold=0.995)
    X_test = X_test.drop(columns=[col for col in dropped_cols if col in X_test.columns])
    
    # 10. Scale numerical features
    scaler = StandardScaler()
    numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
    
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
    
    return X, X_test, y

# Create features
X, X_test, y = create_features(train, test)

print(f"X shape: {X.shape}")
print(f"X_test shape: {X_test.shape}")

X shape: (750000, 75)
X_test shape: (250000, 75)


In [81]:
import numpy as np
import pandas as pd
import time
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostRegressor, Pool

# Load data - assumes X, y, X_test, and submission are already prepared
# If running this as standalone script, uncomment and modify these lines:
# X = pd.read_csv('train_features.csv')
# y = pd.read_csv('train_target.csv').squeeze()
# X_test = pd.read_csv('test_features.csv')
# submission = pd.read_csv('sample_submission.csv')

# Define model parameters
catboost_params = {
    'iterations': 2500,
    'learning_rate': 0.01,
    'depth': 10,
    'l2_leaf_reg': 3.0,
    'random_strength': 0.8,
    'bagging_temperature': 1.0,
    'od_type': 'Iter',
    'od_wait': 100,
    'boosting_type': 'Plain',  # Try 'Plain' instead of default
    'grow_policy': 'SymmetricTree',  # Better for regression tasks
    'verbose': 100,
    'random_seed': 42
}

# Number of folds for cross-validation
FOLDS = 10

# Create bins for stratification
bins = pd.qcut(y, 10, labels=False, duplicates='drop')

# Use StratifiedKFold for better distribution
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Initialize results
cat_oof = np.zeros(len(X))
cat_pred = np.zeros(len(X_test))
cat_rmsle = []

print("\n=== Training CatBoost ===")
model = CatBoostRegressor(**catboost_params)

# Train on each fold
for i, (train_idx, valid_idx) in enumerate(kf.split(X, bins)):
    print(f"\nFold {i+1}")
    x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
    
    # Handle duplicate columns
    x_train = x_train.loc[:, ~x_train.columns.duplicated()]
    x_valid = x_valid.loc[:, ~x_valid.columns.duplicated()]
    x_test = X_test.loc[:, ~X_test.columns.duplicated()].copy()
    
    start = time.time()
    
    model = CatBoostRegressor(**catboost_params)
    cat_features = ['Sex'] if 'Sex' in x_train.columns else None
    train_pool = Pool(x_train, y_train, cat_features=cat_features)
    valid_pool = Pool(x_valid, y_valid, cat_features=cat_features)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=100)
    
    # Make predictions
    oof_pred = model.predict(x_valid)
    test_pred = model.predict(x_test)
    
    # Store results
    cat_oof[valid_idx] = oof_pred
    cat_pred += test_pred / FOLDS
    
    # Calculate RMSLE
    rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(oof_pred)))
    cat_rmsle.append(rmsle)
    
    print(f"Fold {i+1} RMSLE: {rmsle:.6f}")
    print(f"Training time: {time.time() - start:.1f} sec")

# Calculate and display performance
mean_rmsle = np.mean(cat_rmsle)
std_rmsle = np.std(cat_rmsle)
print(f"\nCatBoost - Mean RMSLE: {mean_rmsle:.6f} ± {std_rmsle:.6f}")

# Save results
np.save('catboost_oof.npy', cat_oof)
np.save('catboost_pred.npy', cat_pred)

# Create a basic submission file with just CatBoost predictions
cat_submission = submission.copy()
cat_submission['Calories'] = np.clip(np.expm1(cat_pred), 1, 314)
cat_submission.to_csv('catboost_submission.csv', index=False)


=== Training CatBoost ===

Fold 1
0:	learn: 0.9538152	test: 0.9547609	best: 0.9547609 (0)	total: 461ms	remaining: 19m 12s
100:	learn: 0.3675573	test: 0.3682006	best: 0.3682006 (100)	total: 45.1s	remaining: 17m 51s
200:	learn: 0.1539684	test: 0.1548560	best: 0.1548560 (200)	total: 1m 30s	remaining: 17m 10s
300:	learn: 0.0837300	test: 0.0851460	best: 0.0851460 (300)	total: 2m 18s	remaining: 16m 50s
400:	learn: 0.0657986	test: 0.0675918	best: 0.0675918 (400)	total: 3m 6s	remaining: 16m 15s
500:	learn: 0.0616675	test: 0.0634993	best: 0.0634993 (500)	total: 3m 50s	remaining: 15m 18s
600:	learn: 0.0603512	test: 0.0621743	best: 0.0621743 (600)	total: 4m 35s	remaining: 14m 30s
700:	learn: 0.0597415	test: 0.0615706	best: 0.0615706 (700)	total: 5m 20s	remaining: 13m 42s
800:	learn: 0.0593708	test: 0.0612529	best: 0.0612529 (800)	total: 6m 4s	remaining: 12m 52s
900:	learn: 0.0590771	test: 0.0610475	best: 0.0610475 (900)	total: 6m 47s	remaining: 12m 2s
1000:	learn: 0.0588388	test: 0.0608930	best:

KeyboardInterrupt: 

In [None]:
# Training XGBoost model
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import numpy as np
import time

# Improved XGBoost params
xgb_params = {
    'max_depth': 10,
    'learning_rate': 0.01,
    'min_child_weight': 3,
    'gamma': 0.01,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.8,
    'reg_alpha': 0.05,
    'reg_lambda': 0.1,
    'tree_method': 'hist',
    'eval_metric': 'rmse',
    'objective': 'reg:squarederror',
    'seed': 42
}

FOLDS = 10
bins = pd.qcut(y, 20, labels=False, duplicates='drop')
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

xgb_oof = np.zeros(len(X))
xgb_pred = np.zeros(len(X_test))
xgb_rmsle = []

print("\n=== Training XGBoost ===")

for i, (train_idx, valid_idx) in enumerate(kf.split(X, bins)):
    print(f"\nFold {i+1}")
    x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    x_train = x_train.loc[:, ~x_train.columns.duplicated()]
    x_valid = x_valid.loc[:, ~x_valid.columns.duplicated()]
    x_test = X_test.loc[:, ~X_test.columns.duplicated()].copy()

    # Convert to DMatrix
    dtrain = xgb.DMatrix(data=x_train, label=y_train)
    dvalid = xgb.DMatrix(data=x_valid, label=y_valid)
    dtest = xgb.DMatrix(data=x_test)

    evals = [(dtrain, 'train'), (dvalid, 'eval')]

    start = time.time()
    model = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=3000,
        evals=evals,
        early_stopping_rounds=100,
        verbose_eval=100
    )

    oof_pred = model.predict(dvalid)
    test_pred = model.predict(dtest)

    xgb_oof[valid_idx] = oof_pred
    xgb_pred += test_pred / FOLDS

    rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(oof_pred)))
    xgb_rmsle.append(rmsle)

    print(f"Fold {i+1} RMSLE: {rmsle:.6f}")
    print(f"Training time: {time.time() - start:.1f} sec")

# Final evaluation
mean_rmsle = np.mean(xgb_rmsle)
print(f"\nXGBoost - Mean RMSLE: {mean_rmsle:.6f}")

# Save results
np.save('xgboost_oof.npy', xgb_oof)
np.save('xgboost_pred.npy', xgb_pred)

xgb_submission = submission.copy()
xgb_submission['Calories'] = np.clip(np.expm1(xgb_pred), 1, 314)
xgb_submission.to_csv('xgboost_submission.csv', index=False)

In [None]:
from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import StratifiedKFold
import numpy as np
import time

# Improved LightGBM params
lgb_params = {
    'n_estimators': 3000,
    'learning_rate': 0.01,
    'max_depth': 12,
    'num_leaves': 95,
    'min_child_samples': 20,
    'subsample': 0.85,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.05,
    'reg_lambda': 0.1,
    'n_jobs': -1,
    'verbose': -1,
    'random_state': 42
}

FOLDS = 10
bins = pd.qcut(y, 20, labels=False, duplicates='drop')
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

lgb_oof = np.zeros(len(X))
lgb_pred = np.zeros(len(X_test))
lgb_rmsle = []

print("\n=== Training LightGBM ===")

for i, (train_idx, valid_idx) in enumerate(kf.split(X, bins)):
    print(f"\nFold {i+1}")
    x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
    
    x_train = x_train.loc[:, ~x_train.columns.duplicated()]
    x_valid = x_valid.loc[:, ~x_valid.columns.duplicated()]
    x_test = X_test.loc[:, ~X_test.columns.duplicated()].copy()
    
    start = time.time()
    
    # Train model
    model = LGBMRegressor(**lgb_params)
    model.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        callbacks=[early_stopping(stopping_rounds=100), log_evaluation(100)]
    )
    
    # Make predictions
    oof_pred = model.predict(x_valid)
    test_pred = model.predict(x_test)
    
    # Store results
    lgb_oof[valid_idx] = oof_pred
    lgb_pred += test_pred / FOLDS
    
    # Calculate RMSLE
    rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(oof_pred)))
    lgb_rmsle.append(rmsle)
    
    print(f"Fold {i+1} RMSLE: {rmsle:.6f}")
    print(f"Training time: {time.time() - start:.1f} sec")

# Calculate and display performance
mean_rmsle = np.mean(lgb_rmsle)
std_rmsle = np.std(lgb_rmsle)
print(f"\nLightGBM - Mean RMSLE: {mean_rmsle:.6f} ± {std_rmsle:.6f}")

# Save results
np.save('lightgbm_oof.npy', lgb_oof)
np.save('lightgbm_pred.npy', lgb_pred)

# Create submission
lgb_submission = submission.copy()
lgb_submission['Calories'] = np.clip(np.expm1(lgb_pred), 1, 314)
lgb_submission.to_csv('lightgbm_submission.csv', index=False)

In [92]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
from scipy.optimize import minimize
from sklearn.linear_model import Ridge, Lasso, ElasticNet, HuberRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import StandardScaler

print("\n=== Creating Advanced Ensemble ===")

try:
    # Load saved predictions
    available_models = []
    try:
        cat_oof = np.load('catboost_oof.npy')
        available_models.append(('catboost', cat_oof))
        print("Loaded CatBoost OOF predictions")
    except FileNotFoundError:
        print("CatBoost OOF predictions not found")
    
    try:
        xgb_oof = np.load('xgboost_oof.npy')
        available_models.append(('xgboost', xgb_oof))
        print("Loaded XGBoost OOF predictions")
    except FileNotFoundError:
        print("XGBoost OOF predictions not found")
    
    try:
        lgb_oof = np.load('lightgbm_oof.npy')
        available_models.append(('lightgbm', lgb_oof))
        print("Loaded LightGBM OOF predictions")
    except FileNotFoundError:
        print("LightGBM OOF predictions not found")
    
    # Check if we have at least 2 models to ensemble
    if len(available_models) < 2:
        raise ValueError("Need at least 2 models for ensembling")
    
    # Load true target values
    try:
        y_true = np.load('true_target.npy')
        print("Loaded true target values")
    except FileNotFoundError:
        raise FileNotFoundError("True target values file not found")
    
    # Prepare data for ensemble
    model_names, oof_predictions = zip(*available_models)
    X_stack = np.column_stack(oof_predictions)
    
    # Define ensemble optimization function
    def objective(weights):
        combined_pred = np.average(X_stack, axis=1, weights=weights)
        return np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(combined_pred)))
    
    # Optimize weights
    initial_weights = np.ones(len(available_models)) / len(available_models)
    bounds = [(0, 1) for _ in range(len(available_models))]
    constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
    
    print("\nOptimizing ensemble weights...")
    result = minimize(
        objective,
        initial_weights,
        method='SLSQP',
        bounds=bounds,
        constraints=constraints,
        options={'disp': True}
    )
    
    optimal_weights = result.x
    print("\nOptimal weights:")
    for name, weight in zip(model_names, optimal_weights):
        print(f"{name}: {weight:.4f}")
    
    # Create weighted ensemble predictions
    ensemble_oof = np.average(X_stack, axis=1, weights=optimal_weights)
    ensemble_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(ensemble_oof)))
    print(f"\nEnsemble RMSLE: {ensemble_rmsle:.6f}")
    
    # Load test predictions
    test_predictions = []
    for name in model_names:
        try:
            pred = np.load(f'{name}_pred.npy')
            test_predictions.append(pred)
            print(f"Loaded {name} test predictions")
        except FileNotFoundError:
            print(f"{name} test predictions not found")
            test_predictions.append(None)
    
    # Create final ensemble prediction
    valid_test_preds = [pred for pred in test_predictions if pred is not None]
    if len(valid_test_preds) != len(optimal_weights):
        print("Warning: Not all test predictions available, using equal weights")
        optimal_weights = np.ones(len(valid_test_preds)) / len(valid_test_preds)
    
    ensemble_test_pred = np.average(np.column_stack(valid_test_preds), axis=1, weights=optimal_weights)
    
    # Create stacked features for meta-model
    print("\nCreating stacked features for meta-model...")
    X_meta_train = X_stack
    X_meta_test = np.column_stack(valid_test_preds)
    
    # Train meta-model (Stacking)
    meta_models = [
        ('ridge', Ridge(alpha=0.1, random_state=42)),
        ('lasso', Lasso(alpha=0.0005, random_state=42)),
        ('elasticnet', ElasticNet(alpha=0.0005, l1_ratio=0.7, random_state=42)),
        ('huber', HuberRegressor(epsilon=1.35, alpha=0.0005))
    ]
    
    stacking_regressor = StackingRegressor(
        estimators=meta_models,
        final_estimator=CatBoostRegressor(
            iterations=1000,
            learning_rate=0.01,
            depth=6,
            random_seed=42,
            verbose=0
        ),
        cv=5,
        n_jobs=-1
    )
    
    print("Training stacking regressor...")
    stacking_regressor.fit(X_meta_train, y_true)
    
    # Get stacking predictions
    stacking_oof = stacking_regressor.predict(X_meta_train)
    stacking_test_pred = stacking_regressor.predict(X_meta_test)
    
    stacking_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(stacking_oof)))
    print(f"Stacking RMSLE: {stacking_rmsle:.6f}")
    
    # Blend simple weighted average with stacking
    print("\nBlending weighted average and stacking...")
    blend_oof = 0.7 * ensemble_oof + 0.3 * stacking_oof
    blend_test_pred = 0.7 * ensemble_test_pred + 0.3 * stacking_test_pred
    
    blend_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_true), np.expm1(blend_oof)))
    print(f"Blended RMSLE: {blend_rmsle:.6f}")
    
    # Create final submission
    final_submission = submission.copy()
    final_submission['Calories'] = np.clip(np.expm1(blend_test_pred), 1, 314)
    final_submission.to_csv('final_ensemble_submission.csv', index=False)
    print("\nSaved final ensemble submission!")
    
    # Plot feature importance for stacking
    try:
        import matplotlib.pyplot as plt
        
        # Get feature importance from the meta-model
        if hasattr(stacking_regressor.final_estimator_, 'feature_importances_'):
            importance = stacking_regressor.final_estimator_.feature_importances_
        elif hasattr(stacking_regressor.final_estimator_, 'coef_'):
            importance = np.abs(stacking_regressor.final_estimator_.coef_)
        else:
            importance = None
            
        if importance is not None:
            plt.figure(figsize=(10, 6))
            plt.bar(range(len(importance)), importance)
            plt.xticks(range(len(model_names)), model_names, rotation=45)
            plt.title('Stacking Meta-Model Feature Importance')
            plt.tight_layout()
            plt.savefig('stacking_importance.png')
            plt.close()
            print("Saved stacking importance plot")
    except Exception as e:
        print(f"Could not create importance plot: {str(e)}")

except Exception as e:
    print(f"\nError in ensemble creation: {str(e)}")
    print("Creating simple average ensemble as fallback...")
    
    try:
        # Simple average fallback
        test_predictions = []
        for name in model_names:
            try:
                pred = np.load(f'{name}_pred.npy')
                test_predictions.append(pred)
            except:
                continue
        
        if len(test_predictions) > 0:
            avg_test_pred = np.mean(np.column_stack(test_predictions), axis=1)
            final_submission = submission.copy()
            final_submission['Calories'] = np.clip(np.expm1(avg_test_pred), 1, 314)
            final_submission.to_csv('simple_average_submission.csv', index=False)
            print("Saved simple average submission")
    except Exception as e2:
        print(f"Could not create fallback submission: {str(e2)}")

print("\n=== Ensemble Process Complete ===")


=== Creating Advanced Ensemble ===
Loaded CatBoost OOF predictions
Loaded XGBoost OOF predictions
Loaded LightGBM OOF predictions

Error in ensemble creation: True target values file not found
Creating simple average ensemble as fallback...
Could not create fallback submission: name 'model_names' is not defined

=== Ensemble Process Complete ===
