In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

print("=== 4-WAY ENSEMBLE WITH 10,000 WEIGHT COMBOS ===")

# --- Inputs -----------------------------------------------------------------
# `test` is the frame to be completed; `sample_submission` supplies the row
# layout and the list of columns that must appear in the output.
data_path = '/kaggle/input/nk-iv-prediction'
test = pd.read_parquet(f'{data_path}/test_data.parquet')
sample_submission = pd.read_csv(f'{data_path}/sample_submission.csv')

# An earlier submission, loaded so later steps can compare candidates to it.
provided_file_path = '/kaggle/input/prev/prev_best_submission.csv'
provided_data = pd.read_csv(provided_file_path)
print(f"✓ Loaded provided file: {provided_data.shape}")

# --- Submission skeleton ----------------------------------------------------
# Every template column except the 'timestamp' key is a target to fill.
target_columns = list(
    filter(lambda name: name != 'timestamp', sample_submission.columns)
)
submission = sample_submission.copy()
for col in target_columns:
    if col in test.columns:
        # Assignment aligns on the index, so the gaps in `test` are carried
        # over as NaN for the imputers to fill.
        submission[col] = test[col]
    else:
        # Target absent from the test frame: the whole column starts missing.
        submission[col] = np.nan

# 'underlying' is not a target; when present it is appended as the last
# imputation column purely to give the imputers extra context.
if 'underlying' not in test.columns:
    all_imputation_cols = target_columns
else:
    submission['underlying'] = test['underlying']
    all_imputation_cols = target_columns + ['underlying']
    print("✓ Added underlying for imputation context")

print(f"✓ Submission prepared: {submission.shape}")

def _build_imputer(estimator, seed):
    """Wrap *estimator* in an IterativeImputer with the settings all four models share.

    Only the per-column regressor and the imputer's own ``random_state``
    differ between the ensemble members; everything else is fixed here.
    """
    return IterativeImputer(
        estimator=estimator,
        max_iter=60,
        tol=1e-6,
        n_nearest_features=30,
        initial_strategy='median',
        imputation_order='ascending',
        random_state=seed,
        verbose=2,
    )


# 1. HistGradientBoosting
imputer_hist = _build_imputer(
    HistGradientBoostingRegressor(
        max_iter=400,
        max_depth=15,
        learning_rate=0.1,
        early_stopping=True,
        random_state=42,
    ),
    seed=42,
)

# 2. BayesianRidge
imputer_bayes = _build_imputer(
    BayesianRidge(
        alpha_1=1,
        alpha_2=1,
        lambda_1=1,
        lambda_2=1,
        compute_score=True,
    ),
    seed=43,
)

# 3. XGBoost
imputer_xgb = _build_imputer(
    XGBRegressor(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.8,
        tree_method='hist',
        random_state=42,
        verbosity=0,
    ),
    seed=44,
)

# 4. LightGBM
# NOTE(review): LightGBM documents that bagging_fraction only takes effect
# when bagging_freq is non-zero; no bagging_freq is set here, so row
# subsampling is presumably inactive — confirm whether that is intended.
imputer_lgb = _build_imputer(
    lgb.LGBMRegressor(
        n_estimators=1000,
        max_depth=6,
        learning_rate=0.1,
        num_leaves=63,
        min_data_in_leaf=50,
        lambda_l1=0.0,
        lambda_l2=0.1,
        max_bin=255,
        bagging_fraction=0.85,
        feature_fraction=0.85,
        verbose=-1,
        random_state=42,
    ),
    seed=45,
)

# TRAIN ALL 4 IMPUTERS
full_training_data = submission[all_imputation_cols].copy()

# IterativeImputer discards features that have no observed value at fit time,
# which would make the imputed arrays narrower than `all_imputation_cols` and
# break the positional column extraction below. Fail before the (expensive)
# fits instead of with a shape error afterwards.
_all_missing = full_training_data.isna().all(axis=0)
_empty_columns = _all_missing[_all_missing].index.tolist()
if _empty_columns:
    raise ValueError(
        "Cannot impute columns that have no observed values "
        f"(IterativeImputer would drop them): {_empty_columns}"
    )

imputer_hist.fit(full_training_data)
imputer_bayes.fit(full_training_data)
imputer_xgb.fit(full_training_data)
imputer_lgb.fit(full_training_data)

# GENERATE PREDICTIONS FROM ALL 4 IMPUTERS
# Each transform returns a 2-D array whose columns follow
# `all_imputation_cols`; `full_training_data` holds exactly those columns of
# `submission`, so it is reused rather than re-sliced for every model.

print("Generating HistGradientBoosting predictions...")
imputed_hist = imputer_hist.transform(full_training_data)
print("✓ HistGradientBoosting predictions generated")

print("Generating BayesianRidge predictions...")
imputed_bayes = imputer_bayes.transform(full_training_data)
print("✓ BayesianRidge predictions generated")

print("Generating XGBoost predictions...")
imputed_xgb = imputer_xgb.transform(full_training_data)
print("✓ XGBoost predictions generated")

print("Generating LightGBM predictions...")
imputed_lgb = imputer_lgb.transform(full_training_data)
print("✓ LightGBM predictions generated")

# Extract only target columns (exclude underlying).
# The targets are the leading columns of `all_imputation_cols` and
# 'underlying', when added, is last — so slicing by the number of targets
# covers both cases and does not depend on any column's name.
n_targets = len(target_columns)
pred_hist = imputed_hist[:, :n_targets]
pred_bayes = imputed_bayes[:, :n_targets]
pred_xgb = imputed_xgb[:, :n_targets]
pred_lgb = imputed_lgb[:, :n_targets]

print("✓ All predictions extracted (target columns only)")

# GENERATE 10,000 RANDOM WEIGHT COMBINATIONS
# Each row of `weights` holds four non-negative weights that sum to 1, drawn
# from a flat Dirichlet; the global NumPy seed makes the draw reproducible.
print("\n=== GENERATING 10,000 RANDOM WEIGHT COMBINATIONS ===")
np.random.seed(42)
weights = np.random.dirichlet(np.ones(4), size=10000)
print("✓ 10,000 weight combinations generated using Dirichlet distribution")

# TEST ALL 10,000 WEIGHT COMBINATIONS
print(f"\n=== TESTING ALL 10,000 WEIGHT COMBINATIONS ===")
print("Format: HistGrad, Bayes, XGBoost, LightGBM")


def _blend(w_hist, w_bayes, w_xgb, w_lgb, p_hist, p_bayes, p_xgb, p_lgb):
    """Return the weighted sum of the four prediction arrays."""
    return w_hist * p_hist + w_bayes * p_bayes + w_xgb * p_xgb + w_lgb * p_lgb


def _finalize(blend, observed_mask, observed_values):
    """Restore observed cells exactly, then clip every cell to [0.01, 3.0]."""
    return np.clip(np.where(observed_mask, observed_values, blend), 0.01, 3.0)


# --- Work that does not depend on the weights (done once) ------------------

# Observed values from `test`, laid out on the submission's index. Using the
# same label-based assignment as a per-candidate overwrite would keeps the
# index-alignment behaviour unchanged.
observed_frame = pd.DataFrame(np.nan, index=submission.index, columns=target_columns)
values_preserved = 0
for col in target_columns:
    if col in test.columns:
        existing_mask = ~test[col].isna()
        if existing_mask.sum() > 0:
            observed_frame.loc[existing_mask, col] = test.loc[existing_mask, col]
            values_preserved += existing_mask.sum()
observed_values = observed_frame.to_numpy(dtype=float)
observed_mask = ~np.isnan(observed_values)

# Row pairing between a candidate (first occurrence of each timestamp, as
# drop_duplicates would keep) and the provided file. The pairing depends only
# on the timestamp columns, so one inner merge replaces one merge per column
# per weight combination.
scored_positions = [
    pos for pos, col in enumerate(target_columns) if col in provided_data.columns
]
if scored_positions:
    scored_columns = [target_columns[pos] for pos in scored_positions]
    first_seen = ~submission.duplicated(subset=['timestamp']).to_numpy()
    left_keys = submission.loc[first_seen, ['timestamp']].copy()
    left_keys['_pred_row'] = np.flatnonzero(first_seen)
    right_keys = provided_data[['timestamp']].copy()
    right_keys['_ref_row'] = np.arange(len(provided_data))
    row_pairs = pd.merge(left_keys, right_keys, on='timestamp')
    pred_rows = row_pairs['_pred_row'].to_numpy()
    reference = provided_data[scored_columns].to_numpy(dtype=float)[
        row_pairs['_ref_row'].to_numpy()
    ]
else:
    # Nothing to compare against: every candidate scores +inf below.
    pred_rows = np.empty(0, dtype=np.intp)
    reference = np.empty((0, 0))

# Only the paired rows / shared columns take part in scoring, so slice the
# model outputs and the observed-value overlay down to those cells up front.
scored_cells = np.ix_(pred_rows, scored_positions)
scored_hist = pred_hist[scored_cells]
scored_bayes = pred_bayes[scored_cells]
scored_xgb = pred_xgb[scored_cells]
scored_lgb = pred_lgb[scored_cells]
scored_mask = observed_mask[scored_cells]
scored_observed = observed_values[scored_cells]

# --- Search ------------------------------------------------------------------

best_mse = float('inf')
best_result = None
best_weights = None
best_combo_id = None
comparisons_made = 0

for i, w in enumerate(weights, 1):
    weight_hist, weight_bayes, weight_xgb, weight_lgb = w

    # Candidate values on the scored cells, post-processed exactly like the
    # final submission (observed values restored, then clipped).
    candidate = _finalize(
        _blend(weight_hist, weight_bayes, weight_xgb, weight_lgb,
               scored_hist, scored_bayes, scored_xgb, scored_lgb),
        scored_mask,
        scored_observed,
    )

    # Mean squared difference to the provided file over cells where both
    # sides are present (a NaN on either side makes the squared error NaN).
    squared_error = (reference - candidate) ** 2
    comparable = ~np.isnan(squared_error)
    comparisons_made = int(comparable.sum())
    current_mse = squared_error[comparable].mean() if comparisons_made else float('inf')

    # Track best result (strict '<' keeps the earliest combination on ties)
    if current_mse < best_mse:
        best_mse = current_mse
        best_weights = (weight_hist, weight_bayes, weight_xgb, weight_lgb)
        best_combo_id = i

# Materialise the full submission frame only for the winning weights:
# blend -> restore observed values -> clip -> drop duplicate timestamps.
if best_weights is not None:
    best_result = submission[['timestamp'] + target_columns].copy()
    best_result[target_columns] = _finalize(
        _blend(*best_weights, pred_hist, pred_bayes, pred_xgb, pred_lgb),
        observed_mask,
        observed_values,
    )
    best_result = best_result.drop_duplicates(subset=['timestamp'])

# SAVE BEST RESULT
if best_result is not None:
    best_result.to_csv('submission.csv', index=False)