# In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import HistGradientBoostingRegressor
import warnings
warnings.filterwarnings('ignore')

# Read the test frame and the submission template from the Kaggle input
# directory.
data_path = '/kaggle/input/nk-iv-prediction'
test = pd.read_parquet(f'{data_path}/test_data.parquet')
sample_submission = pd.read_csv(f'{data_path}/sample_submission.csv')
print(f"✓ Data loaded successfully - Test: {test.shape}, Sample: {sample_submission.shape}")

# Every template column other than 'timestamp' is treated as a target.
target_columns = [name for name in sample_submission.columns if name != 'timestamp']

# Start from a copy of the template and overwrite each target with the
# matching test column; targets the test frame does not have become all-NaN.
submission = sample_submission.copy()
for col in target_columns:
    if col in test.columns:
        submission[col] = test[col]
    else:
        submission[col] = np.nan
print(f"✓ Submission prepared with {len(target_columns)} target columns")

# The imputers are fed the targets and, when the test frame provides it, the
# 'underlying' column appended as the LAST column (later slicing relies on
# that position).
all_imputation_cols = target_columns
if 'underlying' in test.columns:
    submission['underlying'] = test['underlying']
    all_imputation_cols = target_columns + ['underlying']
    print("✓ Added underlying column as predictor")

# IterativeImputer settings that are identical for both models; only the
# wrapped estimator and the random seed differ.
shared_imputer_options = dict(
    max_iter=65,
    tol=1e-7,
    n_nearest_features=25,
    initial_strategy='median',
    imputation_order='ascending',
    verbose=2,
)

# Model 1: HistGradientBoosting
print("Setting up Model 1: Enhanced HistGradientBoosting...")
boosting_estimator = HistGradientBoostingRegressor(
    max_iter=450,
    max_depth=16,
    learning_rate=0.08,
    early_stopping=True,
    l2_regularization=0.03,
    random_state=42,
)
imputer1 = IterativeImputer(
    estimator=boosting_estimator,
    random_state=42,
    **shared_imputer_options,
)
print("✓ Model 1 configured: HistGradientBoosting with 450 iter, depth 16")

# Model 2: BayesianRidge
print("Setting up Model 2: Enhanced BayesianRidge...")
ridge_estimator = BayesianRidge(
    alpha_1=5e-7,
    alpha_2=5e-7,
    lambda_1=5e-7,
    lambda_2=5e-7,
    compute_score=True,
)
imputer2 = IterativeImputer(
    estimator=ridge_estimator,
    random_state=43,
    **shared_imputer_options,
)
print("✓ Model 2 configured: BayesianRidge with enhanced precision")

# Fit both imputers on the leading rows only, then impute over the full frame.
fit_subset_size = 8500
fit_subset = submission[all_imputation_cols].head(fit_subset_size).copy()

imputer1.fit(fit_subset)
imputer2.fit(fit_subset)

# Generate predictions
imputed1 = imputer1.transform(submission[all_imputation_cols])
imputed2 = imputer2.transform(submission[all_imputation_cols])

# When 'underlying' was appended as a predictor it is the last column of the
# imputed arrays; strip it so only the target columns remain.
has_underlying = 'underlying' in all_imputation_cols
pred1 = imputed1[:, :-1] if has_underlying else imputed1
pred2 = imputed2[:, :-1] if has_underlying else imputed2

print(f"✓ Prediction arrays ready: {pred1.shape}")

# Test weights
# Candidate (model 1, model 2) blend weights, tried in order.
weight_combinations = [
    (0.67, 0.33),
    (0.69, 0.31),
    (0.65, 0.35),
    (0.71, 0.29),
]

# Bounds applied to every target value before scoring and saving.
CLIP_LOWER, CLIP_UPPER = 0.015, 2.8

# Only targets that also exist in the test frame can be preserved and scored.
scorable_columns = [col for col in target_columns if col in test.columns]

# Selection state. It has to exist before the loop: the first comparison
# reads best_mse, and the save step reads best_result even when no
# combination could be scored.
best_mse = float('inf')
best_result = None
best_weights = None

for w1, w2 in weight_combinations:
    test_submission = submission.copy()
    ensemble_result = w1 * pred1 + w2 * pred2
    test_submission[target_columns] = ensemble_result

    # Preserve existing values: cells already known in the test frame are
    # never replaced by a blended prediction.
    for col in scorable_columns:
        known = test[col].notna()
        if known.any():
            test_submission.loc[known, col] = test.loc[known, col]

    # Conservative post-processing
    test_submission[target_columns] = test_submission[target_columns].clip(
        CLIP_LOWER, CLIP_UPPER
    )
    test_submission = test_submission.drop_duplicates(subset=['timestamp'])

    # Calculate MSE over the cells that are known in the test frame.
    # The subtraction aligns on the index, so rows removed by drop_duplicates
    # and cells missing in the test frame come out as NaN; they are dropped
    # instead of poisoning the mean (a NaN score could never be selected).
    # NOTE(review): the known cells were overwritten with the test values just
    # above, so this score only measures how far clipping moved known values
    # and does not depend on (w1, w2). With the strict '<' below the first
    # pair therefore wins every tie. A meaningful comparison needs held-out
    # cells -- TODO confirm the intended validation scheme.
    squared_error_sum = 0.0
    scored_cells = 0
    for col in scorable_columns:
        squared_errors = ((test[col] - test_submission[col]) ** 2).dropna()
        squared_error_sum += float(squared_errors.sum())
        scored_cells += len(squared_errors)

    if scored_cells == 0:
        print("❌ No valid MSE calculation possible")
        continue

    current_mse = squared_error_sum / scored_cells
    if current_mse < best_mse:
        best_mse = current_mse
        best_result = test_submission[['timestamp'] + target_columns].copy()
        best_weights = (w1, w2)

# Save best result
if best_result is not None:
    best_result.to_csv('submission.csv', index=False)
    print("submission saved!")
else:
    print("❌ ERROR: No valid results generated")