In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

# Load the test data and the submission template.
test = pd.read_parquet('test_data.parquet')
sample_submission = pd.read_csv('sample_submission.csv')

# Every template column except 'timestamp' is a target to be filled in.
target_columns = [col for col in sample_submission.columns if col != 'timestamp']
submission = sample_submission.copy()
for col in target_columns:
    # Targets that do not exist in the test data start out entirely missing.
    # NOTE(review): assigning a Series aligns on index labels, so this assumes
    # `test` and `sample_submission` share the same index - confirm.
    submission[col] = test[col] if col in test.columns else np.nan

# Optionally include 'underlying' as an extra column for the imputer.
# Start from a copy so the list of targets is never mutated, and only append
# 'underlying' when it is not already a target; otherwise the same column
# would be selected twice and passed to the imputer as a duplicated feature.
all_imputation_cols = list(target_columns)
if 'underlying' in test.columns:
    submission['underlying'] = test['underlying']
    if 'underlying' not in all_imputation_cols:
        all_imputation_cols.append('underlying')

# Gradient-boosted tree regressor handed to the imputer as its per-feature
# estimator.
xgb_estimator = XGBRegressor(
    tree_method='hist',
    n_estimators=200,
    learning_rate=0.07,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=0,
)

# Iterative imputer built around the XGBoost regressor: missing entries are
# first filled with the column median, then re-estimated for up to 50 rounds
# using at most 20 other features per column.
imputer_xgb = IterativeImputer(
    estimator=xgb_estimator,
    initial_strategy='median',
    imputation_order='ascending',
    n_nearest_features=20,
    max_iter=50,
    tol=1e-6,
    random_state=42,
    verbose=2,
)

# Fit the imputer and impute the same data in one pass. fit_transform leaves
# `imputer_xgb` fitted exactly as fit() would, without the extra prediction
# pass that a separate transform() call on the training data performs.
imputation_input = submission[all_imputation_cols]
imputed_xgb = imputer_xgb.fit_transform(imputation_input)

# By default IterativeImputer discards columns that had no observed value at
# fit time, so the output can be narrower than the input. Target columns come
# first in `all_imputation_cols` and column order is preserved, so the
# retained targets occupy the leading output columns.
n_targets = len(target_columns)
if imputed_xgb.shape[1] == len(all_imputation_cols):
    # Nothing was dropped: every target has an imputed column.
    filled_targets = list(target_columns)
else:
    has_observed = imputation_input.notna().any(axis=0).to_numpy()[:n_targets]
    filled_targets = [
        col for col, seen in zip(target_columns, has_observed) if seen
    ]

# Prepare the output dataframe. Targets with no observed values cannot be
# imputed and are left as they are (all missing) instead of being overwritten
# with values that belong to a different column.
xgb_pred = submission.copy()
if filled_targets:
    xgb_pred[filled_targets] = imputed_xgb[:, :len(filled_targets)]

# Save to CSV and report the file name that was actually written.
xgb_pred.to_csv('xgb_imputation.csv', index=False)
print("XGBoost imputation predictions saved to 'xgb_imputation.csv'.")


[IterativeImputer] Completing matrix with shape (12065, 53)
[IterativeImputer] Ending imputation round 1/50, elapsed time 24.26
[IterativeImputer] Change: 2.5996083081703185, scaled tolerance: 0.0250595 
[IterativeImputer] Ending imputation round 2/50, elapsed time 52.21
[IterativeImputer] Change: 0.32498234510421753, scaled tolerance: 0.0250595 
[IterativeImputer] Ending imputation round 3/50, elapsed time 72.16
[IterativeImputer] Change: 0.1609470695257187, scaled tolerance: 0.0250595 
[IterativeImputer] Ending imputation round 4/50, elapsed time 87.42
[IterativeImputer] Change: 0.14960743486881256, scaled tolerance: 0.0250595 
[IterativeImputer] Ending imputation round 5/50, elapsed time 102.70
[IterativeImputer] Change: 0.1466507464647293, scaled tolerance: 0.0250595 
[IterativeImputer] Ending imputation round 6/50, elapsed time 117.38
[IterativeImputer] Change: 0.18262746930122375, scaled tolerance: 0.0250595 
[IterativeImputer] Ending imputation round 7/50, elapsed time 132.61
[I