In [None]:
# To change the file locations, scroll down to the path constants at the start of the MAIN EXECUTION section.

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor

Preparation: define the helper functions used by the pipeline

In [None]:
def load_data_parquet_csv(test_path, submission_path, train_path):
    """Read the train/test parquet files and the sample-submission CSV.

    Note that the parameter order (test, submission, train) is NOT the same
    as the order of the returned tuple (train, test, sample submission).

    Parameters
    ----------
    test_path : path-like
        Location of the test set, read with ``pd.read_parquet``.
    submission_path : path-like
        Location of the sample submission, read with ``pd.read_csv``.
    train_path : path-like
        Location of the training set, read with ``pd.read_parquet``.

    Returns
    -------
    tuple
        ``(train_df, test_df, sample_submission_df)``.
    """
    # Tuple elements are evaluated left to right: train, then test, then CSV.
    return (
        pd.read_parquet(train_path),
        pd.read_parquet(test_path),
        pd.read_csv(submission_path),
    )

def prepare_imputation_data(test_df, timestamp_col='timestamp'):
    """Split a frame into the columns to impute and its timestamp column.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame containing ``timestamp_col`` plus the columns to impute.
        It is not modified.
    timestamp_col : str, default 'timestamp'
        Name of the column to set aside before imputation.

    Returns
    -------
    tuple
        ``(imputation_df, timestamp)``: a new frame without ``timestamp_col``
        and the ``timestamp_col`` Series taken from ``test_df``.
    """
    timestamp_series = test_df[timestamp_col]
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    feature_frame = test_df.drop(columns=timestamp_col)
    return feature_frame, timestamp_series

In [None]:
def build_extra_trees_imputer(*, n_estimators=300, max_depth=30, max_iter=10, tol=0, verbose=2):
    """Build an (unfitted) IterativeImputer backed by an ExtraTreesRegressor.

    Calling the function with no arguments reproduces the original,
    hard-coded configuration. The keyword-only parameters expose the knobs
    that trade run time against imputation quality.

    Parameters
    ----------
    n_estimators : int, default 300
        Number of trees in the ExtraTreesRegressor.
    max_depth : int or None, default 30
        Maximum depth of each tree.
    max_iter : int, default 10
        Maximum number of imputation rounds.
    tol : float, default 0
        Early-stopping tolerance of the imputer. With ``tol=0`` the stopping
        criterion is not expected to be reached, so all ``max_iter`` rounds
        are run (scikit-learn may emit a ConvergenceWarning).
    verbose : int, default 2
        Verbosity level passed to the IterativeImputer.

    Returns
    -------
    IterativeImputer
        Configured but not yet fitted.
    """
    estimator = ExtraTreesRegressor(
        n_estimators=n_estimators,
        criterion='squared_error',
        max_depth=max_depth,
        min_samples_split=2,
        min_samples_leaf=2,
        max_features=0.8,
        max_leaf_nodes=None,
        bootstrap=True,
        n_jobs=-1,  # use all available cores
        random_state=42,
        warm_start=False,
        max_samples=None
    )
    imputer = IterativeImputer(
        estimator=estimator,
        missing_values=float("nan"),
        sample_posterior=False,
        max_iter=max_iter,
        tol=tol,
        n_nearest_features=None,
        initial_strategy='mean',
        imputation_order='roman',  # impute features left to right
        skip_complete=True,
        verbose=verbose,
        random_state=99,
        add_indicator=False
    )
    return imputer

In [None]:
# I know that with tol=0 a very large number of iterations is needed to
# converge, but I could not run a larger max_iter on my PC. I therefore
# left max_iter at 10 and still kept tol=0, which gave a decent
# improvement in the final result.

In [None]:
def impute_data(imputer, data_to_impute):
    """Fit ``imputer`` on ``data_to_impute`` and return the imputed values.

    Parameters
    ----------
    imputer : object
        Anything exposing ``fit_transform(X)``, e.g. a scikit-learn imputer.
    data_to_impute : array-like or pandas.DataFrame
        Data containing the missing values to fill.

    Returns
    -------
    object
        Whatever ``imputer.fit_transform`` returns (for scikit-learn imputers
        this is typically a NumPy array, so column labels are lost).
    """
    # Progress messages bracket the call because fitting can take a long time.
    print("Starting imputation...")
    filled_values = imputer.fit_transform(data_to_impute)
    print("Imputation completed.")
    return filled_values

def reconstruct_imputed_df(imputed_array, impute_columns, timestamp, timestamp_col='timestamp'):
    """Rebuild a labelled DataFrame from imputer output and re-attach the timestamp.

    Parameters
    ----------
    imputed_array : array-like of shape (n_rows, n_columns)
        Values produced by the imputer.
    impute_columns : sequence of labels
        Column labels for ``imputed_array``, in the order the columns were
        passed to the imputer.
    timestamp : pandas.Series
        One timestamp per row. It is attached positionally (via ``.values``),
        so its index is ignored.
    timestamp_col : str, default 'timestamp'
        Name of the re-attached column. Pass the same name that was used
        when the column was split off in ``prepare_imputation_data``.

    Returns
    -------
    pandas.DataFrame
        The imputed columns followed by ``timestamp_col`` as the last column,
        with a fresh default index.

    Raises
    ------
    ValueError
        If ``imputed_array`` cannot be labelled with ``impute_columns``
        (e.g. the column counts differ) or ``timestamp`` has a different
        number of rows.
    """
    try:
        imputed_df = pd.DataFrame(imputed_array, columns=impute_columns)
    except ValueError as exc:
        # pandas' own shape error does not hint at the most likely cause here.
        raise ValueError(
            f"Imputed array could not be labelled with the {len(impute_columns)} "
            "expected columns. A common cause is that IterativeImputer drops "
            "features that were entirely missing during fit unless "
            "keep_empty_features=True is set."
        ) from exc
    imputed_df[timestamp_col] = timestamp.values
    return imputed_df

In [None]:
def filter_to_submission_columns(imputed_df, sample_submission_df):
    """Select the sample-submission columns from ``imputed_df``, in submission order.

    Parameters
    ----------
    imputed_df : pandas.DataFrame
        Frame holding the imputed values (and any extra columns).
    sample_submission_df : pandas.DataFrame
        Template whose column names and order define the output layout.

    Returns
    -------
    pandas.DataFrame
        A new frame with every column of ``sample_submission_df`` that also
        exists in ``imputed_df``, ordered as in ``sample_submission_df``.

    Warns
    -----
    UserWarning
        If the sample submission lists columns that ``imputed_df`` does not
        have. Those columns are still left out of the result, but no longer
        silently.
    """
    import warnings  # local import: keeps this function self-contained

    # Keep only columns present in sample_submission, in the same order
    common_cols = []
    missing_cols = []
    for col in sample_submission_df.columns:
        if col in imputed_df.columns:
            common_cols.append(col)
        else:
            missing_cols.append(col)

    if missing_cols:
        warnings.warn(
            f"{len(missing_cols)} sample-submission column(s) are missing from the "
            f"imputed data and will be absent from the output: {missing_cols}",
            stacklevel=2,
        )
    return imputed_df[common_cols]

def save_final_submission(filtered_df, output_path):
    """Write the submission frame to CSV (without the index) and report the path.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Frame to write; its index is not included in the file.
    output_path : path-like
        Destination handed to ``DataFrame.to_csv``.

    Returns
    -------
    None
    """
    filtered_df.to_csv(output_path, index=False)
    confirmation = f"Final submission saved to '{output_path}'."
    print(confirmation)

MAIN EXECUTION

In [None]:
# File locations used by the pipeline cells below. Relative paths are resolved
# against the current working directory; edit them to point at your own files.
# Parquet file passed to load_data_parquet_csv as train_path.
TRAIN_PATH = 'train_data.parquet'
# Parquet file passed to load_data_parquet_csv as test_path (the data that gets imputed).
TEST_PATH = 'test_data.parquet'
# CSV whose column names/order define the layout of the final submission.
SUBMISSION_PATH = 'sample_submission.csv'
# Destination CSV written by save_final_submission.
OUTPUT_PATH = 'final_submission.csv'

In [None]:
# 1. Load data (test and train as parquet, sample submission as CSV).
#    Paths are passed by keyword because the loader's parameter order
#    (test, submission, train) differs from the order of its return values.
train_df, test_df, sample_submission_df = load_data_parquet_csv(
    test_path=TEST_PATH,
    submission_path=SUBMISSION_PATH,
    train_path=TRAIN_PATH,
)

# 2. Prepare data for imputation: set the timestamp column aside so that only
#    the remaining columns are handed to the imputer.
imputation_df, timestamp = prepare_imputation_data(
    test_df=test_df,
    timestamp_col='timestamp',
)

In [None]:
# 3. Build and fit imputer (the fit runs on the test features themselves)
imputer = build_extra_trees_imputer()
imputed_array = impute_data(imputer=imputer, data_to_impute=imputation_df)

# 4. Reconstruct imputed DataFrame with timestamp, restoring the column labels
#    that were lost when the imputer returned a plain array
imputed_df = reconstruct_imputed_df(
    imputed_array=imputed_array,
    impute_columns=imputation_df.columns,
    timestamp=timestamp,
)

In [None]:
# 5. Filter columns to match sample submission (same columns, same order)
filtered_df = filter_to_submission_columns(
    imputed_df=imputed_df,
    sample_submission_df=sample_submission_df,
)

# 6. Save final submission as CSV
save_final_submission(filtered_df=filtered_df, output_path=OUTPUT_PATH)

This saved file is my best submission