# In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# Build a submission file by copying the target columns from the test data
# into the sample-submission layout and filling their missing values with a
# k-nearest-neighbours imputer fitted on the test data itself.

# Load the test data and the sample submission (which defines the output layout).
test = pd.read_parquet('test_data.parquet')
sample_submission = pd.read_csv('sample_submission.csv')

# Single source of truth for the output path, so the file that is written and
# the message that is printed can never disagree.
output_path = 'knn_imputation_n35.csv'

# Use only columns that are in the sample submission and present in test.
target_columns = [
    col
    for col in sample_submission.columns
    if col != 'timestamp' and col in test.columns
]
submission = sample_submission.copy()
for col in target_columns:
    # NOTE(review): assigning a Series aligns on index labels, whereas the
    # timestamp below is copied positionally via `.values`. This assumes
    # `test` and `sample_submission` share the same index -- confirm.
    submission[col] = test[col]

# KNNImputer discards features that have no observed values at fit time, which
# would shift the column positions of its output relative to the input. Only
# impute columns with at least one observed value so that position `i` in the
# imputed array always corresponds to `impute_cols[i]`. Columns that are
# entirely missing cannot be imputed and are left as NaN.
impute_cols = [col for col in target_columns if submission[col].notna().any()]

if impute_cols:
    impute_data = submission[impute_cols].copy()

    # KNN imputer; n_neighbors can be tuned.
    imputer = KNNImputer(n_neighbors=35, weights="uniform")
    imputed = imputer.fit_transform(impute_data)

    # Write back the imputed values only where the original value was missing,
    # leaving observed values (and their dtypes) untouched.
    for i, col in enumerate(impute_cols):
        mask = submission[col].isna()
        submission.loc[mask, col] = imputed[:, i][mask.values]

# Take the timestamp from the test data when both frames have one.
if 'timestamp' in sample_submission.columns and 'timestamp' in test.columns:
    submission['timestamp'] = test['timestamp'].values

# Save the imputed test data and report the path that was actually written.
submission.to_csv(output_path, index=False)
print(f"KNN imputation (test data only) saved to '{output_path}'.")


# Recorded output of the original notebook run:
# KNN imputation (test data only) saved to 'knn_imputation_test_only.csv'.
