In [1]:
import pandas as pd
import numpy as np
import json
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler

from tabnanny import verbose

In [2]:
# Location of the merged dataset produced by the previous processing step
directory = "./PROCESSED/DATA"
file_name = "merged_and_dropped.parquet"
path = "/".join((directory, file_name))

# Names of the columns that must be treated as categorical
with open("./PROCESSED/DATA/merged_and_dropped.cat_cols.json") as f:
    cat_cols = json.load(f)

# Read the data, cast the listed columns to category dtype, then drop SEQN
df = (
    pd.read_parquet(path)
    .astype({col: "category" for col in cat_cols})
    .drop(columns=['SEQN'])
)

# Per-column dtypes after casting, kept for inspection
dtypes = df.dtypes

In [3]:
# split data into train-test

# The CSV below is written into RESULTS/, so make sure the folder exists first;
# otherwise this cell fails on a fresh checkout where RESULTS/ has not been created yet.
os.makedirs("RESULTS", exist_ok=True)

# Whole-frame split (features and target together); kept so the names
# train_pre_cleaned / test_pre_cleaned remain available to any later cell.
train_pre_cleaned, test_pre_cleaned = train_test_split(df, test_size=0.2, random_state=42)

# Feature / target split: LBXGH is the target column
X = df.drop(columns=['LBXGH'])
y = df[['LBXGH']]
X_train_pre_cleaned, X_test_pre_cleaned, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dump the raw (not yet imputed) test features for manual checking
X_test_pre_cleaned.to_csv("RESULTS/X_test_pre_cleaned.csv", index=False)

# Imputation
Random Forest imputation using scikit-learn's `IterativeImputer`. MissForest does not allow re-using the fitted model to impute the test set.

step 1 - ordinal-encode both train and test sets (encoder fitted on the train set only)  
step 2 - fit the `IterativeImputer` (tree-based estimator) on the train set only  
step 3 - use the fitted imputation model on both the train and test sets  

In [4]:
# IterativeImputer only accepts numerical input, so categorical columns are ordinal-encoded first.
# Tree-based estimators cope with ordinal codes for categorical variables,
# whereas one-hot encoding would inflate the number of features too much.
# (Linear models would need one-hot encoding to avoid implying an order.)
# The codes are decoded back to the original categories later, for the distribution tests.

# named cat_cols instead of cat_vars to avoid confusion
cat_cols = list(X_train_pre_cleaned.select_dtypes(include=['category']).columns)
# cat_cols = X_train_pre_cleaned.select_dtypes(include=['category', 'object']).columns.tolist() # include 'object' dtype as well

# Categories that were not seen while fitting are encoded as -1
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Work on copies so the pre-cleaned frames stay untouched
X_train_ordinal = X_train_pre_cleaned.copy()
X_test_ordinal = X_test_pre_cleaned.copy()

# Fit the encoder on the train set only, then apply it to both sets
X_train_ordinal[cat_cols] = ordinal_encoder.fit_transform(X_train_pre_cleaned[cat_cols])
X_test_ordinal[cat_cols] = ordinal_encoder.transform(X_test_pre_cleaned[cat_cols])

# turn the "unknown" code (-1) into NaN so the imputer fills those cells as well
for encoded_frame in (X_train_ordinal, X_test_ordinal):
    for c in cat_cols:
        encoded_frame[c] = encoded_frame[c].replace(-1, np.nan)

# not needed but in case want to try distance-based imputers later
# num_cols = [c for c in X_train_ordinal.columns if c not in cat_cols]
# scaler = StandardScaler()
# X_train_scaled = X_train_ordinal.copy()
# X_test_scaled = X_test_ordinal.copy()
# X_train_scaled[num_cols] = scaler.fit_transform(X_train_scaled[num_cols])
# X_test_scaled[num_cols] = scaler.transform(X_test_scaled[num_cols])

In [11]:
# For a random forest imputer, we don't really need a huge number of trees
# Imputation is about generating stable estimates, not prediction accuracy
# So we can limit the number of trees to speed up computation
# Usually n=10-50 is sufficient, but we can go a bit higher if it's unstable

# def random_forest_imputer(n_estimators=20, max_iter=3):
#     rf_imputer = IterativeImputer(
#         estimator=RandomForestRegressor(n_estimators=n_estimators, max_depth=10, n_jobs=-1, random_state=42),
#         max_iter=max_iter,
#         random_state=42
#     )
#     return rf_imputer


# def random_forest_imputer():
#     rf_imputer = IterativeImputer(
#         estimator=RandomForestRegressor(
#             n_estimators=20,
#             max_depth=10,
#             min_samples_leaf=30,     # increased for stability
#             max_features='sqrt',
#             n_jobs=-1,
#             random_state=42
#         ),
#         n_nearest_features=60,       # uses most relevant subset of features
#         max_iter=10,
#         tol=0.01,                    # balanced tolerance for convergence
#         initial_strategy='median',
#         random_state=42,
#         verbose=2
#     )
#     return rf_imputer

# def random_forest_imputer():
#     rf_imputer = IterativeImputer(
#         estimator=RandomForestRegressor(
#             n_estimators=20,
#             max_depth=15,
#             min_samples_leaf=20,
#             min_samples_split=40,
#             max_features='sqrt',
#             n_jobs=-1,
#             random_state=42
#         ),
#         n_nearest_features=60,
#         max_iter=20,
#         tol=0.01,
#         initial_strategy='median',
#         random_state=42,
#         verbose=2
#     )
#     return rf_imputer

# def extra_trees_imputer(n_estimators=20, max_iter=3, verbose=0):
#     et_imputer = IterativeImputer(
#         estimator=ExtraTreesRegressor(n_estimators=n_estimators, max_depth=10, n_jobs=-1, random_state=42),
#         max_iter=max_iter,
#         random_state=42,
#         verbose=verbose
#     )
#     return et_imputer

# def extra_trees_imputer(n_estimators=20, max_iter=10, verbose=0):
#     et_imputer = IterativeImputer(
#         estimator=ExtraTreesRegressor(
#             n_estimators=n_estimators, 
#             max_depth=6, 
#             min_samples_leaf=10,
#             n_jobs=-1, 
#             random_state=42
#         ),
#         max_iter=max_iter,
#         initial_strategy='median',
#         random_state=42,
#         verbose=verbose
#     )
#     return et_imputer

# def extra_trees_imputer():
#     et_imputer = IterativeImputer(
#         estimator=ExtraTreesRegressor(
#             n_estimators=30,
#             max_depth=10,
#             min_samples_leaf=30,     # increased for stability
#             max_features='sqrt',
#             n_jobs=-1,
#             random_state=42
#         ),
#         n_nearest_features=60,       # uses most relevant subset of features
#         max_iter=10,
#         tol=0.01,                    # 0.001 is too small
#         initial_strategy='median',
#         random_state=42,
#         verbose=2
#     )
#     return et_imputer


def extra_trees_imputer(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=40,
    min_samples_split=60,
    n_nearest_features=40,
    max_iter=10,
    random_state=42,
    verbose=2,
):
    """Build an unfitted IterativeImputer that uses an ExtraTreesRegressor.

    Calling the function without arguments gives the configuration used in
    this notebook; the keyword arguments exist so alternative settings can be
    tried without copy-pasting the whole definition.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the ExtraTreesRegressor.
    max_depth : int
        Maximum depth of each tree.
    min_samples_leaf : int
        Minimum number of samples required in a leaf.
    min_samples_split : int
        Minimum number of samples required to split a node.
    n_nearest_features : int
        Number of other features IterativeImputer uses to estimate each feature.
    max_iter : int
        Maximum number of imputation rounds.
    random_state : int
        Seed passed to both the regressor and the imputer.
    verbose : int
        Verbosity level of the IterativeImputer.

    Returns
    -------
    IterativeImputer
        The configured, not yet fitted, imputer.
    """
    estimator = ExtraTreesRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        max_features='sqrt',
        n_jobs=-1,
        random_state=random_state
    )
    et_imputer = IterativeImputer(
        estimator=estimator,
        n_nearest_features=n_nearest_features,
        max_iter=max_iter,
        initial_strategy='median',
        random_state=random_state,
        imputation_order='ascending',
        verbose=verbose
    )
    return et_imputer

# Build the ExtraTrees-based IterativeImputer defined above.
imputer = extra_trees_imputer()
# imputer = random_forest_imputer()

# Fit on the encoded training features only and impute them in the same call.
X_train = imputer.fit_transform(X_train_ordinal)

[IterativeImputer] Completing matrix with shape (7789, 256)
[IterativeImputer] Ending imputation round 1/10, elapsed time 54.30
[IterativeImputer] Change: 10621.036681443979, scaled tolerance: 68.0 
[IterativeImputer] Ending imputation round 2/10, elapsed time 108.37
[IterativeImputer] Change: 3403.843214620837, scaled tolerance: 68.0 
[IterativeImputer] Ending imputation round 3/10, elapsed time 161.39
[IterativeImputer] Change: 3504.9341219513453, scaled tolerance: 68.0 
[IterativeImputer] Ending imputation round 4/10, elapsed time 217.70
[IterativeImputer] Change: 3520.375820193589, scaled tolerance: 68.0 
[IterativeImputer] Ending imputation round 5/10, elapsed time 272.09
[IterativeImputer] Change: 2621.9086009891275, scaled tolerance: 68.0 
[IterativeImputer] Ending imputation round 6/10, elapsed time 327.70
[IterativeImputer] Change: 2785.279687242277, scaled tolerance: 68.0 
[IterativeImputer] Ending imputation round 7/10, elapsed time 384.43
[IterativeImputer] Change: 2333.067



> The iterative imputer did not reach the convergence tolerance. However, the change stabilized and did not diverge after the first few iterations, which suffices for practical purposes. The imputation was terminated after 10 rounds because further iterations are unlikely to produce any meaningful improvement.

In [6]:
# joblib.dump(imputer, "RESULTS/extra_trees_imputer.pkl", compress=3)

In [12]:
# Impute the encoded test features with the imputer that was fitted on the training set.
X_test = imputer.transform(X_test_ordinal)

[IterativeImputer] Completing matrix with shape (1948, 256)
[IterativeImputer] Ending imputation round 1/10, elapsed time 7.54
[IterativeImputer] Ending imputation round 2/10, elapsed time 15.01
[IterativeImputer] Ending imputation round 3/10, elapsed time 22.87
[IterativeImputer] Ending imputation round 4/10, elapsed time 32.18
[IterativeImputer] Ending imputation round 5/10, elapsed time 41.55
[IterativeImputer] Ending imputation round 6/10, elapsed time 50.25
[IterativeImputer] Ending imputation round 7/10, elapsed time 59.40
[IterativeImputer] Ending imputation round 8/10, elapsed time 66.79
[IterativeImputer] Ending imputation round 9/10, elapsed time 74.17
[IterativeImputer] Ending imputation round 10/10, elapsed time 81.50


In [17]:
# convert back to DataFrames (the imputer returns plain arrays)
X_train_imputed = pd.DataFrame(X_train, columns=X_train_ordinal.columns, index=X_train_ordinal.index)
X_test_imputed  = pd.DataFrame(X_test, columns=X_test_ordinal.columns,  index=X_test_ordinal.index)

# reverse scaling for numeric columns, if applied
# X_train_imputed[num_cols] = scaler.inverse_transform(X_train_imputed[num_cols])
# X_test_imputed[num_cols] = scaler.inverse_transform(X_test_imputed[num_cols])

# rounding categorical codes to valid range before inverse_transform
for i, c in enumerate(cat_cols):
    # When a column had missing values at fit time, OrdinalEncoder stores NaN as an
    # extra (last) entry of categories_. Counting it would make the top code decode
    # back to NaN, so only the non-missing categories define the valid code range.
    n = int(pd.notna(ordinal_encoder.categories_[i]).sum())
    max_code = max(n - 1, 0)
    X_train_imputed[c] = np.clip(np.rint(X_train_imputed[c]).astype(int), 0, max_code)
    X_test_imputed[c] = np.clip(np.rint(X_test_imputed[c]).astype(int),  0, max_code)

# decode the integer codes back to the original category labels
X_train_imputed[cat_cols] = ordinal_encoder.inverse_transform(X_train_imputed[cat_cols])
X_test_imputed[cat_cols] = ordinal_encoder.inverse_transform(X_test_imputed[cat_cols])

# restore categorical dtype
for col in cat_cols:
    X_train_imputed[col] = X_train_imputed[col].astype('category')
    X_test_imputed[col] = X_test_imputed[col].astype('category')

#### Binning

In [19]:
# Binning
# SLQ300_Usual_sleep_time_on_weekdays_or_workdays
# SLQ320_Usual_sleep_time_on_weekends
# 1_very_early: 19-20
# 2_early: 21-22
# 3_normal: 23-00
# 4_late: 01-02
# 5_extreme: else


cols = ["P_SLQ__SLQ320_Usual_sleep_time_on_weekends", "P_SLQ__SLQ300_Usual_sleep_time_on_weekdays_or_workdays"]

# The loop variable is deliberately not named `df`: that name holds the dataset loaded
# at the top of the notebook and must not be overwritten by this cell.
# NOTE: the columns are overwritten in place, so this cell is meant to be run once
# on freshly imputed frames.
for frame in (X_train_imputed, X_test_imputed):
    for col in cols:
        # convert "HH:MM" to rounded hour 0–23 (values that cannot be parsed become NaN)
        hour = pd.to_datetime(frame[col], format="%H:%M", errors="coerce").dt.round("h").dt.hour

        # binning according to plan above
        frame[col] = np.select(
            [
                # use isin instead of between since we already round it anyway and to handle the midnight
                hour.isin([19, 20]),       # 1 very early: 19–20
                hour.isin([21, 22]),       # 2 early: 21–22
                hour.isin([23, 0]),        # 3 normal: 23–00
                hour.isin([1, 2]),         # 4 late: 01–02
            ],
            [1, 2, 3, 4],
            default=5                      # 5 extreme: everything else / NaN
        ).astype("int64")   # convert to clear integer, but later it will be preserved as category dtype by json files


# Taken out, but kept here for reference
# SLQ310_Usual_wake_time_on_weekdays_or_workdays
# SLQ330_Usual_wake_time_on_weekends
# 1_very_early: 3-4
# 2_early: 5-6
# 3_normal: 7-8
# 4_late: 9-10
# 5_extreme: else

# SLD012_Sleep_hours_weekdays_or_workdays
# SLD013_Sleep_hours_weekends
# ≤5 hours -> Very short
# >5-<7 hours -> Short
# 7-<9 hours -> Normal
# ≥9 hours -> Long
cols = ["P_SLQ__SLD012_Sleep_hours_weekdays_or_workdays", "P_SLQ__SLD013_Sleep_hours_weekends"]

# The loop variable is deliberately not named `df`: that name holds the dataset loaded
# at the top of the notebook and must not be overwritten by this cell.
# NOTE: the columns are overwritten in place, so re-running this cell on already-binned
# values would bin the bin labels themselves; run it once on freshly imputed frames.
for frame in (X_train_imputed, X_test_imputed):
    for col in cols:
        # convert to numeric (values that cannot be parsed become NaN)
        hours = pd.to_numeric(frame[col], errors="coerce")

        # binning according to plan above
        frame[col] = np.select(
            [
                hours <= 5,                   # 1: very short
                (hours > 5) & (hours < 7),    # 2: short
                (hours >= 7) & (hours < 9),   # 3: normal
                hours >= 9                    # 4: long
            ],
            [1, 2, 3, 4],
            # The four conditions cover every number, so the default is only reached by NaN.
            # There is no fifth bin for sleep hours: NaN falls into bin 4 (long).
            default=4
        ).astype("int64")   # convert to clear integer, but later it will be preserved as category dtype by json files


In [20]:
# Create every output folder up front (no error if it already exists)
for out_dir in ("INPUTS/TRAIN", "INPUTS/TEST", "RESULTS"):
    os.makedirs(out_dir, exist_ok=True)

# Destination file -> frame to store; written in this order, without the index
parquet_outputs = (
    ("INPUTS/TRAIN/X_train.parquet", X_train_imputed),
    ("INPUTS/TEST/X_test.parquet", X_test_imputed),
    ("INPUTS/TRAIN/y_train.parquet", y_train),
    ("INPUTS/TEST/y_test.parquet", y_test),
)
for target_path, frame_to_save in parquet_outputs:
    frame_to_save.to_parquet(target_path, index=False)