# Train-Test Split

Step 1 - Read dict_data_type.xlsx as tables, remove variables with IS_KEEP = FALSE, and clean up  
Step 2 - Merge data with diabetes_medication output  
Step 3 - Split data into train-test  

In [1]:
# Data manipulation
import numpy as np
import pandas as pd

# Statistics
from scipy.stats import ks_2samp, chi2_contingency

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the variable-selection sheet and keep a CSV copy of it.
init_selection_tab = pd.read_excel('SHEETS/dict_data_type.xlsx', sheet_name='init_selection')
init_selection_tab.to_csv('TABLES/init_selection_tab.csv', index=False)
# Element-wise comparisons: rows flagged IS_KEEP == False are dropped,
# rows flagged IS_CATEGORICAL == True are cast to category further down.
drop_vars = init_selection_tab.loc[init_selection_tab["IS_KEEP"] == False, "variable_name"]
cat_vars = init_selection_tab.loc[init_selection_tab["IS_CATEGORICAL"] == True, "variable_name"]

# Primary dataframe, read with the dtypes stored alongside it.
dtypes = pd.read_csv("INPUTS/CSV/df_03_renamed_dtypes.csv", index_col=0).squeeze("columns").to_dict()
df = pd.read_csv("INPUTS/CSV/df_03_renamed.csv", dtype=dtypes)
df = df.drop(columns=drop_vars)

# Diabetes medication table, read the same way.
diamed_dtypes = pd.read_csv("INPUTS/CSV/DIAMED_dtypes.csv", index_col=0).squeeze("columns").to_dict()
diamed = pd.read_csv("INPUTS/CSV/DIAMED.csv", dtype=diamed_dtypes)

# Left merge on SEQN keeps every row of the primary dataframe.
df = df.merge(diamed, on="SEQN", how="left")

# Cast the flagged variables to category (only those still present after the drop).
for var in cat_vars:
    if var in df.columns:
        df[var] = df[var].astype("category")

# Table of invalid codes (7, 77, 9, 99, ...) per variable; read as strings.
invalid_map = pd.read_csv("./PROCESSED/DATA/invalid_map_questionnaire_2017_2020.csv", dtype=str)
# Blank cells are read as NaN (a float) even with dtype=str; drop them so that
# every remaining code is a real string and str.isdigit() below cannot fail.
invalid_map = invalid_map.dropna(subset=["variable", "invalid_code"])
invalid_dict = invalid_map.groupby("variable")["invalid_code"].apply(list).to_dict()

# Replace invalid answers (don't know, refused, etc.) with NA.
for var, codes in invalid_dict.items():
    # Every column whose name starts with the variable name (an exact match included).
    cols = [c for c in df.columns if c.startswith(var)]
    if not cols:
        continue
    # Match the codes both as strings ("7"/"77"/".") and as numbers (7/77).
    codes_mixed = set(codes) | {int(c) for c in codes if c.isdigit()}
    df[cols] = df[cols].mask(df[cols].isin(codes_mixed), pd.NA)

del dtypes, diamed_dtypes, init_selection_tab, drop_vars, cat_vars, diamed

In [3]:
# Hold out 20% of the rows as the test set, with a fixed seed.
# First split: the whole dataframe, target column still included.
train_pre_cleaned, test_pre_cleaned = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Second split, with the same test_size and random_state, on the predictors
# and the LBXGH target held in separate frames.
y = df[['LBXGH']]
X = df.drop(columns=['LBXGH'])
(
    X_train_pre_cleaned,
    X_test_pre_cleaned,
    y_train,
    y_test,
) = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the test predictors as they are before encoding and imputation.
X_test_pre_cleaned.to_csv("RESULTS/X_test_pre_cleaned.csv", index=False)

# The unsplit predictor/target frames are not needed any more.
del X, y

# Imputation
Random forest imputation using scikit-learn's `IterativeImputer`. MissForest does not allow re-using the fitted model to impute the test set.

Step 1 - Encode both train and test sets  
Step 2 - Train the IterativeImputer (random forest) on the train set only  
Step 3 - Use the trained imputation model on both train and test sets  
Step 4 - Perform distribution tests: Kolmogorov-Smirnov for continuous, Chi-square for categorical  

In [4]:
# The categorical columns are ordinal-encoded because the iterative imputer only
# accepts numerical values.
# Trees can handle ordinal-encoded categorical variables without issue, and
# one-hot encoding would increase the number of features too much.
# Linear models, however, would need one-hot encoding to avoid implying an order.

# Named cat_cols instead of cat_vars to avoid confusion with the selection table.
cat_cols = X_train_pre_cleaned.select_dtypes(include=['category']).columns.tolist()
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train_ordinal = X_train_pre_cleaned.copy()
X_test_ordinal = X_test_pre_cleaned.copy()

# Skip the encoder when there is nothing to encode.
if cat_cols:
    # Learn the category -> code mapping from the TRAIN split only, then apply
    # that same mapping to the test split. Categories that appear only in the
    # test split are encoded as -1 (unknown_value).
    X_train_ordinal[cat_cols] = ordinal_encoder.fit_transform(X_train_ordinal[cat_cols])
    X_test_ordinal[cat_cols] = ordinal_encoder.transform(X_test_ordinal[cat_cols])

In [6]:
# A random forest used for imputation does not need a large number of trees:
# the goal is stable estimates of the missing values, not prediction accuracy.
# Keeping the forest small speeds up computation. Usually 10-50 trees are
# sufficient; go a bit higher if the imputations turn out to be unstable.
def random_forest_imputer(n_estimators=20, random_state=42):
    """Build an unfitted IterativeImputer that uses a random-forest regressor.

    Args:
        n_estimators: Number of trees in the RandomForestRegressor.
        random_state: Seed passed to both the forest and the imputer.

    Returns:
        An IterativeImputer limited to 5 iterations whose estimator is a
        RandomForestRegressor with max_depth=10 running on all cores (n_jobs=-1).
    """
    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=10,
        n_jobs=-1,
        random_state=random_state,
    )
    return IterativeImputer(
        estimator=forest,
        max_iter=5,
        random_state=random_state,
    )

# Fit the imputer on the training split and impute it. fit_transform returns a
# bare NumPy array, so the result is wrapped in a DataFrame to keep the column
# names and the row index of the training split.
train_imputer = random_forest_imputer(n_estimators=20, random_state=42)
X_train = pd.DataFrame(
    train_imputer.fit_transform(X_train_ordinal),
    # The imputer's own output names stay correct even if it dropped a column
    # that had no observed value at fit time.
    columns=train_imputer.get_feature_names_out(),
    index=X_train_ordinal.index,
)



In [None]:
# Impute the training split a second time with a different seed and report the
# largest absolute element-wise difference from the first imputation.
X_train2 = random_forest_imputer(
    n_estimators=20,
    random_state=0,
).fit_transform(X_train_ordinal)
diff = np.abs(np.asarray(X_train) - np.asarray(X_train2))
print(f'Max difference between two imputations on train set with different random states: {diff.max()}')

In [None]:
# Impute the test split with an imputer fitted on the TRAINING split only, so
# that no test-set information goes into the imputation model.
# The imputer is fitted here with the same settings and seed that are used for
# the training imputation, which keeps this cell runnable on its own.
test_imputer = random_forest_imputer(n_estimators=20, random_state=42).fit(X_train_ordinal)
# transform returns a bare NumPy array; wrap it to keep column names and index.
X_test = pd.DataFrame(
    test_imputer.transform(X_test_ordinal),
    columns=test_imputer.get_feature_names_out(),
    index=X_test_ordinal.index,
)

In [None]:
# Distribution tests between the imputed train and test sets:
# Kolmogorov-Smirnov for continuous variables, Chi-square for categorical ones.


def _imputed_frame(imputed, template):
    """Return *imputed* as a DataFrame, labelling a bare array from *template*."""
    if isinstance(imputed, pd.DataFrame):
        return imputed
    labels = template.columns
    if imputed.shape[1] != len(labels):
        # NOTE(review): IterativeImputer removes features that were entirely
        # missing when it was fitted; this assumes the array was produced by an
        # imputer fitted on `template` -- confirm if the pipeline changes.
        labels = labels[template.notna().any().to_numpy()]
    return pd.DataFrame(imputed, columns=labels, index=template.index)


train_imputed = _imputed_frame(X_train, X_train_ordinal)
test_imputed = _imputed_frame(X_test, X_test_ordinal)

# After encoding and imputation no column has the category dtype any more, so
# the categorical variables are taken from cat_cols instead of the dtype.
categorical_cols = set(cat_cols)

ks_results = []
chi2_results = []

for col in train_imputed.columns:
    if col == "LBXGH" or col not in test_imputed.columns:
        continue

    if col in categorical_cols:
        # Chi-square test for categorical variables.
        # Imputed values come from a regressor and need not be whole numbers,
        # so they are rounded to the nearest category code before counting.
        train_counts = train_imputed[col].round().value_counts(dropna=False)
        test_counts = test_imputed[col].round().value_counts(dropna=False)
        cats = train_counts.index.union(test_counts.index)
        contingency = pd.DataFrame({
            "train": train_counts.reindex(cats, fill_value=0),
            "test": test_counts.reindex(cats, fill_value=0),
        }).T

        # The test needs at least two categories; otherwise skip the variable.
        if contingency.shape[1] >= 2:
            chi2, p, dof, expected = chi2_contingency(contingency)
            chi2_results.append({"variable": col, "Chi2_stat": chi2, "p_value": p})
    else:
        # KS test for continuous variables.
        ks_stat, ks_p = ks_2samp(train_imputed[col].dropna(), test_imputed[col].dropna())
        ks_results.append({"variable": col, "KS_stat": ks_stat, "p_value": ks_p})

# Explicit columns keep "p_value" available even when no variable was tested.
ks_results_df = pd.DataFrame(ks_results, columns=["variable", "KS_stat", "p_value"])
chi2_results_df = pd.DataFrame(chi2_results, columns=["variable", "Chi2_stat", "p_value"])

# KS summary (the percentage is reported as 0 when nothing was tested).
total_ks = len(ks_results_df)
n_sig_ks = (ks_results_df["p_value"] < 0.05).sum()
pct_sig_ks = n_sig_ks / total_ks * 100 if total_ks else 0.0
print(f"KS test: {n_sig_ks} variables ({pct_sig_ks:.2f}%) have p < 0.05")

# Chi-square summary (the percentage is reported as 0 when nothing was tested).
total_chi2 = len(chi2_results_df)
n_sig_chi2 = (chi2_results_df["p_value"] < 0.05).sum()
pct_sig_chi2 = n_sig_chi2 / total_chi2 * 100 if total_chi2 else 0.0
print(f"Chi-square test: {n_sig_chi2} variables ({pct_sig_chi2:.2f}%) have p < 0.05")

# Save results for audit purposes.
ks_results_df.to_excel("RESULTS/KS_results.xlsx", index=False)
chi2_results_df.to_excel("RESULTS/Chi2_results.xlsx", index=False)