# Collinearity and Leakage Test

__Split Distribution__  
Step 1 - Perform a distribution test on the train/test split: Kolmogorov-Smirnov for continuous variables, Chi-square for categorical variables  

__Collinearity (Moved to later part)__  
step 2 - Run the VIF -> drop the highest -> Repeat (Threshold VIF < 5)  
Note:
* Unlike p-value-based screening, where the choice of which variable to drop is arbitrary, VIF is computed for each variable against all remaining variables, which gives a clear value to rank on
* Modern ML models can handle multicollinearity, but GLMs struggle with it  
* VIF will be run after variable selection, and only for the GLM, not for the tree models

__Leakage Test__  
step 3 - Compute the Pearson correlation (and its p-value) of each numeric predictor against the target. Flag predictors whose correlation is suspiciously high (|r| > 0.6).  
step 4 - Run random forest against the data (can use default param). Check top 10 feature importance and check manually for a potential leakage.  

In [1]:
import pandas as pd
import numpy as np
import json
import os

from sklearn.ensemble import RandomForestRegressor

# stats
from scipy.stats import ks_2samp, chi2_contingency, pearsonr
from statsmodels.stats.outliers_influence import variance_inflation_factor

__Split Distribution__  
Step 1 - Perform a distribution test on the train/test split: Kolmogorov-Smirnov for continuous variables, Chi-square for categorical variables  

> KS test: 2 variables (1.60%) have p < 0.05
> Chi-square test: 4 variables (3.05%) have p < 0.05
>
> Conclusion: The train-test split preserved the distribution well

In [2]:
# Names of the categorical feature columns, as recorded by the upstream step.
with open("PROCESSED/DATA/merged_and_dropped.cat_cols.json") as cat_cols_file:
    cat_cols = json.load(cat_cols_file)

# Feature matrices and targets for each side of the split.
X_train, y_train = (
    pd.read_parquet("INPUTS/TRAIN/X_train.parquet"),
    pd.read_parquet("INPUTS/TRAIN/y_train.parquet"),
)
X_test, y_test = (
    pd.read_parquet("INPUTS/TEST/X_test.parquet"),
    pd.read_parquet("INPUTS/TEST/y_test.parquet"),
)

# Cast the listed columns to pandas' category dtype in both feature frames so
# the later cells can tell categorical from continuous by dtype.
for features in (X_train, X_test):
    features[cat_cols] = features[cat_cols].astype("category")

# Every remaining column is treated as numeric.
categorical_names = set(cat_cols)
num_cols = [name for name in X_train.columns if name not in categorical_names]

In [3]:
# Train/test distribution check.
# For every feature, test whether the train and test samples plausibly come
# from the same distribution: chi-square on the per-level counts for columns
# with category dtype, two-sample Kolmogorov-Smirnov for everything else.
ALPHA = 0.05  # significance level used in the printed summaries

ks_results = []
chi2_results = []

for col in X_train.columns:
    # Explicitly excluded from the comparison.
    # NOTE(review): presumably the target/outcome variable — confirm.
    if col == "LBXGH":
        continue

    if isinstance(X_train[col].dtype, pd.CategoricalDtype):
        # Missing values are counted as their own level (dropna=False).
        train_counts = X_train[col].value_counts(dropna=False)
        test_counts = X_test[col].value_counts(dropna=False)

        # Level order is irrelevant to the chi-square statistic, so the levels
        # are not sorted (sorting can fail when NaN is mixed with string labels).
        cats = list(set(train_counts.index) | set(test_counts.index))
        contingency = pd.DataFrame({
            "train": train_counts.reindex(cats, fill_value=0),
            "test": test_counts.reindex(cats, fill_value=0),
        }).T

        # value_counts on a categorical also lists declared-but-unobserved
        # levels with a count of 0. A level that is empty in BOTH splits gives
        # a zero expected frequency, which chi2_contingency rejects with a
        # ValueError, so such levels are removed first.
        observed = (contingency.sum(axis=0) > 0).to_numpy()
        contingency = contingency.loc[:, observed]

        if contingency.shape[1] >= 2:  # need at least two categories
            chi2, p, _dof, _expected = chi2_contingency(contingency)
            chi2_results.append({"variable": col, "Chi2_stat": chi2, "p_value": p})

    else:
        # KS test for continuous
        train_values = X_train[col].dropna()
        test_values = X_test[col].dropna()

        # ks_2samp raises on an empty sample; a column that is entirely
        # missing on either side cannot be compared, so it is skipped.
        if train_values.empty or test_values.empty:
            continue

        ks_stat, ks_p = ks_2samp(train_values, test_values)
        ks_results.append({"variable": col, "KS_stat": ks_stat, "p_value": ks_p})


# Explicit column lists keep the frames well-formed (and "p_value" present)
# even when no variable of that kind was tested.
ks_results_df = pd.DataFrame(ks_results, columns=["variable", "KS_stat", "p_value"])
chi2_results_df = pd.DataFrame(chi2_results, columns=["variable", "Chi2_stat", "p_value"])

# KS summary (percentage is NaN when nothing was tested)
total_ks = len(ks_results_df)
n_sig_ks = (ks_results_df["p_value"] < ALPHA).sum()
pct_sig_ks = n_sig_ks / total_ks * 100 if total_ks else float("nan")
print(f"KS test: {n_sig_ks} variables ({pct_sig_ks:.2f}%) have p < {ALPHA}")

# Chi-square summary (percentage is NaN when nothing was tested)
total_chi2 = len(chi2_results_df)
n_sig_chi2 = (chi2_results_df["p_value"] < ALPHA).sum()
pct_sig_chi2 = n_sig_chi2 / total_chi2 * 100 if total_chi2 else float("nan")
print(f"Chi-square test: {n_sig_chi2} variables ({pct_sig_chi2:.2f}%) have p < {ALPHA}")


# Save results for audit purposes
os.makedirs("LOG", exist_ok=True)  # to_csv does not create missing directories
ks_results_df.to_csv("LOG/log_KS.csv", index=False)
chi2_results_df.to_csv("LOG/log_Chi2.csv", index=False)

KS test: 2 variables (1.60%) have p < 0.05
Chi-square test: 4 variables (3.05%) have p < 0.05


__Collinearity (Moved to later part)__  
step 2 - Run the VIF -> drop the highest -> Repeat (Threshold VIF < 5)  

In [None]:
# numeric_cols = X_train.select_dtypes(include=['number']).columns.tolist()
# X_vif = X_train[numeric_cols].copy()
# vif = pd.DataFrame({
#         'feature': X_vif.columns,
#         'VIF': [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
#     })

In [None]:
# numeric_cols = X_train.select_dtypes(include=['number']).columns.tolist()
# X_vif = X_train[numeric_cols].dropna()

# # initialize
# vif_summary = pd.DataFrame({'feature': X_vif.columns})

# for iteration in range(1, 4):
#     # calculate VIF
#     vif_values = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
#     vif = pd.DataFrame({
#         'feature': X_vif.columns,
#         f'VIF_iter{iteration}': vif_values
#     })
    
#     vif_summary = vif_summary.merge(vif, on='feature', how='left')

#     # find max VIF then drop
#     max_idx = vif_values.index(max(vif_values))
#     drop_col = X_vif.columns[max_idx]
#     max_vif = vif_values[max_idx]
    
#     print(f"Iteration {iteration}: Drop {drop_col} (VIF={max_vif:.1f})")
#     X_vif = X_vif.drop(columns=drop_col)
    
# vif_summary.to_csv("RESULTS/VIF_log.csv", index=False)

Iteration 1: Drop P_BIOPRO__LBDSTPSI_Total_Protein_g_L (VIF=434311.7)
Iteration 2: Drop P_CBC__LBXMCVSI_Mean_cell_volume_fL (VIF=157805.5)
Iteration 3: Drop P_CBC__LBXHCT_Hematocrit (VIF=60156.0)


In [None]:
# numeric_cols = X_train.select_dtypes(include=['number']).columns.tolist()
# X_vif = X_train[numeric_cols].dropna()

# # initialize
# vif_summary = pd.DataFrame({'feature': X_vif.columns})

# iteration = 0
# while True:
#     iteration += 1
    
#     # calculate VIF
#     vif_values = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
#     max_vif = max(vif_values)
    
#     vif = pd.DataFrame({
#         'feature': X_vif.columns,
#         f'VIF_iter{iteration}': vif_values
#     })
    
#     vif_summary = vif_summary.merge(vif, on='feature', how='left')
    
#     # break when all VIF < 5
#     if max_vif < 5:
#         break
    
#     # find max VIF then drop
#     max_idx = vif_values.index(max_vif)
#     drop_col = X_vif.columns[max_idx]
    
#     print(f"[{iteration}] Drop {drop_col}: VIF={max_vif:.1f}")
#     X_vif = X_vif.drop(columns=drop_col)

# vif_summary.to_csv("LOG/VIF_log.csv", index=False)
# print(f"VIF run finished with {len(X_vif.columns)} features remaining")

[1] Drop P_BIOPRO__LBDSTPSI_Total_Protein_g_L: VIF=434311.7
[2] Drop P_CBC__LBXMCVSI_Mean_cell_volume_fL: VIF=157805.5
[3] Drop P_CBC__LBXHCT_Hematocrit: VIF=60156.0
[4] Drop P_FETIB__LBDTIBSI_Tot_Iron_Binding_Capacity_TIBC_umol_L: VIF=38903.5
[5] Drop P_BIOPRO__LBXSNASI_Sodium_mmol_L: VIF=37253.5
[6] Drop P_CBC__LBXWBCSI_White_blood_cell_count_1000_cells_uL: VIF=15441.5
[7] Drop P_BIOPRO__LBXSOSSI_Osmolality_mmol_Kg: VIF=7896.9
[8] Drop P_CBC__LBXMCHSI_Mean_cell_hemoglobin_pg: VIF=7475.7
[9] Drop P_BMX__BMXHT_Standing_Height_cm: VIF=7077.2
[10] Drop P_CBC__LBXNEPCT_Segmented_neutrophils_percent: VIF=3510.4
[11] Drop P_CBC__LBXMC_Mean_Cell_Hgb_Conc_g_dL: VIF=2056.2
[12] Drop P_BIOPRO__LBDSCHSI_Cholesterol_refrigerated_serum_mmol_L: VIF=1706.8
[13] Drop P_WHQ__WHD010_Current_self_reported_height_inches: VIF=1688.8
[14] Drop P_BIOPRO__LBDSCASI_Total_Calcium_mmol_L: VIF=962.8
[15] Drop P_BIOPRO__LBXSCLSI_Chloride_mmol_L: VIF=883.2
[16] Drop P_FETIB__LBDIRNSI_Iron_frozen_Serum_umol_L: VIF=

In [4]:
# Leakage screen, part 1: Pearson correlation of every numeric predictor with
# the target. A predictor that correlates very strongly with the target is a
# candidate for leakage and is listed for manual review.
LEAKAGE_CORR_THRESHOLD = 0.6  # |r| above this marks a predictor as suspect

# y_train is a one-column frame, shape (n, 1); pearsonr needs a 1-D array.
target = y_train.iloc[:, 0].astype(float).to_numpy()

rows = []
for col in num_cols:
    x = X_train[col].astype(float).to_numpy()

    # pearsonr has no missing-value handling: depending on the SciPy version a
    # NaN in the input either raises or produces r = NaN, and a NaN correlation
    # would silently fail the threshold test below. Use only rows where both
    # the predictor and the target are present.
    valid = ~(np.isnan(x) | np.isnan(target))
    if valid.sum() < 2:
        # pearsonr needs at least two paired observations.
        r, p = np.nan, np.nan
    else:
        r, p = pearsonr(x[valid], target[valid])
    rows.append((col, r, p))

df = pd.DataFrame(rows, columns=["variable", "correlation", "p_value"])
os.makedirs("LOG", exist_ok=True)  # to_csv does not create missing directories
df.to_csv("LOG/leakage_pvalue.csv")  # index column kept so the file layout is unchanged

# Surface anything that could not be scored instead of letting it pass silently.
unevaluated = df.loc[df["correlation"].isna(), "variable"].tolist()
if unevaluated:
    print(f"not evaluated (constant or too few paired values): {unevaluated}")

suspect = df[df["correlation"].abs() > LEAKAGE_CORR_THRESHOLD]
print("suspect list:")
print(suspect)


suspect list:
Empty DataFrame
Columns: [variable, correlation, p_value]
Index: []


In [5]:
# Leakage screen, part 2: fit a random forest with default hyper-parameters and
# list the ten features it relies on most, for manual inspection.

# Dummy-code the categorical columns; the first level of each is dropped.
X_encoded = pd.get_dummies(X_train, drop_first=True)

# y_train is a one-column frame; the regressor is fitted on that single column.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)
rf.fit(X_encoded, y_train.iloc[:, 0])

# Pair each importance score with its encoded column name, then rank.
importances = pd.Series(data=rf.feature_importances_, index=X_encoded.columns)
ranked_importances = importances.sort_values(ascending=False)
top10 = ranked_importances.iloc[:10]
print(top10)
top10.to_csv("LOG/rf_leakage_test.csv")

Biguanide                                            0.177011
Insulin                                              0.175790
P_DEMO__RIDAGEYR_Age_in_years_at_screening           0.048060
Sulfonylurea                                         0.024929
P_BIOPRO__LBXSAPSI_Alkaline_Phosphatase_ALP_IU_L     0.020622
P_BIOPRO__LBXSCLSI_Chloride_mmol_L                   0.015296
P_LUX__LUXCAPM_Median_CAP_decibels_per_meter_dB_m    0.014529
P_BIOPRO__LBXSOSSI_Osmolality_mmol_Kg                0.013685
P_TST__LBXSHBG_SHBG_nmol_L                           0.013501
P_CBC__LBXRDW_Red_cell_distribution_width            0.012215
dtype: float64
