In [8]:
import torch
from torch.utils.data import DataLoader
import pandas as pd
from dataset_transforms import PositionalRandomReplaceVector,ClinicalDataset
from transformer_filling_model import TransformerFillingModel
from evaluate import evaluate
from train_model import train_model
from run_test import run_test

In [2]:
study_group = [2,3]
condition_cols = ['healthy']
df_subset=pd.read_csv("dataset_progression_analysis_"+str(study_group[0])+"_"+str(study_group[1])+".csv",index_col=0)
print(df_subset.columns)
train_df = df_subset[df_subset["recommended_split"]=="train"]
val_df = df_subset[df_subset["recommended_split"]=="val"]
test_df = df_subset[df_subset["recommended_split"]=="test"]

train_df.drop(["recommended_split",'participant_id',
       'study_group_id'],axis=1, inplace=True)
val_df.drop(["recommended_split",'participant_id',
       'study_group_id'],axis=1,inplace=True)
test_df.drop(["recommended_split",'participant_id',
       'study_group_id'],axis=1,inplace=True)
print(train_df.shape)


target_cols = condition_cols
print(target_cols)


# start with zeros, then set 0.5 for the selected columns
#p_vec = 0.5*torch.ones(len(train_df.columns), dtype=torch.float32) # For 1/2
#p_vec = 0.3*torch.ones(len(train_df.columns), dtype=torch.float32)  # For 2/3


feats_to_keep = train_df.columns[:-1]

#Default, drop everything
p_vec = torch.ones(len(train_df.columns), dtype=torch.float32)  # default = 0

# set feats_to_keep → 0.3
mask_keep = torch.tensor(train_df.columns.isin(feats_to_keep), dtype=torch.bool)
p_vec[mask_keep] = 0.5

# set target cols → 1.0
mask_target = torch.tensor(train_df.columns.isin(target_cols), dtype=torch.bool)
p_vec[mask_target] = 1.0

print(p_vec)
# Option 1: Train one model that can handle any number of missing features
drop_transform=PositionalRandomReplaceVector(p_vec, value=16)


# Test

p_vec = torch.ones(len(train_df.columns), dtype=torch.float32)

mask_keep = torch.tensor(train_df.columns.isin(feats_to_keep), dtype=torch.bool)
p_vec[mask_keep] = 0
mask = train_df.columns.isin(target_cols)
p_vec[torch.from_numpy(mask)] = 1
print(p_vec)
drop_transform_test=PositionalRandomReplaceVector(p_vec, value=16)

train_dataset = ClinicalDataset(train_df, drop_transform=drop_transform)
val_dataset = ClinicalDataset(val_df, drop_transform=drop_transform_test)
test_dataset = ClinicalDataset(test_df, drop_transform=drop_transform_test)
print(len(train_dataset), len(val_dataset), len(test_dataset))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Index(['Creatinine (mg/dL)', 'Troponin-T (ng/L)', 'mean_glucose',
       'hyper_time_pcts', 'nocturnal_hypoglycemias', 'BUN (mg/dL)',
       'mhoccur_rnl, Kidney problems',
       'mlcsodlog, OD: Log Contrast Sensitivity',
       'mlcsodfcl, OD: Value of final correct letter', 'CRP - HS (mg/L)',
       'plcsoslog, OS: Log Contrast Sensitivity',
       'via3, Because of your eyesight, how much difficul',
       'plcsosfcl, OS: Value of final correct letter', 'Albumin (g/dL)',
       'HbA1c (%)', 'via1, How much difficulty, if any, do you have in',
       'participant_id', 'study_group_id', 'recommended_split', 'healthy'],
      dtype='object')
(273, 17)
['healthy']
tensor([0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
        0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 1.0000])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])
273 71 79


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(["recommended_split",'participant_id',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df.drop(["recommended_split",'participant_id',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(["recommended_split",'participant_id',


In [3]:
original, masked = train_dataset[0]
print("Original:", original, len(original))
print("Masked:", masked)

Original: tensor([ 3.,  4., 11., 14., 12.,  5.,  0.,  8.,  7.,  0., 12.,  0., 12.,  9.,
        11.,  0.,  0.]) 17
Masked: tensor([ 3., 16., 16., 16., 12.,  5., 16., 16., 16.,  0., 12.,  0., 12.,  9.,
        16.,  0., 16.])


In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
num_feats = train_df.shape[1]
embed_dim = 256
target_idx=16
print(train_df.columns[target_idx])

model = TransformerFillingModel(num_feats=num_feats, embed_dim=embed_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
model = train_model(
    model=model,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    target_idx=target_idx,
    num_epochs=30)

healthy
✅ Saved new best model (AUC improved)
Epoch 01 | Train Loss: 1.0288 | Val Loss: 0.7928 | Val Acc: 0.6197 | Val AUC: 0.5758 | Val BalAcc: 0.5000 | Test Loss: 0.8921 | Test Acc: 0.5190 | Test AUC: 0.6053 | Test BalAcc: 0.5000
Test Confusion Matrix:
 [[41  0]
 [38  0]]
------------------------------------------------------------------------------------------
✅ Saved new best model (AUC improved)
Epoch 02 | Train Loss: 0.5380 | Val Loss: 0.9031 | Val Acc: 0.6197 | Val AUC: 0.6002 | Val BalAcc: 0.5000 | Test Loss: 1.0651 | Test Acc: 0.5190 | Test AUC: 0.6739 | Test BalAcc: 0.5000
Test Confusion Matrix:
 [[41  0]
 [38  0]]
------------------------------------------------------------------------------------------
✅ Saved new best model (AUC improved)
Epoch 03 | Train Loss: 0.5219 | Val Loss: 0.7448 | Val Acc: 0.6197 | Val AUC: 0.6069 | Val BalAcc: 0.5000 | Test Loss: 0.8435 | Test Acc: 0.5190 | Test AUC: 0.7009 | Test BalAcc: 0.5000
Test Confusion Matrix:
 [[41  0]
 [38  0]]
---------

In [5]:

model.eval()
model.load_state_dict(torch.load("best_infill_model_B.pth"))
feats_to_keep = [0] * 7


if study_group == [2,3]:
    print("Study group 2,3")
    # Blood test only features
    feats_to_keep[0] = ['HbA1c (%)']
    feats_to_keep[1] = ['HbA1c (%)']
    
    
    feats_to_keep[2] = ['HbA1c (%)',  'Creatinine (mg/dL)', 'Troponin-T (ng/L)', 'BUN (mg/dL)','Albumin (g/dL)'
           'CRP - HS (mg/L)']
    
    # CGM 
    feats_to_keep[3] = [ 
             'hyper_time_pcts',
           'mean_glucose', 'nocturnal_hypoglycemias']
    
    # CGM + non blood test features
    feats_to_keep[4] = [ 
               'hyper_time_pcts',
           'mean_glucose', 'nocturnal_hypoglycemias',
            'mhoccur_rnl, Kidney problems',
           'via3, Because of your eyesight, how much difficul',
           'mlcsodlog, OD: Log Contrast Sensitivity',
           'mlcsodfcl, OD: Value of final correct letter',
           'plcsosfcl, OS: Value of final correct letter',
           'via1, How much difficulty, if any, do you have in', 
           'plcsoslog, OS: Log Contrast Sensitivity', ]
    
    feats_to_keep[5] = feats_to_keep[4]

if study_group ==[0,1]:
    print("Study group 0,1")
    feats_to_keep[0] = ['HbA1c (%)']
    feats_to_keep[1] = ['HbA1c (%)','Glucose (mg/dL)','INSULIN (ng/mL)']
    feats_to_keep[2] = ['HbA1c (%)','Glucose (mg/dL)','INSULIN (ng/mL)']
    feats_to_keep[3] = [ 
             'hyper_time_pcts', 'spike_resolutions',
           'mean_glucose', 'nocturnal_hypoglycemias']
    feats_to_keep[4] = [ 
             'hyper_time_pcts', 'spike_resolutions',
            'mean_glucose', 'nocturnal_hypoglycemias',
            'mhoccur_obs, Obesity', 
            'bmi_vsorres, BMI', 
            'waist_vsorres, Waist Circumference (cm)']
    feats_to_keep[5] = [ 
             'hyper_time_pcts', 'spike_resolutions',
             'mean_glucose', 'nocturnal_hypoglycemias',
             'mhoccur_rnl, Kidney problems',
             'mhoccur_hbp, High blood pressure', 'mhoccur_obs, Obesity', 'dmlsugar, When reflecting on your typical eating ',
             'bmi_vsorres, BMI', 'dmlact, What would you consider your typical acti', 
             'waist_vsorres, Waist Circumference (cm)', 'pxne7, The crime rate in my neighborhood makes it',
             'fh_dm2pt, Were either of your parents diagnosed w']
if study_group ==[1,2]:
    print("Study group 1,2")
    feats_to_keep[0] = ['HbA1c (%)']
    feats_to_keep[1] = ['HbA1c (%)','Glucose (mg/dL)']
    # Blood test only features
    feats_to_keep[2] = ['HbA1c (%)','Glucose (mg/dL)', 
     'LDL Cholesterol Calculation (mg/dL)',
     'HDL Cholesterol (mg/dL)',
     'Total Cholesterol (mg/dL)']
    # CGM 
    feats_to_keep[3] = [ 
             'hyper_time_pcts', 'spike_resolutions',
           'mean_glucose',  'expected_daily_spikes', 'nocturnal_hypoglycemias']
    # CGM + basic biometric features
    feats_to_keep[4] = [ 
             'hyper_time_pcts', 'spike_resolutions',
           'mean_glucose',  'expected_daily_spikes',
           'nocturnal_hypoglycemias',
           'mhoccur_hbp, High blood pressure',
           'whr_vsorres, Waist to Hip Ratio (WHR)',
           'waist_vsorres, Waist Circumference (cm)', 
           'pulse_vsorres, Heart Rate (bpm)']
    # CGM + non blood test features
    feats_to_keep[5] = [ 
           'hyper_time_pcts', 'spike_resolutions',
           'mean_glucose',  'expected_daily_spikes',
           'nocturnal_hypoglycemias',
           'mhoccur_hbp, High blood pressure',
           'whr_vsorres, Waist to Hip Ratio (WHR)',
           'waist_vsorres, Waist Circumference (cm)', 
           'pulse_vsorres, Heart Rate (bpm)',
           'fh_dm2pt, Were either of your parents diagnosed w',
           'via2, How much difficulty, if any, do you have re',
     ]

feats_to_keep[6] = train_df.columns[:-1]
result_acc, result_ba, result_auc = run_test(model, feats_to_keep, test_df, target_idx, target_cols)


Study group 2,3
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.])
Test Acc: 0.5190 |Test Auc: 0.5565 
Test cm [[41  0]
 [38  0]]
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.])
Test Acc: 0.5190 |Test Auc: 0.5565 
Test cm [[41  0]
 [38  0]]
tensor([0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.])
Test Acc: 0.5063 |Test Auc: 0.6540 
Test cm [[38  3]
 [36  2]]
tensor([1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
Test Acc: 0.5190 |Test Auc: 0.5318 
Test cm [[41  0]
 [38  0]]
tensor([1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1.])
Test Acc: 0.5570 |Test Auc: 0.7375 
Test cm [[41  0]
 [35  3]]
tensor([1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1.])
Test Acc: 0.5570 |Test Auc: 0.7375 
Test cm [[41  0]
 [35  3]]
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])
Test Acc: 0.6962 |Test Auc: 0.7927 
Test cm [[40  1]
 [23 15]]


In [6]:
# Create dataframe
result_df = pd.DataFrame({'feats':feats_to_keep,'ba':result_ba, 'acc':result_acc, 'auc':result_auc})
result_df.head(7)

Unnamed: 0,feats,ba,acc,auc
0,[HbA1c (%)],0.5,0.518987,0.556483
1,[HbA1c (%)],0.5,0.518987,0.556483
2,"[HbA1c (%), Creatinine (mg/dL), Troponin-T (ng...",0.48973,0.506329,0.654044
3,"[hyper_time_pcts, mean_glucose, nocturnal_hypo...",0.5,0.518987,0.531772
4,"[hyper_time_pcts, mean_glucose, nocturnal_hypo...",0.539474,0.556962,0.737484
5,"[hyper_time_pcts, mean_glucose, nocturnal_hypo...",0.539474,0.556962,0.737484
6,"Index(['Creatinine (mg/dL)', 'Troponin-T (ng/L...",0.685173,0.696203,0.792683


In [7]:
import os
os.makedirs("results", exist_ok=True)
result_df.to_csv('results\\study_'+str(study_group[0])+"_"+str(study_group[1])+'_results_one_model_5.csv', index=False)