# IR WFM

# Library requirements

The libraries used in this colab are standard for [colab.research.google.com](https://colab.research.google.com)

In case of import failures, manually install the failing packages
by running `!pip install <package>` in a new cell.

# Run required imports

In [1]:
import datetime

from collections import defaultdict
import numpy as np
import os
import pandas as pd
import pathlib
import pickle

import seaborn as sns
import shap

from sklearn.decomposition import PCA
from sklearn import metrics, model_selection
from sklearn.manifold import TSNE
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import r2_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

import seaborn as sns
import pandas as pd
import numpy as np
import os

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm

import xgboost

In [2]:
# Colab specifics
from google.colab import files
from google.colab import sheets

import warnings
warnings.filterwarnings('ignore')

# Define paths & upload data

File uploads and algorithm outputs can be explored under the "Files" tab in the menu on the left.

In [5]:
INPUTS_FOLDER = '/content/input_data'
OUTPUTS_FOLDER = '/content/output_data'

# Input file names
subject_split_file_name = 'subject_split.csv'
data_splits_file_name = 'participant_ids_5_folds.pkl'
labels_and_scores_file_name = 'labels_and_scores.pkl'

saf_subjects_file_name = 'saf_ir_subjects.parquet'
saf_pin_mapping_file_name = 'saf_pin_mapping.parquet'

# Input folder names
embeddings_dir_name = 'embeddings'
saf_embeddings_dir_name = 'saf_embeddings'

# Output file names
tree_search_output_embeddings_file = 'lsm_embeddings_for_tree_search.pkl'

# Create paths
path_list = [INPUTS_FOLDER, OUTPUTS_FOLDER]

for path in path_list:
  if not os.path.exists(path):
    os.makedirs(path)

In [4]:
# @title Upload subject split data, labels, and scores
# @markdown The following file names are expected:
# @markdown * subject_split.csv
# @markdown * participant_ids_5_folds.pkl
# @markdown * labels_and_scores.pkl
# @markdown * saf_ir_subjects.parquet
# @markdown * saf_pin_mapping.parquet

uploaded = files.upload(target_dir=INPUTS_FOLDER)

paths_count = 0

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes to folder {folder}'.format(
      name=fn, length=len(uploaded[fn]), folder=INPUTS_FOLDER))

  file_name = os.path.basename(fn)
  if file_name == subject_split_file_name:
    subject_split_path = path
    paths_count +=1
  elif file_name == data_splits_file_name:
    data_splits_path = path
    paths_count +=1
  elif file_name == labels_and_scores_file_name:
    labels_and_scores_path = path
    paths_count +=1
  elif file_name == saf_subjects_file_name:
    saf_subjects_path = path
    paths_count +=1
  elif file_name == saf_pin_mapping_file_name:
    saf_pin_path = path
    paths_count +=1

if len(paths_count) == 5:
  print('Using subject split file: ', subject_split_path)
  print('Using data split file: ', data_splits_path)
  print('Using labels and scores file: ', labels_and_scores_path)
  print('Using SAF subject labels file: ', saf_subjects_path)
  print('Using SAF PIN mapping file: ', saf_pin_path)

else:
  print('You should upload 5 files at this stage, adn the names should match the spec.')

Only one file supported, try again.


In [None]:
# @title Upload embeddings

embeddings_dir = os.path.join(INPUTS_FOLDER, embeddings_dir_name)

if not os.path.exists(embeddings_dir):
  os.mkdir(embeddings_dir)

embeddings_uploaded = files.upload(target_dir=embeddings_dir)

for fn in embeddings_uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes to folder {folder}'.format(
      name=fn, length=len(embeddings_uploaded[fn]), folder=embeddings_dir))

embedding_paths = embeddings_uploaded.keys()

if embedding_paths > 0:
  print('Using files: ', embedding_paths)
else:
  print('You should upload 2 files at this stage.')


In [None]:
# @title Upload SAF embeddings

saf_embeddings_dir = os.path.join(INPUTS_FOLDER, saf_embeddings_dir_name)

if not os.path.exists(saf_embeddings_dir):
  os.mkdir(saf_embeddings_dir)

saf_embeddings_uploaded = files.upload(target_dir=saf_embeddings_dir)

for fn in saf_embeddings_uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes to folder {folder}'.format(
      name=fn, length=len(saf_embeddings_uploaded[fn]), folder=saf_embeddings_dir))

saf_embedding_paths = saf_embeddings_uploaded.keys()

if saf_embedding_paths > 0:
  print('Using files: ', saf_embedding_paths)
else:
  print('You should upload 2 files at this stage.')

In [None]:
# @title Session storage upload
# @markdown Load files from given paths. Paths are local to the session storage.

# @markdown Individual files
subject_split_path = 'input_data/subject_split.csv' # @param {type:"string"}
print('Using subject split file: ', subject_split_path)

data_splits_path = 'input_data/participant_ids_5_folds.pkl' # @param {type:"string"}
print('Using data split file: ', data_splits_path)

labels_and_scores_path = 'input_data/labels_and_scores.pkl' # @param {type:"string"}
print('Using labels and scores file: ', labels_and_scores_path)

saf_subjects_path = 'input_data/saf_ir_subjects.parquet' # @param {type:"string"}
print('Using SAF subject labels file: ', saf_subjects_path)

saf_pin_path = 'input_data/saf_pin_mapping.parquet'  # @param {type:"string"}
print('Using SAF PIN mapping file: ', saf_pin_path)

# @markdown Directories
embeddings_dir = 'input_data/embeddings/' # @param {type:"string"}
print('Using embeddings dir: ', embeddings_dir)

saf_embeddings_dir = 'input_data/saf_embeddings/' # @param {type:"string"}
print('Using SAF embeddings dir: ', saf_embeddings_dir)

In [None]:
# @title Load data

# Load subject split
with open(subject_split_path, "rb") as infile:
  training_data_aggregated_all_days = pd.read_csv(infile)

# Load data splits
with open(data_splits_path, 'rb') as infile:
  splits = pd.read_pickle(infile)

# Load labels and scores
with open(labels_and_scores_path, 'rb') as infile:
  labels_and_scores_df = pd.read_pickle(infile)

# Load embeddings
embeddings_filepaths= os.listdir(embeddings_dir)

embeddings_list_lsmv2 = []
labels_list_lsmv2 = []
mask_list_lsmv2 = []
id_list_lsmv2 = []
for filepath in embeddings_filepaths:
  with open(os.path.join(embeddings_dir, filepath), 'rb') as infile:
    d = pd.read_pickle(infile)
    embeddings_list_lsmv2.append(d['embedding_pre_logits'].reshape(-1, d['embedding_pre_logits'].shape[-1]))
    labels_list_lsmv2.append(np.argmax(d['label'].reshape(-1, d['label'].shape[-1]), axis=1))
    tmp = np.squeeze(d['imputation_mask'], axis=-1)
    id_list_lsmv2.append(d['subject_id'].flatten())
    x, y = tmp.shape[:2]
    mask_list_lsmv2.append(tmp.reshape(x * y, tmp.shape[2], tmp.shape[3]))

# SAF Labels:
with open(saf_subjects_path, "rb") as infile:
  saf_last_visit_df = pd.read_parquet(infile)

# SAF Embeddings:
saf_embeddings_file_names = os.listdir(saf_embeddings_dir)

embeddings_list_lsmv2c = []
labels_list_lsmv2c = []
mask_list_lsmv2c = []
id_list_lsmv2c = []

for file_name in saf_embeddings_file_names:
  with open(os.path.join(saf_embeddings_dir, file_name), 'rb') as infile:
    d = pd.read_pickle(infile)
    embeddings_list_lsmv2c.append(d['embedding_pre_logits'].reshape(-1, d['embedding_pre_logits'].shape[-1]))
    labels_list_lsmv2c.append(np.argmax(d['label'].reshape(-1, d['label'].shape[-1]), axis=1))#d['label'].reshape(-1, d['label'].shape[-1]))
    tmp = np.squeeze(d['imputation_mask'], axis=-1)
    str_fpath = d['str_fpath']

    tmp_ids = []
    for i in d['str_fpath'].flatten():
      try:
        tmp_ids.append(i.decode('utf-8').split('/')[-1].split('_')[1].split('.')[0])
      except:
        tmp_ids.append('')

    id_list_lsmv2c.append(tmp_ids)
    x, y = tmp.shape[:2]
    mask_list_lsmv2c.append(tmp.reshape(x * y, tmp.shape[2], tmp.shape[3]))

grouped_arrays = defaultdict(list)
for id, arr in zip(list(np.concat(id_list_lsmv2c)), list(np.concat(embeddings_list_lsmv2c))):
    grouped_arrays[id].append(arr)

with open(saf_pin_mapping_file_name, "rb") as infile:
  pin_mapping = pd.read_parquet(infile)

# Helper classes and functions

In [None]:
def preprocess_data(X_train, X_test, scaler_name="standard"):
    """
    Preprocesses and splits a Pandas DataFrame for neural network training,
    returning the scaler object for inverse transformation.

    Args:
        df: The input Pandas DataFrame.
        target_cols: A list of column names representing the target variables (optional).
                     If not provided, the entire DataFrame is considered for normalization.

    Returns:
        A tuple containing:
            - Train and test DataFrames, potentially with normalized values and split into
              features (X) and targets (y) if target_cols are provided.
            - The fitted StandardScaler object for inverse transformation.
    """
    if scaler_name == "minmax":
      scaler = MinMaxScaler()
    elif scaler_name == "standard":
      scaler = StandardScaler()
    else:
      raise ValueError(f"Unknown scaler name: {scaler_name}")

    x_train_normalized = scaler.fit_transform(X_train)
    x_test_normalized = scaler.transform(X_test)

    return x_train_normalized, x_test_normalized

# Five Fold

In [None]:
# @title Pre-processing and data validation

labels_and_scores_df['sex'] = labels_and_scores_df['sex'].map({'Male': 1, 'Female': 2})

grouped_arrays = defaultdict(list)
for id, arr in zip(list(np.concat(id_list_lsmv2)), list(np.concat(embeddings_list_lsmv2))):
    grouped_arrays[id].append(arr)

embeddings_list_lsmv2_mean = []
id_list_lsmv2_mean = []
labels_lsmv2_mean = []
for key, arrays in grouped_arrays.items():
  try:
    age = labels_and_scores_df[labels_and_scores_df['participant_id']==key].age.values[0]
    gender = labels_and_scores_df[labels_and_scores_df['participant_id']==key].sex.values[0]
    bmi = labels_and_scores_df[labels_and_scores_df['participant_id']==key].bmi.values[0]

    glucose = labels_and_scores_df[labels_and_scores_df['participant_id']==key].glucose.values[0]
    total_cholesterol = labels_and_scores_df[labels_and_scores_df['participant_id']==key]['total cholesterol'].values[0]
    hdl = labels_and_scores_df[labels_and_scores_df['participant_id']==key].hdl.values[0]
    triglycerides = labels_and_scores_df[labels_and_scores_df['participant_id']==key].triglycerides.values[0]
    ldl = labels_and_scores_df[labels_and_scores_df['participant_id']==key].ldl.values[0]
    chol_hdl = labels_and_scores_df[labels_and_scores_df['participant_id']==key]['chol/hdl'].values[0]
    non_hdl = labels_and_scores_df[labels_and_scores_df['participant_id']==key]['non hdl'].values[0]
    bun = labels_and_scores_df[labels_and_scores_df['participant_id']==key].bun.values[0]
    creatinine = labels_and_scores_df[labels_and_scores_df['participant_id']==key].creatinine.values[0]
    egfr = labels_and_scores_df[labels_and_scores_df['participant_id']==key].egfr.values[0]
    sodium = labels_and_scores_df[labels_and_scores_df['participant_id']==key].sodium.values[0]
    potassium = labels_and_scores_df[labels_and_scores_df['participant_id']==key].potassium.values[0]
    chloride = labels_and_scores_df[labels_and_scores_df['participant_id']==key].chloride.values[0]
    co2 = labels_and_scores_df[labels_and_scores_df['participant_id']==key].co2.values[0]
    calcium = labels_and_scores_df[labels_and_scores_df['participant_id']==key].calcium.values[0]
    total_protein = labels_and_scores_df[labels_and_scores_df['participant_id']==key]['total protein'].values[0]
    albumin = labels_and_scores_df[labels_and_scores_df['participant_id']==key].albumin.values[0]
    globulin = labels_and_scores_df[labels_and_scores_df['participant_id']==key].globulin.values[0]
    albumin_globulin = labels_and_scores_df[labels_and_scores_df['participant_id']==key]['albumin/globulin'].values[0]
    total_bilirubin = labels_and_scores_df[labels_and_scores_df['participant_id']==key]['total bilirubin'].values[0]
    alp = labels_and_scores_df[labels_and_scores_df['participant_id']==key].alp.values[0]
    ast = labels_and_scores_df[labels_and_scores_df['participant_id']==key].ast.values[0]
    alt = labels_and_scores_df[labels_and_scores_df['participant_id']==key].alt.values[0]

    additional_features = [age,
                           gender,
                           bmi,
                           glucose,
                           total_cholesterol,
                           hdl,
                           triglycerides,
                           ldl,
                           chol_hdl,
                           non_hdl,
                           bun,
                           creatinine,
                           egfr,
                           sodium,
                           potassium,
                           chloride,
                           co2,
                           calcium,
                           total_protein,
                           albumin,
                           globulin,
                           albumin_globulin,
                           total_bilirubin,
                           alp,
                           ast,
                           alt]

    embeddings_list_lsmv2_mean.append(np.append(np.nanmedian(np.array(arrays), axis=0),additional_features))
    id_list_lsmv2_mean.append(key)
  except:
    print('Missing data for key: ', key)

In [None]:
#@title Write out embeddings

EXP_11 = {'name': "EXP_11",
'description': "Fitbit + Demographics + Lipid Panel + Metabolic Panel",
'features': [0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25],
'label': "homa_ir"}

results = []
data = []
for j in ['homa_ir']:
  for i in ['_lsmv2']:
    for experiment_name in [EXP_11]:

      random_seeds=[0, 92, 1, 2024, 12121]

      if experiment_name['description'] == 'Demographics':
        feature_indexes = np.array(experiment_name['features'])+384
      else:
        feature_indexes = np.append(np.array(range(0,384,1)),np.array(experiment_name['features'])+384)
      print('Experiment Name: ', experiment_name)

      test_results = []
      test_labels = []

      for s, split in enumerate(splits.keys()):
        np.random.seed(random_seeds[s])
        print(split, len(splits[split]['training']))
        print(split, len(splits[split]['testing']))
        train_ids = splits[split]['training']
        test_ids = splits[split]['testing']

        # Features:
        embedding_arr = np.array(embeddings_list_lsmv2_mean)
        embedding_arr = embedding_arr[:,feature_indexes.astype(int)]
        label_df = pd.DataFrame()
        label_df['participant_id'] = np.array(id_list_lsmv2_mean)
        label_df['participant_id'] = label_df['participant_id'].astype(int)

        label_df = pd.merge(label_df, labels_and_scores_df, on='participant_id', how='left')
        label_df = label_df[['participant_id', 'age', 'bmi','homa_ir']]

        label_df_no_nan = label_df.dropna(subset=['participant_id'])
        label_df_no_nan = label_df_no_nan[label_df['participant_id'] > 0]
        label_df_no_nan = label_df_no_nan[label_df['bmi'] >= 12]
        label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]
        if j == 'homa_ir':
          label_df_no_nan = label_df_no_nan[label_df_no_nan['homa_ir'] >= 0]
          label_df_no_nan = label_df_no_nan[label_df_no_nan['homa_ir'] <= 15]

        daniels_subset_lsm_metabolic = label_df_no_nan

        Xtrain = embedding_arr[label_df_no_nan[label_df_no_nan['participant_id'].isin(train_ids)].index, :]
        ytrain = label_df_no_nan[label_df_no_nan['participant_id'].isin(train_ids)][j].values
        Xtest = embedding_arr[label_df_no_nan[label_df_no_nan['participant_id'].isin(test_ids)].index, :]
        ytest = label_df_no_nan[label_df_no_nan['participant_id'].isin(test_ids)][j].values

        tmp = {}
        tmp['Xtrain'] = Xtrain
        tmp['ytrain'] = ytrain
        tmp['Xtest'] = Xtest
        tmp['ytest'] = ytest
        data.append(tmp)

with open(os.path.join(OUTPUTS_FOLDER, tree_search_output_embeddings_file), 'wb') as outfile:
  outfile.write(pickle.dumps(data))


Experiment Name:  {'name': 'EXP_11', 'description': 'Fitbit + Demographics + Lipid Panel + Metabolic Panel', 'features': [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], 'label': 'homa_ir'}
fold_1_of_5_folds 938
fold_1_of_5_folds 235


Boolean Series key will be reindexed to match DataFrame index.


fold_2_of_5_folds 938
fold_2_of_5_folds 235
fold_3_of_5_folds 938
fold_3_of_5_folds 235


Boolean Series key will be reindexed to match DataFrame index.
Boolean Series key will be reindexed to match DataFrame index.


fold_4_of_5_folds 939
fold_4_of_5_folds 234
fold_5_of_5_folds 939
fold_5_of_5_folds 234


Boolean Series key will be reindexed to match DataFrame index.
Boolean Series key will be reindexed to match DataFrame index.


13760632

In [None]:
#@title Homa IR Grouped Embeddings

EXP_4 = {'name': "EXP_4",
'description': "Demographics",
'features': [0,2],
'label': "homa_ir"}

EXP_0 = {'name': "EXP_0",
'description': "Fitbit",
'features': [],
'label': "homa_ir"}

EXP_1 = {'name': "EXP_1",
'description': "Fitbit + Demographics",
'features': [0,2],
'label': "homa_ir"}

EXP_10 = {'name': "EXP_10",
'description': "Fitbit + Demographics + Fasting Glucose",
'features': [0,2,3],
'label': "homa_ir"}

EXP_11 = {'name': "EXP_11",
'description': "Fitbit + Demographics + Lipid Panel + Metabolic Panel",
'features': [0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25],
'label': "homa_ir"}

EXP_21 = {'name': "EXP_21",
'description': "Fitbit + Demographics + Lipid Panel",
'features': [0,2,4,5,6,7,8,9],
'label': "homa_ir"}

EXP_27 = {'name': "EXP_27",
'description': "Fitbit + Demographics + Lipid Panel + Fasting Glucose",
'features': [0,2,3,4,5,6,7,8,9],
'label': "homa_ir"}


results = []
for j in ['homa_ir']:
  for i in ['_lsmv2']:
    print('LSM Version: ', i)
    for experiment_name in [EXP_4, EXP_0, EXP_1, EXP_21, EXP_10, EXP_11, EXP_27]:

      random_seeds=[0, 92, 1, 2024, 12121]

      if experiment_name['description'] == 'Demographics':
        feature_indexes = np.array(experiment_name['features'])+384
      else:
        feature_indexes = np.append(np.array(range(0,384,1)),np.array(experiment_name['features'])+384)
      print('Experiment Name: ', experiment_name)

      test_results = []
      test_labels = []

      for s, split in enumerate(splits.keys()):
        np.random.seed(random_seeds[s])
        print(split, len(splits[split]['training']))
        print(split, len(splits[split]['testing']))
        train_ids = splits[split]['training']
        test_ids = splits[split]['testing']

        # Features:
        embedding_arr = np.array(embeddings_list_lsmv2_mean)
        embedding_arr = embedding_arr[:,feature_indexes.astype(int)]
        label_df = pd.DataFrame()
        label_df['participant_id'] = np.array(id_list_lsmv2_mean)
        label_df['participant_id'] = label_df['participant_id'].astype(int)

        label_df = pd.merge(label_df, labels_and_scores_df, on='participant_id', how='left')
        label_df = label_df[['participant_id', 'age', 'bmi','homa_ir']]

        label_df_no_nan = label_df.dropna(subset=['participant_id'])
        label_df_no_nan = label_df_no_nan[label_df['participant_id'] > 0]
        label_df_no_nan = label_df_no_nan[label_df['bmi'] >= 12]
        label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]
        if j == 'homa_ir':
          label_df_no_nan = label_df_no_nan[label_df_no_nan['homa_ir'] >= 0]
          label_df_no_nan = label_df_no_nan[label_df_no_nan['homa_ir'] <= 15]

        daniels_subset_lsm_metabolic = label_df_no_nan

        Xtrain = embedding_arr[label_df_no_nan[label_df_no_nan['participant_id'].isin(train_ids)].index, :]
        ytrain = label_df_no_nan[label_df_no_nan['participant_id'].isin(train_ids)][j].values
        Xtest = embedding_arr[label_df_no_nan[label_df_no_nan['participant_id'].isin(test_ids)].index, :]
        ytest = label_df_no_nan[label_df_no_nan['participant_id'].isin(test_ids)][j].values

        print(split, Xtrain.shape[0])
        print(split, Xtest.shape[0])

        Xtrain, Xtest = preprocess_data(Xtrain, Xtest)

        XtrainLSM = np.nan_to_num(Xtrain[:,:384], nan=0)
        XtestLSM = np.nan_to_num(Xtest[:,:384], nan=0)
        if experiment_name['description'] != 'Demographics':
          n_components = 50
          pca = PCA(n_components=n_components)

          # Fit PCA on the scaled training data
          X_train_pca = pca.fit_transform(XtrainLSM)

          # 3. Transform the scaled test data using the fitted PCA model
          X_test_pca = pca.transform(XtestLSM)

          # The transformed data will have 5 columns (the principal components)
          XtrainLSM = pd.DataFrame(data=X_train_pca, columns=[f'PC{i+1}' for i in range(n_components)])
          XtestLSM = pd.DataFrame(data=X_test_pca, columns=[f'PC{i+1}' for i in range(n_components)])

          Xtrain = np.concatenate((XtrainLSM, np.nan_to_num(Xtrain[:,384:], nan=0)), axis=1)
          Xtest = np.concatenate((XtestLSM, np.nan_to_num(Xtest[:,384:], nan=0)), axis=1)

        # Fit Model
        from xgboost import XGBRegressor

        kfold_found_params = {
              'booster': 'gbtree',
              'learning_rate': 0.1,
              'max_depth': 1,
              'n_estimators': 85,#125,
              'reg_alpha': 2,
              'reg_lambda': 2,
              'objective': 'reg:squarederror',
          }

        model = XGBRegressor(**kfold_found_params, random_state=random_seeds[s])
        model.fit(Xtrain, ytrain)
        yd = model.predict(Xtest)

        test_results.append(yd)
        test_labels.append(ytest)

        # Plot
        plotOn = False
        if plotOn:
          plt.rcParams['axes.facecolor'] = 'white'
          fig, axs = plt.subplots(1, 1, figsize=(6, 6))
          axs.scatter(ytest, yd, alpha=0.1)

          axs.set_title('Age Prediction')
          axs.set_xlabel('True Age')
          axs.set_ylabel('Predicted Age')
          axs.spines['bottom'].set_color('#000000')
          axs.spines['left'].set_color('#000000')
          m, b = np.polyfit(ytest, yd, 1)

          if j == 'age':
            y_predicted = m * range(20,80) + b
            axs.plot(range(20,80), y_predicted, color='red', label=f'Line of Best Fit: y = {m:.2f}x + {b:.2f}')
            axs.set_ylim([20, 80])
            axs.set_xlim([20, 80])
          if j == 'bmi':
            y_predicted = m * range(12,65) + b
            axs.plot(range(12,65), y_predicted, color='red', label=f'Line of Best Fit: y = {m:.2f}x + {b:.2f}')
            axs.set_ylim([12, 65])
            axs.set_xlim([12, 65])
          if j == 'homa_ir':
            y_predicted = m * range(0,15) + b
            axs.plot(range(0,15), y_predicted, color='red', label=f'Line of Best Fit: y = {m:.2f}x + {b:.2f}')
            axs.set_ylim([0, 15])
            axs.set_xlim([0, 15])

          print('.   MAE: ', np.mean(np.abs(yd - ytest)))
          print('.   RMSE: ', np.sqrt(np.mean((yd - ytest)**2)))
          print('.   R2: ', r2_score(ytest, yd))
          print('.   MSE: ', mean_squared_error(ytest, yd))

      results_dict = {}
      results_dict['lsm_version'] = i
      results_dict['experiment_name'] = experiment_name
      results_dict['test_labels'] = np.concat(test_labels)
      results_dict['test_labels_array'] = test_labels
      results_dict['test_results'] = np.concat(test_results)
      results_dict['test_results_array'] = test_results
      results.append(results_dict)

      print("================")
      print('R2: ', r2_score(np.concat(test_labels), np.concat(test_results)))
      print("================")

LSM Version:  _lsmv2
Experiment Name:  {'name': 'EXP_4', 'description': 'Demographics', 'features': [0, 2], 'label': 'homa_ir'}
fold_1_of_5_folds 938
fold_1_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_1_of_5_folds 669
fold_1_of_5_folds 170
fold_2_of_5_folds 938
fold_2_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_2_of_5_folds 662
fold_2_of_5_folds 177
fold_3_of_5_folds 938
fold_3_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_3_of_5_folds 664
fold_3_of_5_folds 175
fold_4_of_5_folds 939
fold_4_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_4_of_5_folds 673
fold_4_of_5_folds 166
fold_5_of_5_folds 939
fold_5_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_5_of_5_folds 688
fold_5_of_5_folds 151
R2:  0.17964277048467692
Experiment Name:  {'name': 'EXP_0', 'description': 'Fitbit', 'features': [], 'label': 'homa_ir'}
fold_1_of_5_folds 938
fold_1_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_1_of_5_folds 669
fold_1_of_5_folds 170
fold_2_of_5_folds 938
fold_2_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_2_of_5_folds 662
fold_2_of_5_folds 177
fold_3_of_5_folds 938
fold_3_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_3_of_5_folds 664
fold_3_of_5_folds 175
fold_4_of_5_folds 939
fold_4_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_4_of_5_folds 673
fold_4_of_5_folds 166
fold_5_of_5_folds 939
fold_5_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_5_of_5_folds 688
fold_5_of_5_folds 151
R2:  0.18415490401912749
Experiment Name:  {'name': 'EXP_1', 'description': 'Fitbit + Demographics', 'features': [0, 2], 'label': 'homa_ir'}
fold_1_of_5_folds 938
fold_1_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_1_of_5_folds 669
fold_1_of_5_folds 170
fold_2_of_5_folds 938
fold_2_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_2_of_5_folds 662
fold_2_of_5_folds 177
fold_3_of_5_folds 938
fold_3_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_3_of_5_folds 664
fold_3_of_5_folds 175
fold_4_of_5_folds 939
fold_4_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_4_of_5_folds 673
fold_4_of_5_folds 166
fold_5_of_5_folds 939
fold_5_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_5_of_5_folds 688
fold_5_of_5_folds 151
R2:  0.22681728135371138
Experiment Name:  {'name': 'EXP_21', 'description': 'Fitbit + Demographics + Lipid Panel', 'features': [0, 2, 4, 5, 6, 7, 8, 9], 'label': 'homa_ir'}
fold_1_of_5_folds 938
fold_1_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_1_of_5_folds 669
fold_1_of_5_folds 170
fold_2_of_5_folds 938
fold_2_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_2_of_5_folds 662
fold_2_of_5_folds 177
fold_3_of_5_folds 938
fold_3_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_3_of_5_folds 664
fold_3_of_5_folds 175
fold_4_of_5_folds 939
fold_4_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_4_of_5_folds 673
fold_4_of_5_folds 166
fold_5_of_5_folds 939
fold_5_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_5_of_5_folds 688
fold_5_of_5_folds 151
R2:  0.29333470281850627
Experiment Name:  {'name': 'EXP_10', 'description': 'Fitbit + Demographics + Fasting Glucose', 'features': [0, 2, 3], 'label': 'homa_ir'}
fold_1_of_5_folds 938
fold_1_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_1_of_5_folds 669
fold_1_of_5_folds 170
fold_2_of_5_folds 938
fold_2_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_2_of_5_folds 662
fold_2_of_5_folds 177
fold_3_of_5_folds 938
fold_3_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_3_of_5_folds 664
fold_3_of_5_folds 175
fold_4_of_5_folds 939
fold_4_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_4_of_5_folds 673
fold_4_of_5_folds 166
fold_5_of_5_folds 939
fold_5_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_5_of_5_folds 688
fold_5_of_5_folds 151
R2:  0.4669597696782657
Experiment Name:  {'name': 'EXP_11', 'description': 'Fitbit + Demographics + Lipid Panel + Metabolic Panel', 'features': [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], 'label': 'homa_ir'}
fold_1_of_5_folds 938
fold_1_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_1_of_5_folds 669
fold_1_of_5_folds 170
fold_2_of_5_folds 938
fold_2_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_2_of_5_folds 662
fold_2_of_5_folds 177
fold_3_of_5_folds 938
fold_3_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_3_of_5_folds 664
fold_3_of_5_folds 175
fold_4_of_5_folds 939
fold_4_of_5_folds 234
fold_4_of_5_folds 673
fold_4_of_5_folds 166


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_5_of_5_folds 939
fold_5_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_5_of_5_folds 688
fold_5_of_5_folds 151
R2:  0.4923179854309967
Experiment Name:  {'name': 'EXP_27', 'description': 'Fitbit + Demographics + Lipid Panel + Fasting Glucose', 'features': [0, 2, 3, 4, 5, 6, 7, 8, 9], 'label': 'homa_ir'}
fold_1_of_5_folds 938
fold_1_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_1_of_5_folds 669
fold_1_of_5_folds 170
fold_2_of_5_folds 938
fold_2_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_2_of_5_folds 662
fold_2_of_5_folds 177
fold_3_of_5_folds 938
fold_3_of_5_folds 235


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_3_of_5_folds 664
fold_3_of_5_folds 175
fold_4_of_5_folds 939
fold_4_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_4_of_5_folds 673
fold_4_of_5_folds 166
fold_5_of_5_folds 939
fold_5_of_5_folds 234


  label_df_no_nan = label_df_no_nan[label_df['bmi'] <= 65]


fold_5_of_5_folds 688
fold_5_of_5_folds 151
R2:  0.4926616362463003


In [None]:
rows = []
for r in results:
  r2_splits = []
  tpr_splits = []
  tnr_splits = []
  rocauc_splits = []
  mae_splits = []
  prauc_splits = []
  precision_splits = []
  for i in range(len(r['test_labels_array'])):
    tpr_splits.append(recall_score(r['test_labels_array'][i]>2.9, r['test_results_array'][i]>2.9))
    cm = confusion_matrix(r['test_labels_array'][i]>2.9, r['test_results_array'][i]>2.9)
    tn = cm[0, 0]
    fp = cm[0, 1]
    fn = cm[1, 0]
    tp = cm[1, 1]
    tnr = tn / (tn + fp)
    roc_auc = roc_auc_score(r['test_labels_array'][i]>2.9, r['test_results_array'][i])

    tpr_splits.append(recall_score(r['test_labels_array'][i]>2.9, r['test_results_array'][i]>2.9))
    tnr_splits.append(tnr)
    rocauc_splits.append(roc_auc)
    r2_splits.append(r2_score(r['test_labels_array'][i], r['test_results_array'][i]))
    mae_splits.append(np.mean(abs(r['test_labels_array'][i] - r['test_results_array'][i])))

    precision, recall, thresholds = precision_recall_curve(r['test_labels_array'][i]>2.9, r['test_results_array'][i])
    prauc_splits.append(auc(recall, precision))

    precision_splits.append(precision_score(r['test_labels_array'][i]>2.9, r['test_results_array'][i]>2.9))


  tpr = recall_score(r['test_labels']>2.9, r['test_results']>2.9)
  cm = confusion_matrix(r['test_labels']>2.9, r['test_results']>2.9)
  tn = cm[0, 0]
  fp = cm[0, 1]
  fn = cm[1, 0]
  tp = cm[1, 1]
  tnr = tn / (tn + fp)
  roc_auc = roc_auc_score(r['test_labels']>2.9, r['test_results'])
  precision, recall, thresholds = precision_recall_curve(r['test_labels']>2.9, r['test_results'])
  area = auc(recall, precision)

  rows.append(pd.DataFrame({'Name': r['experiment_name']['description'],
                        'LSM Version': r['lsm_version'],
                'R2': r2_score(r['test_labels'], r['test_results']),
                'TPR': recall_score(r['test_labels']>2.9, r['test_results']>2.9),
                'TNR': tn / (tn + fp),
              'ROC AUC': roc_auc,
                'MAE': np.mean(abs(r['test_labels'] - r['test_results'])),
                'Precision': precision_score(r['test_labels']>2.9, r['test_results']>2.9),
                'PR AUC': area,
                            'R2 Split': [np.round(r2_splits, decimals=3)],
                            'TPR Split': [np.round(tpr_splits, decimals=3)],
                            'TNR Split': [np.round(tnr_splits, decimals=3)],
                            'ROC AUC Split': [np.round(rocauc_splits, decimals=3)],
                            'MAE Split': [np.round(mae_splits, decimals=3)],
                            'PR AUC Split': [np.round(prauc_splits, decimals=3)],
                            'Precision Split': [np.round(precision_splits, decimals=3)],
                            },
                            index=[0]))

pd.concat(rows)

Unnamed: 0,Name,LSM Version,R2,TPR,TNR,ROC AUC,MAE,Precision,PR AUC,R2 Split,TPR Split,TNR Split,ROC AUC Split,MAE Split,PR AUC Split,Precision Split
0,Demographics,_lsmv2,0.179643,0.636752,0.74876,0.763905,1.337342,0.495017,0.516212,"[0.214, 0.185, 0.171, 0.176, 0.132]","[0.706, 0.706, 0.673, 0.673, 0.643, 0.643, 0.5...","[0.773, 0.672, 0.774, 0.81, 0.716]","[0.795, 0.752, 0.807, 0.722, 0.765]","[1.358, 1.358, 1.348, 1.272, 1.351]","[0.623, 0.512, 0.534, 0.452, 0.518]","[0.571, 0.44, 0.474, 0.532, 0.475]"
0,Fitbit,_lsmv2,0.178305,0.619658,0.768595,0.766592,1.363152,0.508772,0.538649,"[0.176, 0.166, 0.176, 0.236, 0.116]","[0.667, 0.667, 0.633, 0.633, 0.595, 0.595, 0.6...","[0.723, 0.75, 0.782, 0.802, 0.789]","[0.792, 0.757, 0.736, 0.796, 0.747]","[1.401, 1.382, 1.424, 1.246, 1.357]","[0.621, 0.538, 0.448, 0.649, 0.502]","[0.507, 0.492, 0.463, 0.596, 0.477]"
0,Fitbit + Demographics,_lsmv2,0.223265,0.675214,0.765289,0.786805,1.317047,0.526667,0.557914,"[0.241, 0.196, 0.217, 0.27, 0.173]","[0.725, 0.725, 0.653, 0.653, 0.643, 0.643, 0.6...","[0.798, 0.734, 0.782, 0.784, 0.725]","[0.824, 0.761, 0.781, 0.784, 0.779]","[1.305, 1.374, 1.355, 1.219, 1.328]","[0.671, 0.552, 0.504, 0.583, 0.519]","[0.607, 0.485, 0.482, 0.569, 0.492]"
0,Fitbit + Demographics + Lipid Panel,_lsmv2,0.287868,0.700855,0.766942,0.821184,1.240625,0.537705,0.641038,"[0.308, 0.277, 0.279, 0.318, 0.239]","[0.745, 0.745, 0.694, 0.694, 0.667, 0.667, 0.6...","[0.756, 0.75, 0.752, 0.828, 0.752]","[0.828, 0.82, 0.832, 0.821, 0.816]","[1.265, 1.268, 1.272, 1.141, 1.255]","[0.695, 0.651, 0.622, 0.649, 0.619]","[0.567, 0.515, 0.459, 0.63, 0.526]"
0,Fitbit + Demographics + Fasting Glucose,_lsmv2,0.460783,0.739316,0.82314,0.868129,1.054755,0.617857,0.731851,"[0.465, 0.472, 0.458, 0.531, 0.342]","[0.765, 0.765, 0.755, 0.755, 0.738, 0.738, 0.6...","[0.832, 0.836, 0.812, 0.862, 0.771]","[0.886, 0.885, 0.858, 0.868, 0.848]","[1.064, 1.071, 1.039, 0.958, 1.15]","[0.808, 0.773, 0.697, 0.778, 0.667]","[0.661, 0.638, 0.554, 0.68, 0.561]"
0,Fitbit + Demographics + Lipid Panel + Metaboli...,_lsmv2,0.488118,0.752137,0.833058,0.885138,1.023056,0.635379,0.761984,"[0.488, 0.514, 0.48, 0.544, 0.384]","[0.765, 0.765, 0.816, 0.816, 0.762, 0.762, 0.6...","[0.849, 0.859, 0.827, 0.871, 0.752]","[0.879, 0.914, 0.899, 0.893, 0.86]","[1.054, 0.999, 1.007, 0.933, 1.135]","[0.808, 0.818, 0.763, 0.805, 0.714]","[0.684, 0.69, 0.582, 0.688, 0.542]"
0,Fitbit + Demographics + Lipid Panel + Fasting ...,_lsmv2,0.487905,0.760684,0.819835,0.881991,1.02498,0.620209,0.756806,"[0.486, 0.512, 0.476, 0.558, 0.373]","[0.784, 0.784, 0.796, 0.796, 0.738, 0.738, 0.7...","[0.832, 0.844, 0.805, 0.862, 0.752]","[0.877, 0.911, 0.888, 0.893, 0.854]","[1.054, 1.004, 1.02, 0.917, 1.141]","[0.802, 0.811, 0.748, 0.805, 0.697]","[0.667, 0.661, 0.544, 0.686, 0.55]"


# SAF

In [None]:


saf_last_visit_df['participant_id'] = saf_last_visit_df.index
saf_last_visit_df['non hdl'] = (saf_last_visit_df['ldl'] + saf_last_visit_df['vldl']).astype(float)
saf_last_visit_df.dropna(inplace=True)
print(f"Number of non-na participants from last visit: {len(saf_last_visit_df)}")

pin_mapping.reset_index(inplace=True)
pin_mapping.set_index('pin', inplace=True)
pin_mapping.rename(columns={'participant_id': 'raw_id'}, inplace=True)

saf_last_visit_df = saf_last_visit_df.merge(pin_mapping, left_on='pin', right_index=True)

In [None]:


embeddings_list_lsmv2c_mean = []
id_list_lsmv2c_mean = []
labels_lsmv2c_mean = []
for key, arrays in grouped_arrays.items():
  try:
    age = saf_last_visit_df[saf_last_visit_df['raw_id']==key].age.values[0]
    gender = saf_last_visit_df[saf_last_visit_df['raw_id']==key].sex.values[0]
    bmi = saf_last_visit_df[saf_last_visit_df['raw_id']==key].bmi.values[0]

    glucose = saf_last_visit_df[saf_last_visit_df['raw_id']==key].glucose.values[0]
    total_cholesterol = saf_last_visit_df[saf_last_visit_df['raw_id']==key]['total cholesterol'].values[0]
    hdl = saf_last_visit_df[saf_last_visit_df['raw_id']==key].hdl.values[0]
    triglycerides = saf_last_visit_df[saf_last_visit_df['raw_id']==key].triglycerides.values[0]
    ldl = saf_last_visit_df[saf_last_visit_df['raw_id']==key].ldl.values[0]
    chol_hdl = saf_last_visit_df[saf_last_visit_df['raw_id']==key]['chol/hdl'].values[0]
    non_hdl = saf_last_visit_df[saf_last_visit_df['raw_id']==key]['non hdl'].values[0]
    #bun = saf_last_visit_df[saf_last_visit_df['raw_id']==key].bun.values[0]
    #creatinine = saf_last_visit_df[saf_last_visit_df['raw_id']==key].creatinine.values[0]
    #egfr = saf_last_visit_df[saf_last_visit_df['raw_id']==key].egfr.values[0]
    #sodium = saf_last_visit_df[saf_last_visit_df['raw_id']==key].sodium.values[0]
    #potassium = saf_last_visit_df[saf_last_visit_df['raw_id']==key].potassium.values[0]
    #chloride = saf_last_visit_df[saf_last_visit_df['raw_id']==key].chloride.values[0]
    #co2 = saf_last_visit_df[saf_last_visit_df['raw_id']==key].co2.values[0]
    #calcium = saf_last_visit_df[saf_last_visit_df['raw_id']==key].calcium.values[0]
    total_protein = saf_last_visit_df[saf_last_visit_df['raw_id']==key]['total protein'].values[0]
    albumin = saf_last_visit_df[saf_last_visit_df['raw_id']==key].albumin.values[0]
    globulin = saf_last_visit_df[saf_last_visit_df['raw_id']==key].globulin.values[0]
    albumin_globulin = saf_last_visit_df[saf_last_visit_df['raw_id']==key]['albumin/globulin'].values[0]
    total_bilirubin = saf_last_visit_df[saf_last_visit_df['raw_id']==key]['total bilirubin'].values[0]
    alp = saf_last_visit_df[saf_last_visit_df['raw_id']==key].alp.values[0]
    #ast = saf_last_visit_df[saf_last_visit_df['raw_id']==key].ast.values[0]
    alt = saf_last_visit_df[saf_last_visit_df['raw_id']==key].alt.values[0]

    additional_features = [age,
                            gender,
                            bmi,
                            glucose,
                            total_cholesterol,
                            hdl,
                            triglycerides,
                            ldl,
                            chol_hdl,
                            non_hdl,
                            0,#bun,
                            0,#creatinine,
                            0,#egfr,
                            0,#sodium,
                            0,#potassium,
                            0,#chloride,
                            0,#co2,
                            0,#calcium,
                            total_protein,
                            albumin,
                            globulin,
                            albumin_globulin,
                            total_bilirubin,
                            alp,
                            0,#ast,
                            alt
                          ]

    embeddings_list_lsmv2c_mean.append(np.append(np.nanmedian(np.array(arrays), axis=0),additional_features))
    id_list_lsmv2c_mean.append(key)
  except:
    print('missing', key)

test_full_embedding_arr = np.array(embeddings_list_lsmv2c_mean)
test_label_df = pd.DataFrame()
test_label_df['participant_id'] = np.array(id_list_lsmv2c_mean)

missing 


In [None]:
fold_ids = [ 56007,  56008,  56011,  56014,  56015,  56023,  56025,  56026,
        56029,  56032,  56034,  56040,  56044,  56046,  56049,  56050,
        56053,  56058,  56059,  56062,  56063,  56069,  56070,  56074,
        56076,  56077,  56079,  56081,  56084,  56085,  56087,  56089,
        56093,  56094,  56096,  56097,  56099,  56100,  56105,  56106,
        56107,  56108,  56111,  56118,  56121,  56126,  56127,  56131,
        56133,  56134,  56135,  56136,  56138,  56141,  56143,  56145,
        56153,  56154,  56158,  56159,  56165,  56174,  56175,  56176,
        56181,  56187,  56188,  56203,  56208,  56209,  56212,  56213,
        56222,  56223,  56226,  56231,  56235,  56237,  56238,  56241,
        56263,  56265,  56266,  56268,  56297,  56299,  56305,  56317,
        56357,  56358,  56360,  56364,  56393,  56394,  56395,  56399,
        56418,  56423,  56427,  56431,  56441,  56442,  56456,  56463,
        56473,  56517,  56519,  56521,  56532,  56534,  56535,  56554,
        56570,  56571,  56606,  56609,  56611,  56624,  56625,  56638,
        56639,  56640,  56655,  56665,  56666,  56670,  56671,  56672,
        56686,  56691,  56696,  56699,  56738,  56754,  56770,  56771,
        56789,  56821,  56823,  56824,  56849,  56850,  56857,  56858,
        56861,  56863,  56883,  56888,  56893,  56895,  56900,  56918,
        56919,  56921,  56922,  56923,  56960,  56977,  56979,  56980,
        57002,  57003,  57004,  57010,  57022,  57026,  57028,  57029,
        57030,  57035,  57038,  57049,  57084,  57086,  57095,  57161,
        57168,  57170,  57171,  57182,  57187,  57190,  57191,  57197,
        57199,  57201,  57202,  57208,  57209,  57214,  57219,  57222,
        57230,  57239,  57245,  57249,  57250,  57251,  57254,  57256,
        57273,  57275,  57276,  57284,  57285,  57286,  57289,  57296,
        57302,  57308,  57310,  57311,  57314,  57331,  57335,  57383,
        57386,  57387,  57389,  57390,  57403,  57406,  57424,  57425,
        57427,  57433,  57452,  57453,  57470,  57471,  57472,  57473,
        57477,  57494,  57496,  57503,  57504,  57505,  57506,  57515,
        57527,  57529,  57531,  57534,  57535,  57538,  57548,  57549,
        57557,  57558,  57568,  57569,  57572,  57573,  57576,  57577,
        57588,  57589,  57591,  57593,  57599,  57600,  57601,  57608,
        57609,  57613,  57616,  57628,  57632,  57640,  57641,  57644,
        57645,  57648,  57650,  57653,  57656,  57657,  57662,  57669,
        57672,  57674,  57676,  57678,  57686,  57687,  57688,  57689,
        57691,  57695,  57773,  57775,  57778,  57792,  57794,  57813,
        57815,  57817,  57819,  57821,  57830,  57832,  57835,  57842,
        57843,  57852,  57853,  57855,  57862,  57864,  57866,  57896,
        57905,  57913,  57920,  57937,  57938,  57943,  57951,  57953,
        57972,  57978,  57992,  57997,  57998,  58000,  58001,  58002,
        58008,  58012,  58042,  58044,  58057,  58087,  58088,  58090,
        58093,  58094,  58099,  58118,  58146,  58147,  58154,  58155,
        58168,  58179,  58182,  58199,  58210,  58285,  58290,  58297,
        58318,  58327,  58328,  58329,  58334,  58337,  58341,  58342,
        58344,  58354,  58363,  58383,  58387,  58389,  58391,  58393,
        58395,  58399,  58401,  58422,  58423,  58433,  58434,  58435,
        58439,  58444,  58445,  58452,  58453,  58456,  58459,  58483,
        58495,  58498,  58508,  58510,  58524,  58531,  58556,  58559,
        58561,  58579,  58582,  58584,  58595,  58598,  58601,  58603,
        58606,  58645,  58649,  58657,  58659,  58662,  58663,  70206,
        70250,  70251,  70253,  70254,  70255,  70258,  70260,  70261,
        70263,  70264,  70270,  70276,  70279,  70280,  70288,  70292,
        70297,  70299,  70300,  70301,  70306,  70307,  70310,  70315,
        70316,  70317,  70319,  70320,  70322,  70323,  70325,  70327,
        70328,  70330,  70333,  70337,  70341,  70344,  70348,  70349,
        70352,  70355,  70357,  70358,  70361,  70367,  70373,  70374,
        70379,  70387,  70388,  70390,  70391,  70395,  70397,  70400,
        70406,  70407,  70408,  70410,  70412,  70413,  70414,  70420,
        70421,  70424,  70429,  70431,  70433,  70436,  70439,  70440,
        70443,  70449,  70451,  70452,  70455,  70458,  70461,  70462,
        70467,  70472,  70473,  70475,  70480,  70484,  70485,  70496,
        70499,  70503,  70506,  70511,  70513,  70516,  70519,  70521,
        70522,  70523,  70526,  70527,  70528,  70532,  70534,  70542,
        70545,  70546,  70547,  70548,  70549,  70552,  70562,  70565,
        70568,  70569,  70578,  70579,  70580,  70583,  70584,  70594,
        70619,  70620,  70621,  70642,  70645,  70646,  70647,  70652,
        70653,  70654,  70703,  70704,  70719,  70725,  70726,  70739,
        70740,  70741,  70743,  70744,  70765,  70786,  70813,  70829,
        70877,  70878,  70879,  70881,  70910,  70913,  70917,  70919,
        70920,  70921,  70931,  70932,  70938,  70940,  70943,  70944,
        70948,  70949,  70958,  71004,  71006,  71019,  71021,  71023,
        71036,  71054,  71083,  71084,  71085,  71086,  71088,  71089,
        71101,  71131,  71132,  71169,  71185,  71202,  71222,  71224,
        71225,  71238,  71239,  71241,  71255,  71256,  71257,  71286,
        71302,  71303,  71305,  71320,  71321,  71322,  71323,  71324,
        71335,  71341,  71390,  71429,  71430,  71467,  71468,  71510,
        71515,  71518,  71523,  71525,  71545,  71552,  71553,  71554,
        71569,  71572,  71579,  71603,  71606,  71649,  71651,  71655,
        71656,  71665,  71666,  71667,  71668,  71669,  71674,  71676,
        71678,  71680,  71683,  71684,  71687,  71688,  71693,  71698,
        71702,  71708,  71709,  71720,  71722,  71729,  71735,  71738,
        71739,  71741,  71743,  71744,  71745,  71757,  71758,  71760,
        71764,  71771,  71772,  71773,  71829,  71834,  71835,  71836,
        71837,  71838,  71839,  71841,  71842,  71845,  71847,  71851,
        71852,  71857,  71862,  71864,  71866,  71888,  71893,  71894,
        71919,  71926,  71928,  71937,  71938,  71960,  72010,  72013,
        72025,  72032,  72034,  72035,  72036,  72046,  72048,  72049,
        72072,  72082,  72085,  72097,  72100,  72103,  72105,  72107,
        72108,  72113,  72122,  72123,  72127,  72131,  72132,  72143,
        72146,  72148,  72150,  72151,  72152,  72153,  72163,  72164,
        72170,  72183,  72195,  72197,  72198,  72199,  72211,  72213,
        72217,  72219,  72224,  72225,  72284,  72292,  72296,  72302,
        72305,  72306,  72309,  72316,  72317,  72320,  72326,  72327,
        72331,  72334,  72335,  72343,  72344,  72346,  72350,  72354,
        72355,  72364,  72367,  72370,  72373,  72374,  72375,  72378,
        72380,  72384,  72385,  72393,  72398,  72399,  72409,  72413,
        72415,  72417,  72423,  72426,  72428,  72444,  72450,  72455,
        72466,  72474,  72476,  72477,  72480,  72483,  72484,  72491,
        72510,  72512,  72518,  72522,  72527,  72531,  72534,  72535,
        72539,  72582,  72585,  72588,  72590,  72591,  72605,  72623,
        72625,  72628,  72631,  72632,  72636,  72647,  72665,  72683,
        72690,  72692,  72694,  72695,  72711,  72718,  72721,  72723,
        72728,  72751,  72753,  72755,  72770,  72772,  72784,  72787,
        72788,  72791,  72793,  72816,  72824,  72825,  72832,  72841,
        72875,  72910,  72912,  72913,  72922,  72970,  72978,  72990,
        72997,  73004,  73009,  73010,  73011,  73024,  73025,  73030,
        73031,  73088,  73089,  73102,  73103,  73105,  73107,  73109,
        73112,  73121,  73128,  73129,  73145,  73168,  73174,  73177,
        73178,  73179,  73182,  73184,  73186,  73187,  73200,  73208,
        73210,  73218,  73221,  73227,  73233,  73237,  73242,  73244,
        73249,  73250,  73256,  73307,  73308,  73311,  73325,  73342,
        73358, 186151, 186152, 186154, 186156, 186157, 186159, 186161,
       186165, 186166, 186172, 186174, 186343, 186344, 186353, 186354,
       186357, 186358, 186362, 186364, 186371, 186420, 186427, 186444,
       186446, 186501, 186502, 186503, 186506, 186508, 192591, 192592,
       192596, 192609, 192612, 199740, 199754, 199756, 199757, 199758,
       199759, 199809]


In [None]:
train_full_embedding_arr = np.array(embeddings_list_lsmv2_mean)
train_label_df = pd.DataFrame()
train_label_df['participant_id'] = np.array(id_list_lsmv2_mean)
train_label_df['participant_id'] = train_label_df['participant_id'].astype(int)

target_col = 'homa_ir'

EXP_4 = {'name': "EXP_4",
'description': "Demographics",
'features': [0,2],
'label': "homa_ir"}

EXP_28 = {'name': "EXP_0",
'description': "Fitbit",
'features': [],
'label': "homa_ir"}

EXP_1 = {'name': "EXP_1",
'description': "Fitbit + Demographics",
'features': [0,2],
'label': "homa_ir"}

EXP_10 = {'name': "EXP_10",
'description': "Fitbit + Demographics + Fasting Glucose",
'features': [0,2,3],
'label': "homa_ir"}

EXP_11 = {'name': "EXP_11",
'description': "Fitbit + Demographics + Lipid Panel + Metabolic Panel",
'features': [0,2,3,4,5,6,7,8,9,18,19,20,21,22,23,25],
'label': "homa_ir"}

EXP_21 = {'name': "EXP_21",
'description': "Fitbit + Demographics + Lipid Panel",
'features': [0,2,4,5,6,7,8,9],
'label': "homa_ir"}

EXP_22 = {'name': "EXP_22",
'description': "Fitbit + Demographics + Fasting Glucose + Lipid Panel",
'features': [0,2,3,4,5,6,7,8,9],
'label': "homa_ir"}


results = []
experiment_to_latent_space = {}
experiment_to_prediction_df = {}

results = []
for experiment_name in [EXP_1, EXP_10, EXP_22]:

  learner = "tree"
  n_days = 120
  prediction_df = pd.DataFrame()
  random_seeds=[0, 92, 1, 2024, 12121]


  if experiment_name['description'] == 'Demographics':
    feature_indexes = np.array(experiment_name['features'])+384
  else:
    feature_indexes = np.append(np.array(range(0,384,1)),np.array(experiment_name['features'])+384)
  print('Experiment Name: ', experiment_name)

  if learner == "l1-l2":
        kfold_found_params = {'booster': 'gblinear',
                              'learning_rate': 0.19,
                              'n_estimators': 50,
                              'eta':0.1,
                              'reg_alpha': 0,
                              'reg_lambda': 0}

  elif learner == "tree":
        kfold_found_params = {'booster': 'gbtree',
                              'learning_rate': 0.1,
                              'max_depth': 1,
                              'n_estimators': 85,
                              'reg_alpha': 2,
                              'reg_lambda': 2,
                              'objective': 'reg:squarederror'}
  else:
    raise ValueError(
        f"Unknown learner: {learner}. Learners must be 'l1-l2' or 'tree'."
    )

  # Get the training data
  embedding_arr = train_full_embedding_arr[:,feature_indexes.astype(int)]
  training_df = pd.DataFrame(embedding_arr)
  training_df['participant_id'] = train_label_df['participant_id'].values
  training_df.sort_values(by=['participant_id'], inplace=True)
  training_df = pd.merge(training_df, training_data_aggregated_all_days[['participant_id','homa_ir']], on='participant_id', how='left')
  training_df.dropna(subset='homa_ir', inplace=True)
  training_df = training_df[training_df['participant_id'].isin(fold_ids)]

  # Get the test data
  embedding_arr = test_full_embedding_arr[:,feature_indexes.astype(int)]
  test_df = pd.DataFrame(embedding_arr)
  test_df['participant_id'] = test_label_df['participant_id'].values
  test_df.sort_values(by=['participant_id'], inplace=True)
  saf_last_visit_df['participant_id'] = saf_last_visit_df['raw_id']
  test_df = pd.merge(test_df, saf_last_visit_df[['participant_id','homa_ir']], on='participant_id', how='left')
  test_df.dropna(subset='homa_ir', inplace=True)

  participant_ids = training_df['participant_id'].values
  accuracy_dfs = []
  ordered_participant_ids = []
  all_predictions = []
  all_true_labels = []
  clf_labels_preds = []
  clf_labels_true_labels = []

  # Perform cross-validation
  all_x_test_encoded = []

  # save participant IDs
  participant_ids_train = training_df['participant_id']
  participant_ids_test = test_df['participant_id']

  X_train = training_df.drop(columns=["participant_id",
                                                        target_col], axis=1)
  y_train = training_df[target_col]
  X_test = test_df.drop(columns=["participant_id",
                                                      target_col], axis=1)
  y_test = test_df[target_col]


  print('Test subjects: ', len(y_test))
  #X_test = X_test.values
  X_train, X_test = preprocess_data(X_train, X_test.values)


  XtrainLSM = np.nan_to_num(X_train[:,:384], nan=0)
  XtestLSM = np.nan_to_num(X_test[:,:384], nan=0)
  if experiment_name['description'] != 'Demographics':
    n_components = 50
    pca = PCA(n_components=n_components)

    # Fit PCA on the scaled training data
    X_train_pca = pca.fit_transform(XtrainLSM)

    # 3. Transform the scaled test data using the fitted PCA model
    X_test_pca = pca.transform(XtestLSM)

    # The transformed data will have 5 columns (the principal components)
    XtrainLSM = pd.DataFrame(data=X_train_pca, columns=[f'PC{i+1}' for i in range(n_components)])
    XtestLSM = pd.DataFrame(data=X_test_pca, columns=[f'PC{i+1}' for i in range(n_components)])

    X_train = np.concatenate((XtrainLSM, np.nan_to_num(X_train[:,384:], nan=0)), axis=1)
    X_test = np.concatenate((XtestLSM, np.nan_to_num(X_test[:,384:], nan=0)), axis=1)

  # Initialize and train the XGBoost model
  xgb_regressor = xgboost.XGBRegressor(
      **kfold_found_params, random_state=0
  )
  xgb_regressor.fit(X_train, y_train.values)
  predicted_labels = xgb_regressor.predict(X_test)

  # Store predictions and true labels for later evaluation
  all_true_labels.append(y_test.values)
  all_predictions.append(predicted_labels)
  ordered_participant_ids.append(participant_ids_test)

  experiment_to_latent_space[experiment_name['name']] = all_x_test_encoded
  true_values = np.concatenate(all_true_labels)
  predicted_values = np.concatenate(all_predictions)
  participant_ids_predicted = np.concatenate(ordered_participant_ids)

  prediction_df['participant_id'] = participant_ids_predicted
  prediction_df['predicted_values'] = predicted_values
  prediction_df['true_values'] = true_values
  prediction_df['Experiment_ID'] = experiment_name['name']

  experiment_to_prediction_df[experiment_name['name']] = prediction_df

  results_dict = {}
  results_dict['experiment_name'] = experiment_name
  results_dict['test_labels'] = true_values
  results_dict['test_results'] = predicted_values

  results_dict['test_labels_array'] = all_true_labels
  results_dict['test_results_array'] = all_predictions

  results.append(results_dict)

Experiment Name:  {'name': 'EXP_1', 'description': 'Fitbit + Demographics', 'features': [0, 2], 'label': 'homa_ir'}
Test subjects:  71
Experiment Name:  {'name': 'EXP_10', 'description': 'Fitbit + Demographics + Fasting Glucose', 'features': [0, 2, 3], 'label': 'homa_ir'}
Test subjects:  71
Experiment Name:  {'name': 'EXP_22', 'description': 'Fitbit + Demographics + Fasting Glucose + Lipid Panel', 'features': [0, 2, 3, 4, 5, 6, 7, 8, 9], 'label': 'homa_ir'}
Test subjects:  71


In [None]:
threshold = 2.9
rows = []
for r in results:
  r2_splits = []
  tpr_splits = []
  tnr_splits = []
  rocauc_splits = []
  for i in range(len(r['test_labels_array'])):
    tpr_splits.append(recall_score(r['test_labels_array'][i]>threshold, r['test_results_array'][i]>threshold))
    cm = confusion_matrix(r['test_labels_array'][i]>threshold, r['test_results_array'][i]>threshold)
    tn = cm[0, 0]
    fp = cm[0, 1]
    fn = cm[1, 0]
    tp = cm[1, 1]
    tnr = tn / (tn + fp)
    roc_auc = roc_auc_score(r['test_labels_array'][i]>threshold, r['test_results_array'][i])

    tpr_splits.append(recall_score(r['test_labels_array'][i]>threshold, r['test_results_array'][i]>threshold))
    tnr_splits.append(tnr)
    rocauc_splits.append(roc_auc)
    r2_splits.append(r2_score(r['test_labels_array'][i], r['test_results_array'][i]))

  tpr = recall_score(r['test_labels']>threshold, r['test_results']>threshold)
  cm = confusion_matrix(r['test_labels']>threshold, r['test_results']>threshold)
  tn = cm[0, 0]
  fp = cm[0, 1]
  fn = cm[1, 0]
  tp = cm[1, 1]
  tnr = tn / (tn + fp)
  roc_auc = roc_auc_score(r['test_labels']>threshold, r['test_results'])

  precision, recall, thresholds = precision_recall_curve(r['test_labels']>threshold, r['test_results'])
  pr_auc = auc(recall, precision)

  for split in range(len(r['test_labels_array'])):
    rows.append(pd.DataFrame({'Name': r['experiment_name']['description'],
                              #'Reps': r['reps'],
                  'R2': r2_score(r['test_labels'], r['test_results']),
                  'TNR': tn / (tn + fp),
                              'TPR': recall_score(r['test_labels']>2.9, r['test_results']>threshold),
                                                            'Precision': precision_score(r['test_labels']>threshold, r['test_results']>threshold),
                'ROC AUC': roc_auc,
                              'PR AUC': pr_auc,
                              'R2 Split': r2_splits[split],
                              'TPR Split': tpr_splits[split],
                              'TNR Split': tnr_splits[split],
                              'ROC AUC Split': rocauc_splits[split],
                              },
                              index=[0]))

results_df = pd.concat(rows)
results_df

In [None]:
threshold = 2.9
rows = []
for r in results:
  r2_splits = []
  tpr_splits = []
  tnr_splits = []
  rocauc_splits = []
  mae_splits = []
  for i in range(len(r['test_labels_array'])):
    tpr_splits.append(recall_score(r['test_labels_array'][i]>threshold, r['test_results_array'][i]>threshold))
    cm = confusion_matrix(r['test_labels_array'][i]>threshold, r['test_results_array'][i]>threshold)
    tn = cm[0, 0]
    fp = cm[0, 1]
    fn = cm[1, 0]
    tp = cm[1, 1]
    tnr = tn / (tn + fp)
    roc_auc = roc_auc_score(r['test_labels_array'][i]>threshold, r['test_results_array'][i])
    mae = mean_absolute_error(r['test_labels_array'][i], r['test_results_array'][i])

    tpr_splits.append(recall_score(r['test_labels_array'][i]>threshold, r['test_results_array'][i]>threshold))
    tnr_splits.append(tnr)
    rocauc_splits.append(roc_auc)
    r2_splits.append(r2_score(r['test_labels_array'][i], r['test_results_array'][i]))
    mae_splits.append(mean_absolute_error(r['test_labels_array'][i], r['test_results_array'][i]))


  tpr = recall_score(r['test_labels']>threshold, r['test_results']>threshold)
  cm = confusion_matrix(r['test_labels']>threshold, r['test_results']>threshold)
  tn = cm[0, 0]
  fp = cm[0, 1]
  fn = cm[1, 0]
  tp = cm[1, 1]
  tnr = tn / (tn + fp)
  roc_auc = roc_auc_score(r['test_labels']>threshold, r['test_results'])
  mae = mean_absolute_error(r['test_labels'], r['test_results'])

  precision, recall, thresholds = precision_recall_curve(r['test_labels']>threshold, r['test_results'])
  pr_auc = auc(recall, precision)

  for split in range(len(r['test_labels_array'])):
    rows.append(pd.DataFrame({'Name': r['experiment_name']['description'],
                              #'Reps': r['reps'],
                  'R2': r2_score(r['test_labels'], r['test_results']),
                  'TNR': tn / (tn + fp),
                              'TPR': recall_score(r['test_labels']>2.9, r['test_results']>threshold),
                                                            'Precision': precision_score(r['test_labels']>threshold, r['test_results']>threshold),
                'ROC AUC': roc_auc,
                              'PR AUC': pr_auc,
                              'MAE': mae,
                              'R2 Split': r2_splits[split],
                              'TPR Split': tpr_splits[split],
                              'TNR Split': tnr_splits[split],
                              'ROC AUC Split': rocauc_splits[split],
                              'MAE Split': mae_splits[split],
                              },
                              index=[0])),



results_df = pd.concat(rows)
results_df

Unnamed: 0,Name,R2,TNR,TPR,Precision,ROC AUC,PR AUC,MAE,R2 Split,TPR Split,TNR Split,ROC AUC Split,MAE Split
0,Fitbit + Demographics,0.090648,0.692308,0.736842,0.466667,0.748988,0.525986,1.108903,0.090648,0.736842,0.692308,0.748988,1.108903
0,Fitbit + Demographics + Fasting Glucose,0.496568,0.884615,0.736842,0.7,0.86336,0.730393,0.829395,0.496568,0.736842,0.884615,0.86336,0.829395
0,Fitbit + Demographics + Fasting Glucose + Lipi...,0.49438,0.846154,0.736842,0.636364,0.878543,0.723782,0.839759,0.49438,0.736842,0.846154,0.878543,0.839759
