In [12]:
import os
import torch
# Run-wide switches.
testing = False  # when True, a later cell keeps only the first 12 tokenized notes per split
GPU_NO = 3       # index of the GPU this notebook should run on

# Expose only the selected GPU to CUDA. The value is derived from GPU_NO so the
# constant and the environment variable cannot drift apart. CUDA reads this
# variable when it initializes, so it is set before the first CUDA call below.
os.environ["CUDA_VISIBLE_DEVICES"] = str(GPU_NO)  # use the correct gpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"device: {device}")

device: cuda


In [None]:
import pandas as pd
import torch.nn as nn
import sys

# Clinical-T5 release directory; the fine-tuned classifier checkpoint lives below it.
root = '/home/ugrads/a/aa_ron_su/physionet.org/files/clinical-t5/1.0.0/'
# os.path.join avoids the doubled slash that `root + '/model/...'` produced
# (root already ends in '/').
finetuned_model_path = os.path.join(root, 'model', 'meta_ft_classify.pt') # modify this line!

# Every input CSV and the output directory sit under one data directory, so it is
# named once instead of being repeated in each literal.
data_dir = '/home/ugrads/a/aa_ron_su/data'
temivef_train_NOTE_TARGET1_FT_path = os.path.join(data_dir, 'till_end_mimic_iv_extra_features_train_NOTE_TARGET1_FT.csv')
temivef_test_NOTE_TARGET1_FT_path = os.path.join(data_dir, 'till_end_mimic_iv_extra_features_test_NOTE_TARGET1_FT.csv')
temivef_train_NOTE_path = os.path.join(data_dir, 'till_end_mimic_iv_extra_features_train_NOTE.csv')
temivef_test_NOTE_path = os.path.join(data_dir, 'till_end_mimic_iv_extra_features_test_NOTE.csv')
# The trailing '' component keeps the trailing slash the original literal had.
outdir = os.path.join(data_dir, 'final2', '') # modify this line!
train_outpath = os.path.join(outdir, 'till_end_mimic_iv_extra_features_train.csv')
test_outpath = os.path.join(outdir, 'till_end_mimic_iv_extra_features_test.csv')
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Identifier handed to AutoTokenizer.from_pretrained.
model_name = "Clinical-T5-Base"
# Reuse model_name rather than repeating the literal, so the two cannot diverge.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create the output directory (and any missing parents). exist_ok=True replaces
# the separate os.path.exists check, which left a window in which another
# process could create the directory between the check and the call.
os.makedirs(outdir, exist_ok=True)

# The tokenized notes are read from one tensor file per split under tensor_dir.
tensor_dir = "./tokenized_notes"
train_tensor_path, test_tensor_path = (
    os.path.join(tensor_dir, filename)
    for filename in ("train_tensor.pt", "test_tensor.pt")
)

tokenized_train_notes = torch.load(train_tensor_path)
tokenized_test_notes = torch.load(test_tensor_path)

if testing:
    # Testing mode: keep only the first 12 entries of each split.
    tokenized_train_notes, tokenized_test_notes = (
        tokenized_train_notes[:12],
        tokenized_test_notes[:12],
    )
    print(f"testing mode truncated tokenized_notes to length {len(tokenized_train_notes)}")


from torch.utils.data import DataLoader, TensorDataset
from time import time
import numpy as np
import torch

def generate_dataloader(tokenized_notes, batch_size, device):
    """Wrap tokenized notes in an order-preserving DataLoader.

    Args:
        tokenized_notes: tensor of tokenized notes; it is moved to ``device`` as a whole.
        batch_size: number of notes per batch.
        device: torch device the inputs and the placeholder labels are moved to.

    Returns:
        DataLoader yielding ``(inputs, labels)`` batches in the original row order.
        The final, possibly smaller, batch is kept.
    """
    # Placeholder labels: TensorDataset needs a second tensor, but no loss is
    # evaluated here, so every label is simply -1.
    placeholder_labels = torch.tensor([-1] * len(tokenized_notes))
    note_dataset = TensorDataset(tokenized_notes.to(device), placeholder_labels.to(device))
    # shuffle stays off so the rows keep their order and can later be lined up
    # with the rows of the source dataframe.
    return DataLoader(
        note_dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
    )

batch_size = 3
train_dataloader = generate_dataloader(tokenized_train_notes, batch_size, device)
test_dataloader = generate_dataloader(tokenized_test_notes, batch_size, device)
print("dataloaders generated")

# The checkpoint is a whole pickled object (its .encoder/.classifier attributes
# and .forward are used below), so torch.load unpickles arbitrary code: only
# load checkpoints from a trusted source.
# map_location=device restores the tensors onto the same device the dataloader
# inputs were moved to. Without it they are restored to the device they were
# saved from, which may not be visible under CUDA_VISIBLE_DEVICES.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects whole pickled objects; if this call starts failing after an
# upgrade, pass weights_only=False. Confirm against the installed version.
classifier = torch.load(finetuned_model_path, map_location=device)
classifier.encoder.eval() # makes sure dropout does not occur
classifier.classifier.eval()
print(f"loaded classifier from {finetuned_model_path}")

def extract_embeddings(dataloader, classifier):
    """Run every batch through the classifier and collect the embeddings.

    Each batch's inputs are passed to
    ``classifier.forward(inputs, return_embeddings=True)`` with gradient
    tracking disabled. The per-batch results are concatenated along dim 0,
    moved to the CPU and returned as a DataFrame whose column names carry the
    prefix ``emb`` (``emb0``, ``emb1``, ...). A progress line is printed after
    every batch.

    Args:
        dataloader: sized iterable yielding ``(inputs, labels)`` pairs; the
            labels are ignored.
        classifier: object exposing ``forward(inputs, return_embeddings=True)``.

    Returns:
        pandas.DataFrame with the embedding rows in iteration order.
    """
    per_batch = []
    with torch.no_grad():
        start_time = time()
        for step, (inputs, _unused_labels) in enumerate(dataloader):
            per_batch.append(classifier.forward(inputs, return_embeddings=True))
            print(f"Step {step}/{len(dataloader)} | Time {time() - start_time : .2f} seconds")

    # One tensor for all batches, brought back to host memory for pandas.
    stacked = torch.cat(per_batch, dim=0).cpu()
    embeddings_df = pd.DataFrame(stacked.detach().numpy()).add_prefix('emb')
    print(f"Extracted {len(embeddings_df)} note embeddings. Shape: {embeddings_df.shape}") # should be size 64
    return embeddings_df

# Extract embeddings for the train split first, then for the test split,
# reusing the same loaded classifier for both.
train_embeddings_df = extract_embeddings(dataloader=train_dataloader, classifier=classifier)
print("finished train embedding extraction")
test_embeddings_df = extract_embeddings(dataloader=test_dataloader, classifier=classifier)
print("finished test embedding extraction")


In [29]:
def _attach_note_keys(notes_df, embeddings_df, split):
    """Pair each embedding row with its (ICUSTAY_ID, NOTE_ID) key by row position.

    pd.concat(axis=1) aligns on the index and pads the shorter side with NaN,
    so a length mismatch would silently leave keys without embeddings (or the
    reverse). The lengths are therefore checked first.

    In testing mode the tokenized notes were truncated to their first rows, so
    fewer embeddings than notes is expected there; only the leading notes are
    kept in that case.

    Raises:
        ValueError: if the row counts differ outside of that testing-mode case.
    """
    n_notes, n_embeddings = len(notes_df), len(embeddings_df)
    if n_notes != n_embeddings:
        if testing and n_embeddings < n_notes:
            notes_df = notes_df.head(n_embeddings)
        else:
            raise ValueError(
                f"{split}: {n_notes} notes but {n_embeddings} embeddings; "
                "rows cannot be paired by position"
            )
    keys = notes_df[['ICUSTAY_ID', 'NOTE_ID']].reset_index(drop=True)
    return pd.concat([keys, embeddings_df.reset_index(drop=True)], axis = 1)


# Log the actual path right before each read. The previous loop printed the
# variable *name* and emitted both lines before any file was read.
print(f"reading from {temivef_train_NOTE_TARGET1_FT_path}")
train = pd.read_csv(temivef_train_NOTE_TARGET1_FT_path)
print(f"reading from {temivef_test_NOTE_TARGET1_FT_path}")
test = pd.read_csv(temivef_test_NOTE_TARGET1_FT_path)

print(f"concatenating train and train_embeddings_df with shape {train.shape} and {train_embeddings_df.shape} respectively")
print(f"concatenating test and test_embeddings_df with shape {test.shape} and {test_embeddings_df.shape} respectively")

#concat notes with ICUSTAY column for merging later
train_df_small = _attach_note_keys(train, train_embeddings_df, "train")
test_df_small = _attach_note_keys(test, test_embeddings_df, "test")

reading from temivef_train_NOTE_TARGET1_FT_path
reading from temivef_test_NOTE_TARGET1_FT_path
concatenating train and train_embeddings_df with shape (14573, 4) and (14573, 64) respectively
concatenating test and test_embeddings_df with shape (2582, 4) and (2582, 64) respectively


In [27]:
# Load the full NOTE tables that the embeddings are merged into.
# Each path is logged right before its read. The previous loop printed the
# variable *name* rather than the path, and printed both lines up front.
print(f"reading from {temivef_train_NOTE_path}")
train_df_big = pd.read_csv(temivef_train_NOTE_path)
print(f"reading from {temivef_test_NOTE_path}")
test_df_big = pd.read_csv(temivef_test_NOTE_path)

reading from temivef_train_NOTE_path
reading from temivef_test_NOTE_path


In [30]:
def merge_and_fill_embeddings(df_small, df_big):
    """Attach note embeddings to df_big and spread them within each note.

    df_small is left-joined onto df_big on (ICUSTAY_ID, NOTE_ID), so every row
    of df_big is kept. Then, within each NOTE_ID group, missing embedding
    values are forward-filled and afterwards back-filled. Groups without any
    embedding stay NaN. Progress/summary lines are printed along the way.

    Args:
        df_small: DataFrame with ICUSTAY_ID, NOTE_ID and the embedding columns
            (columns whose name starts with 'emb').
        df_big: DataFrame with at least ICUSTAY_ID and NOTE_ID.

    Returns:
        The merged DataFrame. Neither input is modified.
    """
    print(f"BEFORE merge: len = {len(df_big)}")
    out_df = df_big.merge(df_small, on = ['ICUSTAY_ID','NOTE_ID'], how = 'left')
    print(f"AFTER merge: len = {len(out_df)}")

    def fill_embedding_na(note_id_group):
        # .ffill()/.bfill() replace fillna(method=...), which pandas deprecated.
        return note_id_group.ffill().bfill()

    # Take the embedding columns from df_small instead of assuming exactly
    # emb0..emb63, so other embedding widths work too.
    emb_cols = [col for col in df_small.columns if str(col).startswith('emb')]

    print(f"BEFORE fill na: len = {len(out_df)}")
    if emb_cols:
        out_df[emb_cols] = out_df.groupby('NOTE_ID')[emb_cols].transform(fill_embedding_na) # transform preserves the shape of the original
    print(f"AFTER fill na: len = {len(out_df)}")

    # The first embedding column stands in for "this row has an embedding".
    n_nonnull = int(out_df[emb_cols[0]].notna().sum()) if emb_cols else 0
    print(f"copied {len(df_small)} embeddings into rows with the correct NOTE_ID")
    print(f"len(out_df): {len(out_df)}")
    print(f"No. nonnull embedding rows in out_df: {n_nonnull}")
    return out_df

# Merge the per-note embeddings into the full NOTE table of each split.
train_out_df = merge_and_fill_embeddings(df_small=train_df_small, df_big=train_df_big)
print("merged and filled embeddings for train_out_df")
test_out_df = merge_and_fill_embeddings(df_small=test_df_small, df_big=test_df_big)
print("merged and filled embeddings for test_out_df")


BEFORE merge: len = 3710672
AFTER merge: len = 3710672
BEFORE fill na: len = 3710672
AFTER fill na: len = 3710672
copied 14573 embeddings into rows with the correct NOTE_ID
len(out_df): 3710672
No. nonnull embedding rows in out_df: 1627917
merged and filled embeddings for train_out_df
BEFORE merge: len = 1018912
AFTER merge: len = 1018912
BEFORE fill na: len = 1018912
AFTER fill na: len = 1018912
copied 2582 embeddings into rows with the correct NOTE_ID
len(out_df): 1018912
No. nonnull embedding rows in out_df: 413281
merged and filled embeddings for test_out_df


In [70]:
# format like mimic_iv_train
def format_cols(df):
    """Return a reformatted copy of df: helper columns dropped, id columns renamed.

    Drops 'text', 'NOTE_ID', 't_start_DT' and 'INTIME', then renames
    'SUBJECT_ID' -> 'subject' and 'ICUSTAY_ID' -> 'Icustay'.

    The input frame is left untouched. The previous version edited it in place,
    which also changed any other name bound to the same frame.

    Args:
        df: DataFrame containing the four columns to drop.

    Returns:
        A new DataFrame with the remaining columns in their original order.

    Raises:
        KeyError: if one of the columns to drop is missing (pandas' default
            behaviour for ``drop``).
    """
    dropped = df.drop(columns=['text', 'NOTE_ID', 't_start_DT', 'INTIME'])
    return dropped.rename(columns={
        'SUBJECT_ID': 'subject',
        'ICUSTAY_ID': 'Icustay',
    })

# Reformat both splits, train first, and rebind each name to the returned frame.
train_out_df, test_out_df = (
    format_cols(split_df) for split_df in (train_out_df, test_out_df)
)

In [71]:
# Write each split to its output CSV (without the index column), train first.
for split_df, split_outpath in (
    (train_out_df, train_outpath),
    (test_out_df, test_outpath),
):
    split_df.to_csv(split_outpath, index = False)
    print("wrote to", split_outpath)

wrote to /home/ugrads/a/aa_ron_su/data/final2/till_end_mimic_iv_extra_features_train.csv
wrote to /home/ugrads/a/aa_ron_su/data/final2/till_end_mimic_iv_extra_features_test.csv


In [None]:
###################################################################

In [None]:
# Open question: how do I prevent duplicate rows from the merge? Merging on both ICUSTAY_ID and NOTE_ID (as the In [30] cell does) keeps the row count unchanged.

In [None]:
# reading from temivef_train_NOTE_TARGET1_FT_path
# reading from temivef_test_NOTE_TARGET1_FT_path
# concatenating test and test_embeddings_df with shape (2582, 3) and (2580, 64) respectively

In [None]:
test_df_small

In [5]:
train_df_small.ICUSTAY_ID

0        30000484.0
1        30001947.0
2        30002521.0
3        30003202.0
4        30003226.0
            ...    
14568    39996870.0
14569    39998012.0
14570    39999230.0
14571    39999301.0
14572    39999301.0
Name: ICUSTAY_ID, Length: 14573, dtype: float64

In [6]:
train_df_small.ICUSTAY_ID.drop_duplicates()

0        30000484.0
1        30001947.0
2        30002521.0
3        30003202.0
4        30003226.0
            ...    
14567    39996783.0
14568    39996870.0
14569    39998012.0
14570    39999230.0
14571    39999301.0
Name: ICUSTAY_ID, Length: 13632, dtype: float64

In [7]:
14573 - 13632

941

In [None]:
def merge_and_fill_embeddings(df_small, df_big):
    """Attach note embeddings to df_big and spread them within each note.

    NOTE(review): this cell redefines merge_and_fill_embeddings, which an
    earlier cell already defines. Whichever cell runs last wins, so consider
    deleting this one.

    df_small is left-joined onto df_big on (ICUSTAY_ID, NOTE_ID). Joining on
    ICUSTAY_ID alone, as this cell used to, repeats rows for stays that have
    several notes. When both frames carry NOTE_ID it also suffixes that column,
    so the groupby('NOTE_ID') below could not find it.

    After the join, missing embedding values are forward-filled and then
    back-filled within each NOTE_ID group. Groups without any embedding stay NaN.

    Args:
        df_small: DataFrame with ICUSTAY_ID, NOTE_ID and the embedding columns
            (columns whose name starts with 'emb').
        df_big: DataFrame with at least ICUSTAY_ID and NOTE_ID.

    Returns:
        The merged DataFrame. Neither input is modified.
    """
    print(f"BEFORE merge: len = {len(df_big)}")
    out_df = df_big.merge(df_small, on = ['ICUSTAY_ID', 'NOTE_ID'], how = 'left')

    print(f"AFTER merge: len = {len(out_df)}")
    def fill_embedding_na(note_id_group):
        # .ffill()/.bfill() replace fillna(method=...), which pandas deprecated.
        return note_id_group.ffill().bfill()

    # Take the embedding columns from df_small instead of assuming exactly
    # emb0..emb63.
    emb_cols = [col for col in df_small.columns if str(col).startswith('emb')]

    print(f"BEFORE fill na: len = {len(out_df)}")
    if emb_cols:
        out_df[emb_cols] = out_df.groupby('NOTE_ID')[emb_cols].transform(fill_embedding_na) # transform preserves the shape of the original
    print(f"AFTER fill na: len = {len(out_df)}")

    # The first embedding column stands in for "this row has an embedding".
    n_nonnull = int(out_df[emb_cols[0]].notna().sum()) if emb_cols else 0
    print(f"copied {len(df_small)} embeddings into rows with the correct NOTE_ID")
    print(f"len(out_df): {len(out_df)}")
    print(f"No. nonnull embedding rows in out_df: {n_nonnull}")
    return out_df

# Re-run the merge for both splits with the definition from this cell.
train_out_df = merge_and_fill_embeddings(df_small=train_df_small, df_big=train_df_big)
print("merged and filled embeddings for train_out_df")
test_out_df = merge_and_fill_embeddings(df_small=test_df_small, df_big=test_df_big)
print("merged and filled embeddings for test_out_df")



In [None]:
# # format like mimic_iv_train
# def format_cols(df):
#     df.drop(['text', 'NOTE_ID', 't_start_DT','INTIME'], axis=1, inplace=True)
#     df.rename(columns = {
#         'SUBJECT_ID':'subject',
#         'ICUSTAY_ID':'Icustay'
#         }, inplace = True)
#     return df

# train_out_df = format_cols(train_out_df)
# test_out_df = format_cols(test_out_df)

# train_out_df.to_csv(train_outpath, index = False)
# print("wrote to", train_outpath)
# test_out_df.to_csv(test_outpath, index = False)
# print("wrote to", test_outpath)

In [None]:
#############################################################

In [23]:
# NOTE(review): exploratory cell. `embeddings` is never assigned at notebook top
# level in the visible cells (it is only a local inside extract_embeddings), so
# this cell presumably relies on leftover kernel state. Confirm before re-running.
# Builds a DataFrame with one 'emb'-prefixed column per embedding dimension.
embeddings_df = pd.DataFrame(embeddings.detach().numpy()).add_prefix('emb')

# embeddings = classifier.forward(train_inputs, return_embeddings=True)
# embeddings_df = pd.DataFrame({'embedding': list(embeddings.detach().numpy())})
print(f"Extracted {len(embeddings_df)} note embeddings of shape {embeddings_df.shape}") # should be size 64

# # df.reset_index(drop=True, inplace=True)
# #concat notes with the train df
print(f"concatenating train and embeddings_df w shapes {train.shape}, {embeddings_df.shape}")
# Column-wise concat: rows are paired by index, and all of train's columns are kept.
out_df_small = pd.concat([train, embeddings_df], axis = 1)
print(f"out_df shape: {out_df_small.shape}")
print(f"No. nonnull embedding rows: {len(out_df_small[pd.notna(out_df_small['emb1'])])}") # should be 13 (the recorded output of this cell shows 12)
print(f"Concatenated note embeddings to mimic_iv_train_NOTE")
print(f"len(out_df): {len(out_df_small)}")

Extracted 12 note embeddings of shape (12, 64)
concatenating train and embeddings_df w shapes (12, 3), (12, 64)
out_df shape: (12, 67)
No. nonnull embedding rows: 12
Concatenated note embeddings to mimic_iv_train_NOTE
len(out_df): 12


In [4]:
# Scratch: load the train NOTE table; a later scratch cell left-joins out_df_small onto it.
train_df_to_merge = pd.read_csv(temivef_train_NOTE_path)


In [None]:
out_df_small

In [67]:
# Binds a second name to the same DataFrame object; no copy is made.
tmp = out_df_small

In [None]:
tmp

In [91]:
# Scratch: left-join the embeddings onto the train NOTE table using ICUSTAY_ID
# only, after dropping 'text' and 'delta_in_2_days' from the small frame.
# NOTE(review): the recorded output of this scratch run reports len(out_df) =
# 3710957, while the two-key merge cell reports 3710672 rows. The ICUSTAY_ID-only
# join presumably repeats rows for stays with several notes; verify.
out_df = train_df_to_merge.merge(out_df_small.drop(['text', 'delta_in_2_days'], axis = 1), on = 'ICUSTAY_ID', how = 'left')
# out_df = train_df_to_merge.merge(out_df_small, on = 'ICUSTAY_ID', how = 'left')

In [92]:
def fill_embedding_na(note_id_group):
    """Forward-fill, then back-fill, the missing values of one group.

    Works on a Series or a DataFrame. Values are first propagated downwards,
    then any leading gaps are filled from the first known value. A group with
    no known value stays entirely NaN.

    Uses .ffill()/.bfill() because pandas deprecated fillna(method=...).
    """
    return note_id_group.ffill().bfill()

# Names of the 64 embedding columns: emb0 ... emb63.
emb_cols = list(map('emb{}'.format, range(64)))

# Fill missing embedding values per NOTE_ID group. transform returns a result
# with the same rows as out_df, so it can be assigned straight back.
out_df[emb_cols] = (
    out_df
    .groupby('NOTE_ID')[emb_cols]
    .transform(fill_embedding_na)
)

In [93]:
# Summary of the scratch merge: size of the small frame, size of the merged
# frame, and the number of merged rows whose emb0 is non-null.
print(f"copied {len(out_df_small)} notes into rows with the correct NOTE_ID")
print(f"len(out_df): {len(out_df)}")
print(f"No. nonnull embedding rows: {len(out_df[pd.notna(out_df['emb0'])])}")

copied 12 notes into rows with the correct NOTE_ID
len(out_df): 3710957
No. nonnull embedding rows: 2951


In [87]:
# NOTE(review): this binds another name to the same DataFrame object, not a copy.
# In-place edits made through either name show up under both, so this does not
# preserve a snapshot unless out_df is later rebound to a new frame.
chkpt = out_df

In [None]:
out_df.columns.to_list()

In [95]:
# Drops the helper columns from out_df in place. Not re-runnable: a second run
# raises KeyError because the columns are already gone (pandas' default for drop).
out_df.drop(['text', 'NOTE_ID', 't_start_DT','INTIME'], axis=1, inplace=True)


In [96]:
# Renames the id columns of out_df in place: SUBJECT_ID -> subject, ICUSTAY_ID -> Icustay.
out_df.rename(columns = {
    'SUBJECT_ID':'subject',
    'ICUSTAY_ID':'Icustay'
    }, inplace = True)

In [None]:
out_df.columns.tolist()

In [None]:
# train_df_to_merge[train_df_to_merge.ICUSTAY_ID.isin(out_df_small.ICUSTAY_ID)].NOTE_ID.unique()
# out_df[pd.notna(out_df.emb0)].NOTE_ID.unique()
