In [1]:
import pandas as pd
import os
import torch.nn as nn
import pandas as pd
import datasets
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig, T5Config
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import wandb
import os
# Expose only GPUs 1-4 to this process (four devices).
# NOTE(review): this is set after `import torch` above; presumably CUDA has not been
# initialised yet at this point, so the restriction still takes effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4"

In [2]:
# Instantiate a T5Config with the library defaults and display it for inspection.
# NOTE: `config` is reassigned further down when the classifier configuration is built,
# so this default instance is only used for the display below.
config = T5Config()
config

T5Config {
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_size": 32128
}

In [3]:
# --- Model locations -------------------------------------------------------
# Root directory of the Clinical-T5 release; the checkpoint folders live below it.
root = '/home/ugrads/a/aa_ron_su/physionet.org/files/clinical-t5/1.0.0/'
model_path = f'{root}Clinical-T5-Base/'
finetune_model_path = f'{root}Clinical-T5-Base_ft_vent/'

# Short model identifier; the output directory name is derived from it.
model_name = "Clinical-T5-Base"
out_dir = f"{model_name}_out"

# --- Data locations --------------------------------------------------------
data_path = '/data/datasets/mimiciv_notes/physionet.org/files/mimic-iv-note/2.2/note/discharge.csv'
temivef_train_NOTE_TARGET1_FT_path = '/home/ugrads/a/aa_ron_su/JSS_SUBMISSION_NEW/data/till_end_mimic_iv_extra_features_train_NOTE_TARGET1_FT_rad.csv'

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5Config
from T5EncoderForSequenceClassification import T5EncoderForSequenceClassification

# Load tokenizer and weights from the absolute checkpoint directory (`model_path`)
# instead of the bare name in `model_name`: the bare name only resolves when the
# notebook's working directory happens to be the Clinical-T5 release root.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
encoder = model.get_encoder() # we only need the clinical-t5 encoder for our purposes

# Configuration consumed by the classification head wrapper.
config = T5Config(
    hidden_size=768,            # width fed into the head's first dense layer
    classifier_dropout=None,
    num_labels=2,               # binary target
    hidden_dropout_prob=0.01,
    last_hidden_size=64,        # width of the head's penultimate layer
    # NOTE(review): whether this flag actually turns on activation checkpointing in the
    # already-built `encoder` depends on T5EncoderForSequenceClassification, which is
    # not visible here — confirm, since the recorded training run ended in CUDA OOM.
    gradient_checkpointing=True
)
classifier = T5EncoderForSequenceClassification(encoder, config)



In [5]:
# Load the training table (notes plus target column) from CSV into a DataFrame.
train = pd.read_csv(temivef_train_NOTE_TARGET1_FT_path)
# Note: this message is emitted only after the read above has completed.
print(f"reading notes and target from {temivef_train_NOTE_TARGET1_FT_path}")

reading notes and target from /home/ugrads/a/aa_ron_su/JSS_SUBMISSION_NEW/data/till_end_mimic_iv_extra_features_train_NOTE_TARGET1_FT_rad.csv


In [6]:
def group_train_test(ID, train_frac=.80, seed=None):
    """Split rows into train/validation so that every ID lands entirely on one side.

    The unique IDs are shuffled and the first ``train_frac`` share of them
    (rounded down) becomes the training group; all rows carrying one of those
    IDs go to train, the remaining rows go to validation.

    Parameters
    ----------
    ID : pandas.Series
        One group identifier per row; values are cast to ``int``.
    train_frac : float, default 0.80
        Fraction of *unique IDs* (not rows) assigned to the training side.
    seed : int or None, default None
        When given, the shuffle uses a private ``RandomState(seed)`` so the split
        is reproducible. When ``None`` the global NumPy RNG is used, which is the
        original, non-reproducible behaviour.

    Returns
    -------
    tuple[list, list]
        ``(train_index, val_index)`` — the *index labels* of ``ID`` for each side.
        These coincide with positional indices only when ``ID`` has a default
        ``RangeIndex``; callers using ``.iloc`` rely on that.
    """
    ID             = ID.astype(int)
    ID_unique_srtd = np.unique(ID)  # fresh array, safe to shuffle in place
    if seed is None:
        np.random.shuffle(ID_unique_srtd)
    else:
        np.random.RandomState(seed).shuffle(ID_unique_srtd)

    num_train_ids = int(train_frac * len(ID_unique_srtd))
    train_ids = ID_unique_srtd[:num_train_ids]
    val_ids = ID_unique_srtd[num_train_ids:]

    train = ID[ID.isin(train_ids)]
    val = ID[ID.isin(val_ids)]

    # Sanity checks: every row and every unique ID is assigned to exactly one side.
    assert len(train) + len(val) == len(ID)
    assert len(train_ids) + len(val_ids) == len(ID_unique_srtd)

    return list(train.index), list(val.index)

# Split by ICU stay: all rows sharing an ICUSTAY_ID end up on the same side of the split.
train_idxs, val_idxs = group_train_test(train['ICUSTAY_ID'])

In [1]:
# NOTE: this rebinds the name `Dataset` (previously torch.utils.data.Dataset from the
# import cell) to the Hugging Face `datasets.Dataset` class.
from datasets import Dataset

# Rename the outcome column to 'label' for the downstream training code.
target = 'delta_in_2_days'
train = train.rename(columns = {target:'label'})

train_data = train.iloc[train_idxs]
val_data = train.iloc[val_idxs]

train_data = Dataset.from_pandas(train_data).select_columns(['text', 'label'])
val_data = Dataset.from_pandas(val_data).select_columns(['text', 'label'])

cache_dir = f'{out_dir}/data_cache'
tokenized_train_path = f'{cache_dir}/tokenized_train_data'
tokenized_val_path = f'{cache_dir}/tokenized_val_data'

# Re-tokenize unless BOTH cached datasets are present. Checking only the parent
# directory would crash on load if an earlier run was interrupted between saves.
if not (os.path.exists(tokenized_train_path) and os.path.exists(tokenized_val_path)):
    # define a function that will tokenize the text, and will return the relevant inputs for the model
    def tokenization(batched_text):
        """Tokenize a batch of notes, padded/truncated to 512 tokens."""
        return tokenizer(batched_text['text'], padding = 'max_length', truncation=True, max_length = 512)

    # Process in roughly ten batches; max(1, ...) keeps batch_size valid for fewer than 10 rows.
    train_data = train_data.map(tokenization, batched = True, batch_size = max(1, len(train_data) // 10))
    val_data = val_data.map(tokenization, batched = True, batch_size = max(1, len(val_data) // 10))

    train_data.save_to_disk(tokenized_train_path)
    val_data.save_to_disk(tokenized_val_path)

else:
    # NOTE: the cached datasets replace the ones built above, so they reflect the
    # train/val split that was in effect when the cache was written, not the split
    # computed in this session.
    print('loading train, val from', f'{cache_dir}/')
    train_data = Dataset.load_from_disk(tokenized_train_path)
    val_data = Dataset.load_from_disk(tokenized_val_path)

train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
val_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# The raw text is no longer needed once the token ids exist.
train_data = train_data.remove_columns('text')
val_data = val_data.remove_columns('text')

NameError: name 'train' is not defined

In [8]:
# define accuracy metrics
def compute_metrics(pred):
    """Compute binary-classification metrics for a prediction object.

    Ground truth is read from ``pred.label_ids``; the predicted class is the
    argmax over the last axis of ``pred.predictions``.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``;
        precision/recall/F1 are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element returned (support) is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precision, recall, f1 = prf[:3]
    acc = accuracy_score(y_true, y_pred)

    return dict(accuracy=acc, f1=f1, precision=precision, recall=recall)

In [9]:
# Training configuration handed to the Trainer.
# NOTE(review): the recorded run with these settings reports a total train batch size
# of 64 and then fails with CUDA out-of-memory on a ~11 GiB device — memory-related
# settings here are the first place to look.
training_args = TrainingArguments(
    # --- run identity and output locations ---
    run_name = 't5_radiology_run1',
    output_dir = f'{out_dir}/results',
    logging_dir = f'{out_dir}/logs',

    # --- optimisation schedule ---
    num_train_epochs = 5,
    warmup_steps = 200,
    weight_decay = 0.01,

    # --- batching / precision ---
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 8,
    per_device_eval_batch_size = 4,
    dataloader_num_workers = 0,
    fp16 = True,

    # --- evaluation and checkpointing, once per epoch ---
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,

    # --- progress reporting ---
    logging_steps = 8,
    disable_tqdm = False,
)

In [10]:
# Wire the classifier, datasets, metrics and training arguments into a Trainer.
trainer = Trainer(
    model = classifier,
    args = training_args,
    train_dataset = train_data,
    eval_dataset = val_data,
    compute_metrics = compute_metrics,
)

# Report whether a CUDA device is available (displayed as the cell output).
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

Using cuda_amp half precision backend


'cuda'

In [12]:
import wandb

# Start the W&B run explicitly and give it the run name configured in TrainingArguments.
# A run created here is reused by the Trainer's W&B callback as-is, so without `name=`
# the configured `run_name` is never applied to the run.
wandb.init(name=training_args.run_name)
print(wandb.run.get_url())

# Launch fine-tuning; metrics are evaluated according to `training_args`.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maa_ron_su[0m ([33maaron_team[0m). Use [1m`wandb login --relogin`[0m to force relogin


***** Running training *****
  Num examples = 60967
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 4760
  Number of trainable parameters = 110258498
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


https://wandb.ai/aaron_team/1.0.0/runs/t6vit1ae




Epoch,Training Loss,Validation Loss


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/ugrads/a/aa_ron_su/miniconda3/envs/clinical1/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/ugrads/a/aa_ron_su/miniconda3/envs/clinical1/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ugrads/a/aa_ron_su/physionet.org/files/clinical-t5/1.0.0/T5EncoderForSequenceClassification.py", line 70, in forward
    encoder_outputs = self.encoder(
  File "/home/ugrads/a/aa_ron_su/miniconda3/envs/clinical1/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ugrads/a/aa_ron_su/miniconda3/envs/clinical1/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 1040, in forward
    layer_outputs = layer_module(
  File "/home/ugrads/a/aa_ron_su/miniconda3/envs/clinical1/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ugrads/a/aa_ron_su/miniconda3/envs/clinical1/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 673, in forward
    self_attention_outputs = self.layer[0](
  File "/home/ugrads/a/aa_ron_su/miniconda3/envs/clinical1/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ugrads/a/aa_ron_su/miniconda3/envs/clinical1/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 579, in forward
    attention_output = self.SelfAttention(
  File "/home/ugrads/a/aa_ron_su/miniconda3/envs/clinical1/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ugrads/a/aa_ron_su/miniconda3/envs/clinical1/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 542, in forward
    attn_weights = nn.functional.dropout(
  File "/home/ugrads/a/aa_ron_su/miniconda3/envs/clinical1/lib/python3.10/site-packages/torch/nn/functional.py", line 1252, in dropout
    return _VF.dropout_(input, p, training) if inplace else _VF.dropout(input, p, training)
RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 10.92 GiB total capacity; 9.99 GiB already allocated; 81.44 MiB free; 10.31 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


In [22]:
# from torch.utils.data import DataLoader, TensorDataset, RandomSampler

# train_inputs = train_tensor
# train_labels = torch.tensor(train['delta_in_2_days'].to_numpy())
# train_dataset = TensorDataset(train_inputs.to(device), train_labels.to(device)) 

In [24]:
# from torch.utils.data import Subset
# batch_size = 4


# for fold, (train_idx, val_idx) in enumerate(kfold):
#     print(train_idx.shape, val_idx.shape)
#     val_set = Subset(train_dataset, val_idx)
#     val_dataloader = DataLoader(val_set, batch_size=batch_size, shuffle=True)
#     train_set = Subset(train_dataset, train_idx)
#     train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
#     print(len(data_loader))

(9723,) (4850,)
(9720,) (4853,)
(9703,) (4870,)


In [15]:
# notes_to_extract = df['text']
# texts = notes_to_extract.tolist()
# tokenized_notes_to_extract = tokenizer(texts, truncation=True, padding=True, return_tensors = "pt")

# # test_note_to_extract = notes_to_extract.iloc[0]
# # tokenized_test_note = tokenize_function(test_note_to_extract)
# # tokenized_test_note.keys()

In [23]:
# %load_ext autoreload
# %autoreload 2

In [36]:
# from T5EncoderForSequenceClassification import T5EncoderForSequenceClassification, T5EncoderClassificationHead

In [None]:
# classifier.named_parameters

In [6]:
# import torch
# import torch.nn as nn
# from torch.nn.utils import parameters_to_vector

# num_params = len(parameters_to_vector(encoder.parameters()))

In [7]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device

device(type='cuda')

In [None]:
# classifier.to(device)

In [15]:
# num_params

109618560

In [16]:
# len(parameters_to_vector(classifier.classifier.parameters()))

639938

In [None]:
# classifier.encoder.named_parameters

In [None]:
# classifier.classifier

T5EncoderClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (last_dense): Linear(in_features=768, out_features=64, bias=True)
  (dropout): Dropout(p=0.01, inplace=False)
  (out_proj): Linear(in_features=64, out_features=2, bias=True)
)

In [22]:
# from torch.utils.data import DataLoader, TensorDataset, RandomSampler
# from transformers import AdamW, get_linear_schedule_with_warmup
# import numpy as np
# import torch

# # Define your training data
# train_inputs = tokenized_notes_to_extract.input_ids
# # train_labels = torch.tensor(np.random.rand(len(train_inputs)))
# train_labels = torch.tensor(df['delta_in_2_days'].to_numpy())
# train_dataset = TensorDataset(train_inputs.to(device), train_labels.to(device))

In [27]:
# train_dataset[0][0].device.type

'cuda'

In [10]:
# batch_size = 32
# train_sampler = RandomSampler(train_dataset)
# train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

In [27]:
# batch_size = 16
# num_epochs = 1
# learning_rate = 5e-5
# adam_epsilon = 1e-8
# max_grad_norm = 1.0

# train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

In [28]:
# import torch.optim as optim

# criterion = nn.MSELoss()
# optimizer = optim.AdamW(classifier.classifier.parameters(), lr=learning_rate, eps=adam_epsilon)

In [29]:
# total_steps = len(train_dataloader) * num_epochs
# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=0,
#     num_training_steps=total_steps
# )

In [30]:
# Freeze the encoder so that only the classification head receives gradient updates.
# NOTE(review): in document order this cell comes after `trainer.train()`, and the
# recorded training log lists encoder + head parameters as trainable — move it ahead
# of training if a frozen encoder is intended.
for encoder_weight in classifier.encoder.parameters():
    encoder_weight.requires_grad = False

In [None]:
# from tqdm import tqdm

# classifier.classifier.train()
# for epoch in tqdm(range(num_epochs)):
#     for step, batch in enumerate(train_dataloader):
#         inputs, labels = batch
#         optimizer.zero_grad()
#         outputs = classifier.forward(inputs, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(classifier.classifier.parameters(), max_grad_norm)
#         optimizer.step()
#         scheduler.step()

#         # Print progress every 10 steps
#         if step % 10 == 0:
#             print(f"Epoch {epoch + 1}/{num_epochs} | Step {step}/{len(train_dataloader)} | Loss {loss.item():.4f}")

### EXTRACT NOTES

In [41]:
# single_embedding = classifier.forward(train_inputs[0:2], labels = train_labels[0:2], return_embeddings=True)

torch.Size([512])

In [14]:
# embeddings = classifier.forward(train_inputs, labels = train_labels, return_embeddings=True)
# embeddings_df = pd.DataFrame({'embedding': list(embeddings.detach().numpy())})
# df.reset_index(drop=True, inplace=True)
# df = pd.concat([df, embeddings_df], axis = 1)
# df.to_csv(mimic_iv_train_NOTE_EMBEDDINGS_path, index = False)

In [25]:
# embeddings_df = pd.DataFrame({'embedding': list(embeddings.detach().numpy())})

In [47]:
# df_small = df.iloc[0:5].copy().reset_index(drop=True)

In [48]:
# df_small = pd.concat([df_small, embeddings_df], axis = 1)

In [None]:
# Merge embeddings with dataframe

In [54]:
# embeddings[0].shape

torch.Size([768])

In [53]:
# df_small

Unnamed: 0,SUBJECT_ID,ICUSTAY_ID,t_start,t_end,NOTE_ID,text,delta,Capillary refill rate_1.0,Capillary refill rate_nan,Ethnicity_1.0,...,Age,Height,Weight,#past_IVs,t_from_last_IV_t_start,t_from_last_IV_t_end,INTIME,t_start_DT,time_since_note,embedding
0,10248673,33680639,24.0,24.388,10248673-DS-5,\nName: ___ Unit No: ___...,0,0,1,0,...,69.516622,168.0,66.0,1,18.25,11.167,2177-06-20 13:36:43,2177-06-21 13:36:43.000000000,901.611944,"[0.08366300067315613, -0.09525524751737473, 0...."
1,10248673,33680639,24.388,25.038,10248673-DS-5,\nName: ___ Unit No: ___...,0,0,1,0,...,69.516622,168.0,66.0,1,19.25,12.167,2177-06-20 13:36:43,2177-06-21 13:59:59.800000000,901.999944,"[0.09037796095801452, -0.1034607368093436, 0.2..."
2,10248673,33680639,25.038,25.388,10248673-DS-5,\nName: ___ Unit No: ___...,0,0,1,0,...,69.516622,168.0,66.0,1,19.9,12.817,2177-06-20 13:36:43,2177-06-21 14:38:59.800000000,902.649944,"[0.10461136858709813, -0.08170781207205698, 0...."
3,10248673,33680639,25.388,25.421,10248673-DS-5,\nName: ___ Unit No: ___...,0,0,1,0,...,69.516622,168.0,66.0,1,20.25,13.167,2177-06-20 13:36:43,2177-06-21 14:59:59.800000000,902.999944,"[0.09262458924580079, -0.1298178264939714, 0.2..."
4,10248673,33680639,25.421,26.388,10248673-DS-5,\nName: ___ Unit No: ___...,0,0,1,0,...,69.516622,168.0,66.0,1,20.283,13.2,2177-06-20 13:36:43,2177-06-21 15:01:58.600000000,903.032944,"[0.09782734270128297, -0.11514283711793492, 0...."


In [None]:
# 

In [39]:
# outputs['classifier_last_hidden_state'][0].shape
# outputs['logits'][0]


tensor([0.0552], grad_fn=<SelectBackward0>)

In [184]:
# encoder_with_dense = T5EncoderWithDense(encoder = encoder, num_classes = 1)

In [186]:
# dense_layer_outputs = encoder_with_dense(inputs.input_ids)

In [188]:
# encoder_with_dense.hidden_states

[tensor([[0.0930]])]

In [187]:
# dense_layer_outputs

tensor([[0.0930]], grad_fn=<AddmmBackward0>)

In [178]:
# dense(pooled_output)

tensor([[-0.0374]], grad_fn=<AddmmBackward0>)

In [44]:
# # Freeze the weights of the encoder layers
# for param in model_encoder_only.parameters():
#     param.requires_grad = False

In [None]:
# from datasets import load_dataset
# # dataset = load_dataset('csv', data_files=mimic_iv_train_NOTE_path, split='train') # split = 'train
# df = pd.read_csv(mimic_iv_train_NOTE_path)[['NOTE_ID', 'text']]
# df.drop_duplicates(inplace=True)
# df.dropna(inplace=True) 
# df

In [136]:
# import torch
# inputs = tokenized_test_note
# outputs = tokenized_test_note
# outputs = model(input_ids = inputs.input_ids, attention_mask = inputs.attention_mask, decoder_input_ids = inputs.input_ids)

In [138]:
# outputs.keys()

odict_keys(['logits', 'past_key_values', 'encoder_last_hidden_state'])

In [None]:
# # import torch
# inputs = tokenized_test_note
# labels = torch.tensor([1]).unsqueeze(0)
# outputs = model_encoder_only

NameError: name 'test_note_to_encode' is not defined

In [90]:
# generated_outputs = []
# labels = torch.tensor([1]).unsqueeze(0)

# for i, row in notes_to_extract.iterrows():
#     input_text = row['text']
#     input_ids = tokenizer.encode(input_text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
#     outputs = model(**inputs, labels=labels)
#     generated_outputs.append(outputs)

NameError: name 'torch' is not defined

In [54]:
# import numpy as np

# # Generate a random target dataset with n = 1000
# n = 1000
# target_df = np.random.rand(n)