In [1]:
import pandas as pd
import numpy as np

In [None]:
# Placeholder locations of the two CSV files; point these at real files
# before running the notebook.
mimicPath = '/YOUR/MIMICCXR/PATH/mimic.csv'
openIPath = '/YOUR/OPENI/PATH/openI.csv'

# Each file is loaded with its first column used as the DataFrame index.
mimic = pd.read_csv(filepath_or_buffer=mimicPath, index_col=0)
openI = pd.read_csv(filepath_or_buffer=openIPath, index_col=0)

In [3]:
# Show the first row of the frame loaded from mimicPath as a quick sanity check.
mimic.head(n=1)

Unnamed: 0,Cardiomegaly,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,TXT
42232,0.0,1.0,0.0,0.0,0.0,0.0,0.0,the lungs are well expanded. equivocal mild in...


In [4]:
# Show the first row of the frame loaded from openIPath as a quick sanity check.
openI.head(n=1)

Unnamed: 0,Cardiomegaly,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,TXT
222713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,The cardiac silhouette and mediastinum size ar...


In [5]:
# Label columns selected from both DataFrames. The list order fixes the
# column position of each label in the label arrays and in the per-label
# metrics computed later in this notebook.
cols = [
    'Cardiomegaly',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
]

In [6]:
from transformers import LongformerTokenizerFast
# Load the fast tokenizer published with the 'yikuan8/Clinical-Longformer'
# checkpoint (the same checkpoint name is used for the model further down).
ltokenizer = LongformerTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='yikuan8/Clinical-Longformer',
)

In [7]:
# Tokenize every report text up front. Both splits use identical settings:
# truncation enabled, padding enabled, and a 1024-token maximum length.
train_encodings = ltokenizer(
    mimic['TXT'].tolist(),
    truncation=True,
    padding=True,
    max_length=1024,
)
test_encodings = ltokenizer(
    openI['TXT'].tolist(),
    truncation=True,
    padding=True,
    max_length=1024,
)

In [8]:
import torch

class ReadDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping from field name to a sequence that can be indexed
            by sample position (only ``.items()`` and integer indexing are
            used here).
        labels: Sequence indexable by sample position; its length defines
            the size of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return the number of samples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor under its own key and
        the sample's labels are stored under ``'labels'``.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split: tokenized texts plus the label columns (selected in the
# order given by `cols`) as a NumPy array.
train_dataset = ReadDataset(encodings=train_encodings, labels=mimic[cols].values)
test_dataset = ReadDataset(encodings=test_encodings, labels=openI[cols].values)

In [9]:
from torch import nn
from transformers import Trainer

class MultilabelTrainer(Trainer):
    """Trainer that scores each label independently with BCE-with-logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the multi-label binary cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Mapping of model inputs that also carries the targets
                under the ``"labels"`` key. It is not modified.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted only for compatibility with newer
                ``transformers`` releases, which pass this argument to
                ``compute_loss``; it is not used by this loss.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # Forward without the labels: the loss is computed below, so letting
        # the model derive its own internal loss from them would be redundant
        # work whose result is discarded. A filtered copy is used so the
        # caller's `inputs` mapping keeps its "labels" entry.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get('logits')
        num_labels = self.model.config.num_labels
        loss_fct = nn.BCEWithLogitsLoss()
        # Both tensors are reshaped to (-1, num_labels); labels are cast to
        # float because BCEWithLogitsLoss expects floating-point targets.
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

In [10]:
import numpy as np
from sklearn.metrics import roc_auc_score
def sigmoid(x):
    """Logistic sigmoid of ``x``, evaluated as ``exp(-logaddexp(0, -x))``.

    Going through ``np.logaddexp`` computes ``log(1 + exp(-x))`` without
    forming ``exp(-x)`` directly.
    """
    log_one_plus_exp_neg = np.logaddexp(0, -x)
    return np.exp(-log_one_plus_exp_neg)

def compute_metrics(eval_preds):
    """Compute one ROC AUC score per label column.

    Args:
        eval_preds: A pair ``(logits, labels)``; both are indexed as 2-D
            arrays whose column ``i`` corresponds to ``cols[i]``.

    Returns:
        Dict mapping each name in ``cols`` to the ROC AUC of that column,
        scored on the sigmoid of the logits.
    """
    logits, labels = eval_preds
    return {
        name: roc_auc_score(labels[:, col_idx], sigmoid(logits[:, col_idx]))
        for col_idx, name in enumerate(cols)
    }

In [None]:
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, LongformerForSequenceClassification

# Fine-tuning configuration. The output directory is a placeholder and must
# be replaced with a real path before running.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` and `fp16_backend` to
# `half_precision_backend` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='/YOUR/OUTPUT/DIR/',   # where the Trainer writes its outputs
    learning_rate=2e-5,
    num_train_epochs=10,              # total passes over the training set
    per_device_train_batch_size=4,    # training batch size per device
    per_device_eval_batch_size=8,     # evaluation batch size per device
    evaluation_strategy='epoch',      # evaluate at the end of every epoch
    fp16=True,                        # mixed-precision training
    fp16_backend="amp",
)

# Sequence-classification head with one output per entry in `cols`.
model = LongformerForSequenceClassification.from_pretrained(
    'yikuan8/Clinical-Longformer',
    num_labels=len(cols),
)

# Note that the per-epoch evaluation set passed here is `test_dataset`.
trainer = MultilabelTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()