In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
model_checkpoint = "GanjinZero/biobart-v2-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)



In [3]:
from nltk.tokenize import sent_tokenize
import pandas as pd
import nltk

nltk.download("punkt")
from evaluate import load

rouge_score = load("rouge")
bert_score = load("bertscore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paava\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("itsanmolgupta/mimic-cxr-dataset")
dataset = dataset.filter(lambda x: x["findings"] is not None and x["impression"] is not None)

def preprocess_inputs(examples):
    examples["findings"] = str(examples["findings"]) if examples["findings"] else ""
    examples["impression"] = str(examples["impression"]) if examples["impression"] else ""
    return examples

dataset = dataset.map(preprocess_inputs)



In [5]:

max_input_length = 1024
max_target_length = 300

def preprocess_dataset(examples):
    # Ensure findings and impression are strings
    input_texts = examples["findings"]
    target_texts = examples["impression"]

    # Tokenize inputs
    model_inputs = tokenizer(
        input_texts, max_length=max_input_length, truncation=True, padding="max_length"
    )

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            target_texts, max_length=max_target_length, truncation=True, padding="max_length"
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_dataset = dataset.map(preprocess_dataset, batched=True)
tokenized_dataset.set_format("torch")



In [6]:
# Assuming tokenized_dataset["train"] exists and is a Dataset object
from datasets import Dataset

# Split the 'train' dataset using Hugging Face's train_test_split method
train_dataset = tokenized_dataset["train"]
split_datasets = train_dataset.train_test_split(test_size=0.2)

# Now you have the train and validation datasets
train_dataset = split_datasets["train"].select(range(10))
val_dataset = split_datasets["test"].select(range(10))


In [7]:
import torch
torch.cuda.empty_cache()
with torch.no_grad():
    torch.cuda.empty_cache()
print(torch.cuda.is_available())

True


In [8]:
tokenized_dataset["train"][6]

{'image': tensor([[[ 33,  33,  33,  ...,  33,  33,  33],
          [ 33,  33,  33,  ...,  33,  33,  33],
          [ 33,  33,  33,  ...,  33,  33,  33],
          ...,
          [237, 238, 238,  ...,  41,  42,  42],
          [238, 238, 239,  ...,  41,  42,  42],
          [238, 238, 239,  ...,  41,  42,  42]],
 
         [[ 33,  33,  33,  ...,  33,  33,  33],
          [ 33,  33,  33,  ...,  33,  33,  33],
          [ 33,  33,  33,  ...,  33,  33,  33],
          ...,
          [237, 238, 238,  ...,  41,  42,  42],
          [238, 238, 239,  ...,  41,  42,  42],
          [238, 238, 239,  ...,  41,  42,  42]],
 
         [[ 33,  33,  33,  ...,  33,  33,  33],
          [ 33,  33,  33,  ...,  33,  33,  33],
          [ 33,  33,  33,  ...,  33,  33,  33],
          ...,
          [237, 238, 238,  ...,  41,  42,  42],
          [238, 238, 239,  ...,  41,  42,  42],
          [238, 238, 239,  ...,  41,  42,  42]]], dtype=torch.uint8),
 'findings': 'Portable AP chest radiograph. The lungs 

In [9]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

num_epochs = 3
num_training_steps = 3 * len(train_dataset)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)



In [10]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainerCallback
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
import json
from nltk.tokenize import sent_tokenize
from rouge_score import rouge_scorer

# Constants
num_epochs = 4
batch_size = 1
model_name = 'GanjinZero/biobart-v2-large'

# Define logging steps
logging_steps = 10

# Setup training arguments
args = Seq2SeqTrainingArguments(
    output_dir=f"/content/{model_name}-finetuned-mimiccxr",
    evaluation_strategy="epoch",  # Evaluate after each epoch
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=False,
)

# Compute ROUGE metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    
    # Calculate ROUGE scores for each prediction and reference pair
    rouge_results = [scorer.score(label, pred) for label, pred in zip(decoded_labels, decoded_preds)]

    # Aggregate scores for each metric
    rouge1 = np.mean([result["rouge1"].fmeasure for result in rouge_results])
    rouge2 = np.mean([result["rouge2"].fmeasure for result in rouge_results])
    rougeL = np.mean([result["rougeL"].fmeasure for result in rouge_results])

    return {
        "rouge1": round(rouge1 * 100, 4),
        "rouge2": round(rouge2 * 100, 4),
        "rougeL": round(rougeL * 100, 4),
    }

# Logging callback class for custom logging
class LoggingCallback(TrainerCallback):
    def __init__(self, log_path):
        self.log_path = log_path

    def on_log(self, args, state, control, logs=None, **kwargs):
        _ = logs.pop("total_flos", None)
        if state.is_local_process_zero:
            with open(self.log_path, "a") as f:
                f.write(json.dumps(logs) + "\n")

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Use the validation set for evaluation
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()
trainer.save_model('../models/biobert-trained')

# Save the fine-tuned model and tokenizer

# Evaluate the model on the validation set
evaluate_result = trainer.evaluate()
print(evaluate_result)


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


{'loss': 0.4121, 'grad_norm': 2.703561305999756, 'learning_rate': 4.2e-05, 'epoch': 1.0}


                                               
 25%|██▌       | 10/40 [01:42<04:12,  8.40s/it]

{'eval_loss': 0.18311989307403564, 'eval_rouge1': 14.9609, 'eval_rouge2': 1.6893, 'eval_rougeL': 12.4673, 'eval_runtime': 7.3327, 'eval_samples_per_second': 1.364, 'eval_steps_per_second': 1.364, 'epoch': 1.0}


 50%|█████     | 20/40 [03:04<02:45,  8.30s/it]

{'loss': 0.1187, 'grad_norm': 4.859731197357178, 'learning_rate': 2.8e-05, 'epoch': 2.0}


                                               
 50%|█████     | 20/40 [03:09<02:45,  8.30s/it]

{'eval_loss': 0.17209631204605103, 'eval_rouge1': 13.2717, 'eval_rouge2': 2.8571, 'eval_rougeL': 12.5574, 'eval_runtime': 5.2161, 'eval_samples_per_second': 1.917, 'eval_steps_per_second': 1.917, 'epoch': 2.0}


 75%|███████▌  | 30/40 [04:31<01:22,  8.27s/it]

{'loss': 0.0773, 'grad_norm': 1.498457670211792, 'learning_rate': 1.4e-05, 'epoch': 3.0}


                                               
 75%|███████▌  | 30/40 [04:37<01:22,  8.27s/it]

{'eval_loss': 0.1568511426448822, 'eval_rouge1': 14.2717, 'eval_rouge2': 2.5, 'eval_rougeL': 13.5574, 'eval_runtime': 5.6528, 'eval_samples_per_second': 1.769, 'eval_steps_per_second': 1.769, 'epoch': 3.0}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


{'loss': 0.0426, 'grad_norm': 1.4052495956420898, 'learning_rate': 0.0, 'epoch': 4.0}


                                               
100%|██████████| 40/40 [06:14<00:00,  9.37s/it]
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


{'eval_loss': 0.15562531352043152, 'eval_rouge1': 15.3243, 'eval_rouge2': 2.5, 'eval_rougeL': 14.6101, 'eval_runtime': 5.8136, 'eval_samples_per_second': 1.72, 'eval_steps_per_second': 1.72, 'epoch': 4.0}
{'train_runtime': 374.9626, 'train_samples_per_second': 0.107, 'train_steps_per_second': 0.107, 'train_loss': 0.1626589886844158, 'epoch': 4.0}


100%|██████████| 10/10 [00:05<00:00,  1.89it/s]

{'eval_loss': 0.15562531352043152, 'eval_rouge1': 15.3243, 'eval_rouge2': 2.5, 'eval_rougeL': 14.6101, 'eval_runtime': 6.0671, 'eval_samples_per_second': 1.648, 'eval_steps_per_second': 1.648, 'epoch': 4.0}





In [11]:
model.save_pretrained('../models/biobert-trained')
tokenizer.save_pretrained('../models/biobert-trained')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


('../models/biobert-trained\\tokenizer_config.json',
 '../models/biobert-trained\\special_tokens_map.json',
 '../models/biobert-trained\\vocab.json',
 '../models/biobert-trained\\merges.txt',
 '../models/biobert-trained\\added_tokens.json',
 '../models/biobert-trained\\tokenizer.json')

In [12]:
evalute_result=trainer.evaluate()

100%|██████████| 10/10 [00:05<00:00,  1.98it/s]


In [13]:
evalute_result

{'eval_loss': 0.15562531352043152,
 'eval_rouge1': 15.3243,
 'eval_rouge2': 2.5,
 'eval_rougeL': 14.6101,
 'eval_runtime': 5.9891,
 'eval_samples_per_second': 1.67,
 'eval_steps_per_second': 1.67,
 'epoch': 4.0}

In [14]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained('../models/biobert-trained')
tokenizer = AutoTokenizer.from_pretrained('../models/biobert-trained')

input_text = "computed tomography of the abdomen showing the presence of the hypoplasia in the lower parietal section of the abdomen"

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
# Generate the output sequence (prediction)
output = model.generate(
    inputs['input_ids'],
    num_beams=4,
    max_length=50,  # Adjust as necessary
    early_stopping=True
)

# Decode the generated output into text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


No evidence of acute disease.
