In [None]:
# Mount Google Drive so later cells can read files under /content/drive
# (the zipped images, the captions CSV and the model checkpoint).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install evaluate
!pip install rouge_score
!pip install bert_score

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [9

In [None]:
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
device

device(type='cuda')

In [None]:
def count_params(model):
  """Return the total number of elements in the trainable parameters of `model`.

  Only parameters whose `requires_grad` flag is set are counted.
  """
  trainable = 0
  for param in model.parameters():
    if param.requires_grad:
      trainable += param.numel()
  return trainable

# Module

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, BeitImageProcessor, AutoModel

# Vision: Hub checkpoint id for the image side, plus the matching image processor
# (used later in the dataset's collate_fn to turn PIL images into `pixel_values`).
vision_module = 'microsoft/beit-base-patch16-224-pt22k-ft22k'
feature_extractor = BeitImageProcessor.from_pretrained(vision_module)

# Language: Hub checkpoint id for the text side, plus its tokenizer
# (used later to tokenize captions and to decode generated token ids).
language_module = 'luqh/ClinicalT5-base'
tokenizer = T5Tokenizer.from_pretrained(language_module)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# DataSet, DataLoader

## Unzipping and Loading Images to a folder

In [None]:
# !unzip '/content/drive/MyDrive/NCKH/DS200_XuLyAnhYKhoa/Data/Images/train_images.zip'
!unzip '/content/drive/MyDrive/NCKH/DS200_XuLyAnhYKhoa/Data/Images/valid_images.zip'
# !unzip '/content/drive/MyDrive/NCKH/DS200_XuLyAnhYKhoa/Data/Images/test_images.zip'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004973.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004974.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004975.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004976.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004977.jpg  
 extracting: valid/ImageCLEFmedical_Caption_2024_valid_004978.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004979.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004980.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004981.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004982.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004983.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004984.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_valid_004985.jpg  
  inflating: valid/ImageCLEFmedical_Caption_2024_va

## Building Dataset, DataLoader

In [None]:
import glob
import torch
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from PIL import Image
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import torchvision.transforms as transforms

class ImageCLEF(Dataset):
  """Image/caption dataset built from a folder of images and a captions CSV.

  Every sample is a dict with the keys 'image_id', 'path' and 'captions'.
  Images are only opened, and captions only tokenized, inside `collate_fn`,
  so an instance must be paired with a DataLoader that uses that method.

  Args:
    tokenizer: callable text tokenizer applied to the raw captions in `collate_fn`.
    feature_extractor: callable image processor applied to the PIL images in `collate_fn`.
    image_folder: folder whose files are the images; the part of the file name
      before the first '.' is used as the image id.
    data_csv_path: CSV file with an 'ID' and a 'Caption' column.
    cuis_csv_path: accepted for interface compatibility; not used by this class.
    data: pre-built list of sample dicts. When given, nothing is read from disk.
  """

  def __init__(self, tokenizer = None, feature_extractor = None, image_folder = None, data_csv_path = None, cuis_csv_path = None, data=None):

    self.tokenizer = tokenizer
    self.feature_extractor = feature_extractor
    # If data is None, all the other arguments needed to load it must be given.
    assert (tokenizer is not None
            and feature_extractor is not None
            and image_folder is not None
            and data_csv_path is not None) or data is not None, "All other arguments must be passed if data is None!"

    if data is None:
      self.load_data(image_folder, data_csv_path)
    else:
      self.data = data

  def load_data(self, image_folder, data_csv_path):
    """Populate `self.data` with one sample per file found in `image_folder`.

    Raises:
      ValueError: if an image id has no row, or more than one row, in the CSV.
    """
    self.data = []

    data_csv = pd.read_csv(data_csv_path)

    # Build the ID -> caption lookup once instead of filtering the whole frame
    # for every image (which made loading quadratic in the dataset size).
    ids = data_csv['ID']
    duplicated_ids = set(ids[ids.duplicated(keep=False)])
    caption_by_id = dict(zip(ids, data_csv['Caption']))

    image_paths = glob.glob(image_folder + '/*')

    for path in tqdm(image_paths):

      # Image id = file name up to its first '.'
      image_id = path.split('/')[-1].split('.')[0]

      # Exactly one caption row is required per image.
      if image_id not in caption_by_id or image_id in duplicated_ids:
        raise ValueError(f"Expected exactly one caption row for image id {image_id!r} in {data_csv_path}")

      sample = {
          'image_id': image_id,
          'path' : path,
          'captions': caption_by_id[image_id],
      }

      self.data.append(sample)

  def __getitem__(self, idx):
    """Return a fresh dict holding the id, path and caption of sample `idx`."""
    sample = self.data[idx]

    return {
      'image_id': sample['image_id'],
      'path' : sample['path'],
      'captions': sample['captions'],
    }

  def split_data(self, validation_size, random_state=42):
    """Split the samples into a (train, validation) pair of ImageCLEF datasets.

    Both halves inherit the tokenizer AND the feature extractor of this
    instance; without the feature extractor their `collate_fn` cannot run.
    """
    train_data, val_data = train_test_split(self.data,
                                            test_size=validation_size,
                                            random_state=random_state)

    return (ImageCLEF(tokenizer=self.tokenizer, feature_extractor=self.feature_extractor, data=train_data),
            ImageCLEF(tokenizer=self.tokenizer, feature_extractor=self.feature_extractor, data=val_data))

  def __len__(self):
    """Number of samples."""
    return len(self.data)

  def collate_fn(self, batch):
    """Turn a list of samples into a model-ready batch.

    Returns:
      dict with
        'ids'            : list of image ids,
        'raw_captions'   : list of caption strings,
        'pixel_values'   : tensor produced by the feature extractor,
        'labels'         : token ids of the captions (padded, truncated to 128),
        'attention_mask' : attention mask of the tokenized captions.
    """
    # convert() returns an independent in-memory image, so the underlying file
    # can be closed right away instead of staying open until garbage collection.
    images = []
    for each in batch:
      with Image.open(each['path']) as image_file:
        images.append(image_file.convert('RGB'))

    raw_captions = [each['captions'] for each in batch]
    image_ids = [each['image_id'] for each in batch]

    extracted_images = self.feature_extractor(images = images, return_tensors = 'pt')
    tokenized_captions = self.tokenizer(raw_captions, padding = True, truncation = True, max_length = 128, return_tensors = 'pt')

    sample = {
      'ids' : image_ids,
      'raw_captions' : raw_captions,
      'pixel_values' : extracted_images.pixel_values, # tensor
      'labels' : tokenized_captions.input_ids, # tensor
      'attention_mask' : tokenized_captions.attention_mask
    }

    return sample

## Load Data

In [None]:
# Build the evaluation dataset from the extracted validation images and their captions CSV.
data = ImageCLEF(tokenizer = tokenizer,
                feature_extractor = feature_extractor,
                image_folder ='/content/valid/',
                data_csv_path = '/content/drive/MyDrive/NCKH/DS200_XuLyAnhYKhoa/Data/valid_captions.csv',
                )

  0%|          | 0/9972 [00:00<?, ?it/s]

In [None]:
from torch.utils.data import DataLoader
# Fixed batch order (shuffle=False). Batches are assembled by the dataset's own
# collate_fn, which opens the images and tokenizes the captions.
dataloader = DataLoader(data, batch_size=16, shuffle=False, collate_fn=data.collate_fn)

In [None]:
# Number of batches the loader yields for the whole dataset.
len(dataloader)

624

# Model Architecture

## Definition

In [None]:
from BeiT_T5_NoConcepts import BeIT_T5

In [None]:
# Instantiate BeIT_T5 (imported from BeiT_T5_NoConcepts) from the vision and
# language checkpoint ids defined in the "Module" section.
model = BeIT_T5(vision_module = vision_module,
                language_module = language_module,
                device= device)

  pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)
All Flax model weights were used when initializing T5ForConditionalGeneration.

Some weights of T5ForConditionalGeneration were not initialized from the Flax model and are newly initialized: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Restore the saved weights from the Drive checkpoint, mapping tensors onto the active device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust;
# consider passing weights_only=True if the installed torch version supports it — TODO confirm.
model.load_state_dict(torch.load('/content/drive/MyDrive/NCKH/DS200_XuLyAnhYKhoa/CheckPoints/State_dict_BeIT5_Epoch_2_3400.pth', map_location = torch.device(device)))

<All keys matched successfully>

# Prediction

## Compute Metrics

In [None]:
import evaluate
import numpy as np

# Load each metric once; compute_metrics() reuses these module-level objects.
meteor = evaluate.load('meteor')
rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
def calculate_mean(numbers):
    """Return the arithmetic mean of `numbers` (their sum divided by their count)."""
    return sum(numbers) / len(numbers)

def compute_metrics(pred_ans, ground_t):
  """Score a batch of predicted captions against their references.

  Prints both lists, then returns a NumPy array laid out as
  [BERTScore F1 mean, BLEU-1, BLEU-2, BLEU-3, BLEU-4, ROUGE-L, METEOR].
  Uses the module-level metric objects and the module-level `device`.
  """
  print(f'Prediction : {pred_ans}')
  print(f'Ground_truth : {ground_t}')
  print('       ')

  pair = dict(predictions=pred_ans, references=ground_t)

  # BLEU for n-gram orders 1..4, in that order
  bleu_by_order = [bleu.compute(max_order=order, **pair)['bleu'] for order in (1, 2, 3, 4)]
  rouge_l = rouge.compute(**pair)['rougeL']
  meteor_value = meteor.compute(**pair)['meteor']

  # BERTScore returns one F1 per sample; reduce it to the batch mean
  bert_result = bertscore.compute(model_type='microsoft/deberta-xlarge-mnli', device=device, **pair)
  bert_f1 = calculate_mean(bert_result['f1'])

  return np.array([bert_f1, *bleu_by_order, rouge_l, meteor_value])

## Prediction Function

In [None]:
def predict_caption_ver1(model, pixel_value, tokenizer, device, max_length = 50):
  """Greedily decode a caption for a single image.

  Decoding starts from the decoder start token and appends the arg-max token
  one step at a time. It stops as soon as the end-of-sequence token is
  predicted, or after `max_length` generated tokens, whichever comes first.
  Stopping at EOS keeps text generated after the end of the caption out of the
  result.

  Args:
    model: object exposing `image_encoder`, `layer_norm`, `captions_decoder`
      (with `config.decoder_start_token_id`) and `lm_head`.
    pixel_value: tensor for ONE image, without a batch dimension (one is added here).
    tokenizer: provides `eos_token_id` and `decode`.
    device: device the image tensor and the generated ids are moved to.
    max_length: maximum number of tokens to generate.

  Returns:
    The decoded caption with special tokens skipped.
  """
  # Inference only: never build an autograd graph, even if the caller forgot no_grad.
  with torch.no_grad():
    pixel_value = pixel_value.to(device)

    # Encode the image once; the features are reused at every decoding step.
    image_features = model.image_encoder(pixel_values = pixel_value.unsqueeze(dim = 0)).last_hidden_state
    image_features_ = model.layer_norm(image_features)

    input_ids = torch.tensor([model.captions_decoder.config.decoder_start_token_id]).to(device) # Decoder Start Token

    for _ in range(max_length):

      decoded_tokens = model.captions_decoder(input_ids = input_ids.unsqueeze(dim = 0),
                                              encoder_hidden_states = image_features_,
                                              ).last_hidden_state # batch_size, answer_length, hidden

      output_tokens = model.lm_head(decoded_tokens) # batch_size, answer_length, vocab_size

      # Only the last position predicts the next token.
      next_token = output_tokens[:, -1].argmax(dim = -1) # shape (1,)

      if next_token.item() == tokenizer.eos_token_id:
        break

      input_ids = torch.cat([input_ids, next_token], dim = -1)

  return tokenizer.decode(input_ids, skip_special_tokens = True)

In [None]:
def predict(model, dataloader, tokenizer, device = 'cpu', max_batches = 50, average_over_evaluated = False):
  """Caption the first batches of `dataloader` and score them.

  Args:
    model: captioning model; it is moved to `device` and switched to eval mode.
    dataloader: yields dicts with 'pixel_values', 'raw_captions' and 'ids'.
    tokenizer: forwarded to `predict_caption_ver1`.
    device: device used for inference.
    max_batches: evaluate at most this many batches (default 50, the cap that
      used to be hard-coded). Pass None to evaluate every batch.
    average_over_evaluated: which denominator is used for the mean scores.
      False (default, historical behaviour) divides the summed batch scores by
      `len(dataloader)`, which under-reports the metrics whenever fewer
      batches were evaluated; existing callers compensate by rescaling with
      `len(dataloader) / 50`. True divides by the number of batches actually
      evaluated, which gives the real per-batch mean with no rescaling.

  Returns:
    (res_score, res_ids, res_captions, res_predictions) where `res_score` is a
    dict with 'bert_score', 'bleu_score' (BLEU1..BLEU4), 'rouge_score' and
    'meteor_score', and the three lists hold one entry per evaluated sample.

  Raises:
    ValueError: if no batch was evaluated (empty dataloader or max_batches == 0).
  """
  import warnings

  res_ids, res_predictions, res_captions = [], [], []
  total_scores = 0
  evaluated_batches = 0

  model.to(device)
  model.eval()
  with torch.no_grad():

    for i, samples in enumerate(tqdm(dataloader)):
      if max_batches is not None and i >= max_batches : break
      pixel_values = samples['pixel_values']

      # One greedy decode per image in the batch
      predicted_captions = [(predict_caption_ver1(model = model,
                                            pixel_value = pixel_value,
                                            tokenizer = tokenizer,
                                            device = device)) for pixel_value in pixel_values]

      total_scores += compute_metrics(predicted_captions, samples['raw_captions'])
      evaluated_batches += 1

      res_ids.extend(samples['ids'])
      res_captions.extend(samples['raw_captions'])
      res_predictions.extend(predicted_captions)

  if evaluated_batches == 0:
    raise ValueError("No batch was evaluated: the dataloader is empty or max_batches is 0.")

  if average_over_evaluated:
    denominator = evaluated_batches
  else:
    denominator = len(dataloader)
    if denominator != evaluated_batches:
      warnings.warn(
          f"Scores are divided by len(dataloader)={denominator} although only "
          f"{evaluated_batches} batches were evaluated; rescale the result or pass "
          "average_over_evaluated=True to get the mean over the evaluated batches.")

  total_scores = total_scores/denominator

  res_score = {
      'bert_score' : total_scores[0],
      'bleu_score' : {
          'BLEU1' : total_scores[1],
          'BLEU2' : total_scores[2],
          'BLEU3' : total_scores[3],
          'BLEU4' : total_scores[4],
      },
      'rouge_score' : total_scores[5],
      'meteor_score' : total_scores[6],
  }

  return res_score, res_ids, res_captions, res_predictions

## Get Results

In [None]:
# Run the evaluation: returns the averaged metrics plus, per evaluated sample,
# the image id, the reference caption and the generated caption.
scores, ids, captions, predictions = predict(model = model,
                                             dataloader = dataloader,
                                             tokenizer = tokenizer,
                                             device = device)

# Side-by-side table of references and predictions for inspection.
df = pd.DataFrame({'ID': ids, 'Caption': captions, 'Prediction': predictions})

In [None]:
# Preview the first ten reference/prediction pairs.
df.head(10)

Unnamed: 0,ID,Caption,Prediction
0,ImageCLEFmedical_Caption_2024_valid_001535,Contrast enhanced magnetic resonance imaging (...,T1-weighted MRI showing enhancing mass right l...
1,ImageCLEFmedical_Caption_2024_valid_001766,The pulp length (L) and width (W) measurements...,Transthoracic echocardiogram showing aortic va...
2,ImageCLEFmedical_Caption_2024_valid_002898,Ultrasound approach of the left genitofemoral ...,Ultrasound image demonstrating aortic arch (ar...
3,ImageCLEFmedical_Caption_2024_valid_007720,Ultrasound biomicroscopy (UBM) of the left eye...,Ultrasound image demonstrating apical apical a...
4,ImageCLEFmedical_Caption_2024_valid_001259,Infiltrative shadow in the left lower lung fie...,Chest X-ray showing left-sided pleural effusio...
5,ImageCLEFmedical_Caption_2024_valid_002932,Contrast-enhanced computed tomography. Multip...,Abdominal computed tomography scan showing hep...
6,ImageCLEFmedical_Caption_2024_valid_004224,AP radiograph of the left shoulder: There are ...,X-ray right shoulder showing calcifications (a...
7,ImageCLEFmedical_Caption_2024_valid_007282,Arteriography obtained using a 4-Fr catheter ...,Angiography demonstrating stenosis right inter...
8,ImageCLEFmedical_Caption_2024_valid_003191,Sagittal section through the fetal cervical re...,MRI scan showing splenic artery thrombus MRI: ...
9,ImageCLEFmedical_Caption_2024_valid_001833,Axial abdominal CT at admission: 20 cm nodular...,Computed tomography scan abdomen showing hepat...


In [None]:
# (rows, columns) of the results table: one row per evaluated sample.
df.shape

(800, 3)

In [None]:
# Display the metric dictionary returned by predict().
scores

In [None]:
# predict() stops after 50 batches but divides the summed scores by len(dataloader);
# rescale to recover the mean METEOR over the 50 batches that were actually evaluated.
scores['meteor_score'] * len(dataloader)/50