In [1]:
import numpy as np
import torch
import datasets
import os
from PIL import Image
from tqdm import tqdm
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision import transforms

from transformers import Seq2SeqTrainer ,Seq2SeqTrainingArguments
from transformers import AutoTokenizer, default_data_collator
from transformers import ViTFeatureExtractor, VisionEncoderDecoderModel

In [2]:
# Load the image/caption table.
# NOTE(review): the dataset class below reads 'image' and 'caption' columns — confirm the CSV has both.
data = pd.read_csv("/kaggle/input/chest-xrays/chest_xray_results_final.csv")
# 80/20 train/test split; the fixed random_state keeps the split identical across
# kernel restarts so results are reproducible.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

In [3]:
class CONFIG:
  """Hyper-parameters and checkpoint names shared by the notebook cells."""
  # Batch sizes for the training and evaluation loaders.
  train_batch_size = 4
  test_batch_size = 4
  # Use the GPU when one is visible to torch, otherwise fall back to the CPU.
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  epochs = 1
  lr = 1e-5
  # Sequence-length limits; not referenced in the cells visible here — TODO confirm usage.
  max_len = 256
  summary_len = 64
  # (height, width) used by the Resize transform. Set to 224 so it matches the
  # 224x224 input resolution of the encoder checkpoint named below (was 244).
  image_size = (224, 224)
  # Hugging Face checkpoint names for the text decoder and the vision encoder.
  decoder = 'gpt2'
  encoder = "google/vit-base-patch16-224"

    

In [4]:
# Shared settings instance read by the cells below (cfg.image_size, cfg.encoder, cfg.decoder).
cfg = CONFIG()

In [5]:
class Dataset_image(Dataset):
  """Map-style dataset pairing each image file with its tokenized caption.

  Args:
    caption_df: DataFrame with an 'image' column (file name relative to
      ``root``) and a 'caption' column (caption text).
    root: Directory that contains the image files.
    img_feature_extractor: Callable invoked as
      ``extractor(img, return_tensors='pt')``; its ``pixel_values`` are used.
    tokenizer: Callable tokenizer exposing ``pad_token_id``; invoked with
      ``padding='max_length'`` and ``truncation=True``.
    img_transform: Optional callable applied to the RGB image before the
      feature extractor.
    max_length: Fixed number of caption tokens per sample (default 64).

  Each item is a dict with:
    'image':   pixel values with the leading batch axis removed.
    'caption': LongTensor of ``max_length`` token ids, padding replaced by -100.

  NOTE(review): the keys are 'image'/'caption'; if samples are passed straight
  to Seq2SeqTrainer they probably need to be 'pixel_values'/'labels' — confirm
  against the training cell.
  """

  def __init__(self, caption_df, root, img_feature_extractor, tokenizer, img_transform=None, max_length=64):
    self.caption_df = caption_df
    self.root_dir = root
    self.tokenizer = tokenizer
    self.feature_extractor = img_feature_extractor
    self.transform = img_transform
    self.max_length = max_length

  def __len__(self):
    """Number of image/caption rows."""
    return len(self.caption_df)

  def __getitem__(self, idx):
    """Load, preprocess and tokenize the sample at positional index ``idx``."""
    # The DataFrame has two columns: 'image' and 'caption'.
    row = self.caption_df.iloc[idx]
    image_path = os.path.join(self.root_dir, row['image'])

    # Context manager closes the underlying file handle once the RGB copy exists.
    with Image.open(image_path) as handle:
      img = handle.convert('RGB')

    if self.transform is not None:
      img = self.transform(img)
    pixel_values = self.feature_extractor(img, return_tensors='pt').pixel_values

    # truncation=True guarantees every sample has exactly max_length ids, so
    # samples can be stacked into a batch even when a caption is long.
    token_ids = self.tokenizer(
        row['caption'],
        padding='max_length',
        truncation=True,
        max_length=self.max_length,
    ).input_ids

    # Compare against the pad token *id* (ids are ints; pad_token is a string,
    # so comparing to it never matched) and mask padding with -100.
    pad_id = self.tokenizer.pad_token_id
    labels = [tok if tok != pad_id else -100 for tok in token_ids]

    # squeeze(0) removes only the batch axis added by return_tensors='pt'.
    return {'image': pixel_values.squeeze(0), 'caption': torch.tensor(labels)}

# Optional image preprocessing pipeline; the steps run in the order listed.
transform_img = transforms.Compose([
    transforms.Resize(cfg.image_size),    # resize to the configured (height, width)
    transforms.ToTensor(),                # PIL image -> tensor
    transforms.Normalize(mean=0, std=1),  # mean 0 / std 1 leaves the values unchanged
])

In [6]:
# Load the image preprocessor that belongs to the encoder checkpoint in cfg.encoder.
# from_pretrained looks for preprocessor_config.json on the Hugging Face Hub unless it
# is cached or cfg.encoder is a local directory, so this cell needs internet access —
# the OSError recorded below is the result of running it offline.
feature_extractor = ViTFeatureExtractor.from_pretrained(cfg.encoder)


OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like google/vit-base-patch16-224 is not the path to a directory containing a file named preprocessor_config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [None]:
# Tokenizer for the text decoder checkpoint named in cfg.decoder.
tokenizer = AutoTokenizer.from_pretrained(cfg.decoder)
# Reuse the unknown-token as the padding token so padding='max_length' can be used;
# presumably this checkpoint ships without a pad token of its own — TODO confirm.
tokenizer.pad_token = tokenizer.unk_token

In [None]:
# ROUGE metric object used by compute_rouge below to score generated captions.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.

rouge = datasets.load_metric('rouge')

def compute_rouge(pred, target):
  """Score predicted token ids against label ids with ROUGE-2.

  Args:
    pred: Batch of predicted token ids, decoded with the module-level ``tokenizer``.
    target: Batch of label ids (torch tensor or array-like) in which ignored
      positions are marked with -100. It is NOT modified.

  Returns:
    dict with 'rouge_precision', 'rouge_recall' and 'rouge_fmeasure', taken
    from the ``mid`` aggregate of the 'rouge2' score.
  """
  predictions = tokenizer.batch_decode(pred, skip_special_tokens=True)

  # Mask on a private copy: writing into `target` directly would overwrite the
  # caller's label array and destroy its -100 ignore markers.
  if torch.is_tensor(target):
    label_ids = target.clone()
  else:
    label_ids = np.array(target, copy=True)
  # -100 is not a valid token id, so swap it for the pad id before decoding.
  label_ids[label_ids == -100] = tokenizer.pad_token_id
  labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

  scores = rouge.compute(
      predictions=predictions,
      references=labels,
      rouge_types=['rouge2'],
  )
  metric = scores["rouge2"].mid

  return {
      'rouge_precision': metric.precision,
      'rouge_recall': metric.recall,
      'rouge_fmeasure': metric.fmeasure,
  }