### **Vision Encoder:** ViT

### **Text Decoder:** GPT-2

In [None]:
import pandas as pd 

In [None]:
# Load the two input tables: df1 holds the report text, df2 the per-image
# projection metadata.
df1 = pd.read_csv('/kaggle/input/indiana-pro-reports/indiana_PROreports.csv')
df2 = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_projections.csv')

In [None]:
# Keep only the rows whose 'projection' value is exactly 'Frontal'.
is_frontal = df2['projection'] == 'Frontal'
df2 = df2[is_frontal]

In [None]:
# Preview the first rows of the filtered projections table.
df2.head()

In [None]:
from transformers import (
    AutoFeatureExtractor, 
    AutoTokenizer, 
    VisionEncoderDecoderModel,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer, 
    default_data_collator,
)

from torch.utils.data import Dataset

import pandas as pd
from sklearn.model_selection import train_test_split

from pathlib import Path
from PIL import Image

In [None]:
# Display the reports table.
df1

In [None]:
# Pair every image listed in df2 with the 'findings' text of the first df1
# report that shares its uid. Images with no matching report, or whose
# findings are missing (NaN), are left out.

# uid -> findings of the FIRST report with that uid. Built once so each image
# is resolved with a dictionary lookup instead of a scan over df1.
findings_by_uid = {}
for report_uid, findings in zip(df1['uid'], df1['findings']):
    findings_by_uid.setdefault(report_uid, findings)

paired_rows = []
for uid, filename in zip(df2['uid'], df2['filename']):
    caption = findings_by_uid.get(uid)
    # .get() yields None for an unknown uid; pd.isna() is true for both None
    # and a NaN findings cell, so one check covers both skip conditions.
    if pd.isna(caption):
        continue
    paired_rows.append({'imgs': filename, 'captions': caption})

# Build the frame in one go: appending with pd.concat inside the loop copies
# the whole frame on every iteration.
images_captions_df = pd.DataFrame(paired_rows, columns=['imgs', 'captions'])
images_captions_df.head()

In [None]:
# Print the paired image/caption table.
print(images_captions_df)

In [None]:
# Checkpoint names for the image encoder and the text decoder.
encoder_checkpoint, decoder_checkpoint = "google/vit-base-patch16-224-in21k", "gpt2"

# Tokenizer comes from the decoder checkpoint, image preprocessor from the
# encoder checkpoint.
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_checkpoint)

# Reuse the end-of-sequence token for padding; later cells tokenize with
# padding='max_length'.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Turn the bare file names in 'imgs' into full paths under the image folder.
p = '/kaggle/input/chest-xrays-indiana-university/images/images_normalized/'

# Only touch rows that do not carry the prefix yet, so re-running this cell
# does not prepend the folder a second time and corrupt every path.
needs_prefix = ~images_captions_df['imgs'].astype(str).str.startswith(p)
images_captions_df.loc[needs_prefix, 'imgs'] = p + images_captions_df.loc[needs_prefix, 'imgs']
images_captions_df.head()

In [None]:
# Token budget for every caption: longer ones are truncated, shorter ones
# padded up to this length.
max_length = 384

# Take one row and run both preprocessing steps on it as a sanity check.
sample = images_captions_df.iloc[99]
caption = sample['captions']
image = Image.open(sample['imgs']).convert('RGB')

# Image -> encoder inputs (PyTorch tensors).
inputs = feature_extractor(images=image, return_tensors='pt')

# Caption -> token ids (PyTorch tensors) of length max_length.
outputs = tokenizer(
    caption,
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_tensors='pt',
)

In [None]:
# Show the feature-extractor output and the tokenizer output for the sample.
print(f"Inputs:\n{inputs}\nOutputs:\n{outputs}")

In [None]:
from torch.utils.data import Dataset
from PIL import Image

class LoadDataset(Dataset):
    """Dataset that yields a preprocessed image together with its tokenized caption.

    Relies on the module-level ``feature_extractor``, ``tokenizer`` and
    ``max_length`` defined in earlier cells.
    """

    def __init__(self, df):
        """Keep the ``'imgs'`` and ``'captions'`` columns of ``df`` as arrays."""
        self.images = df['imgs'].values
        self.captions = df['captions'].values

    def __len__(self):
        """Return the number of image/caption pairs."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return a dict with ``'pixel_values'`` and ``'labels'`` for item ``idx``."""
        # Image: open from disk, force three channels, run the feature extractor.
        picture = Image.open(str(self.images[idx])).convert("RGB")
        encoded_image = feature_extractor(images=picture, return_tensors='pt')

        # Caption: token ids truncated/padded to max_length.
        encoded_caption = tokenizer(
            self.captions[idx],
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )
        # [0] selects the single sequence out of the batch the tokenizer returns.
        # NOTE(review): padding positions stay in the labels as pad-token ids;
        # confirm whether they should be excluded from the training loss.
        token_ids = encoded_caption['input_ids'][0]

        return {
            'pixel_values': encoded_image['pixel_values'].squeeze(),
            'labels': token_ids,
        }


In [None]:
# Shuffle and hold out 20% of the pairs for evaluation; the seed is fixed at 42.
train_df, test_df = train_test_split(
    images_captions_df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Wrap each split in the dataset class defined above.
train_ds, test_ds = LoadDataset(train_df), LoadDataset(test_df)

In [None]:
import cv2

In [None]:
import torch
import matplotlib.pyplot as plt

In [None]:
# Number of rows in the held-out split.
len(test_df)

In [None]:
# Display the first image of the test split.
first_test_path = test_df['imgs'].iloc[0]
I = cv2.imread(first_test_path)
# cv2.imread signals an unreadable/missing file by returning None rather than
# raising, which would otherwise surface as an obscure error inside imshow.
if I is None:
    raise FileNotFoundError(f"cv2.imread could not read {first_test_path}")
# OpenCV loads channels as BGR while matplotlib expects RGB.
plt.imshow(cv2.cvtColor(I, cv2.COLOR_BGR2RGB))

In [None]:
# Number of items in the test dataset.
len(test_ds)

In [None]:
# Decode the label token ids of test item 90 back into text, dropping
# special tokens.
out=test_ds[90]['labels']
tokenizer.decode(out, skip_special_tokens=True)

In [None]:
# Length of the label sequence of test item 90.
len(test_ds[90]['labels'])

In [None]:
# Fetch and display the first item of the test dataset.
next(iter(test_ds))

In [None]:
# Assemble the encoder-decoder model from the two pretrained checkpoints.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=encoder_checkpoint,
    decoder_pretrained_model_name_or_path=decoder_checkpoint,
)

# Generation-related configuration: beam count, padding id, and the token id
# the decoder starts from (all taken from the tokenizer where applicable).
model.config.num_beams = 4
model.config.pad_token_id = tokenizer.pad_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id

In [None]:
# Smoke test: push the first training item through the model as a batch of one.
batch = train_ds[0]
smoke_pixel_values = batch['pixel_values'].unsqueeze(0)  # add the batch axis
smoke_labels = batch['labels'].unsqueeze(0)

model(pixel_values=smoke_pixel_values, labels=smoke_labels)

## Training Block

In [None]:
# Hyper-parameters and bookkeeping for the training run.
training_args = Seq2SeqTrainingArguments(
    # where outputs go and how often to evaluate / checkpoint
    output_dir="image-caption-generator",
    evaluation_strategy="epoch",
    save_strategy='epoch',
    report_to='none',                      # no external experiment trackers
    # optimisation
    num_train_epochs=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    # batching: 8 samples per device, gradients accumulated over 4 steps
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
)

# The feature extractor is handed to the trainer through its `tokenizer` slot.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=default_data_collator,
    tokenizer=feature_extractor,
)

In [None]:
# Run the training loop configured above.
trainer.train()

In [None]:
# Compare the dataset caption with the generated caption for one test item.
i = 245
# Fetch the item once: every test_ds[...] access re-opens and re-processes the image.
sample = test_ds[i]
inputs = sample['pixel_values']
model.eval()
with torch.no_grad():
    # model prediction
    out = model.generate(
        inputs.unsqueeze(0).to(model.device),  # follow the model's device instead of assuming CUDA
        num_beams=4,
        max_length=max_length
        )
# convert token ids to string format
print('DS:')
print(tokenizer.decode(sample['labels'], skip_special_tokens=True))
print('GPT2:')
decoded_out = tokenizer.decode(out[0], skip_special_tokens=True)

print(decoded_out)

# Show the image as the other inference cells do. The tensor comes out of
# feature_extractor, so undo its mean/std scaling to get values in [0, 1] for imshow.
# NOTE(review): assumes the feature extractor normalizes with image_mean/image_std — confirm.
pixel_mean = torch.tensor(feature_extractor.image_mean).view(-1, 1, 1)
pixel_std = torch.tensor(feature_extractor.image_std).view(-1, 1, 1)
plt.axis('off')
plt.imshow(torch.permute((inputs * pixel_std + pixel_mean).clamp(0, 1), (1, 2, 0)));


In [None]:
# Generate and print a caption for test item 43, then show the image.
inputs = test_ds[43]['pixel_values']
model.eval()
with torch.no_grad():
    # model prediction
    out = model.generate(
        inputs.unsqueeze(0).to(model.device),  # follow the model's device instead of assuming CUDA
        num_beams=4,
        max_length=max_length
        )
# convert token ids to string format
decoded_out = tokenizer.decode(out[0], skip_special_tokens=True)

print(decoded_out)

# The tensor comes out of feature_extractor, so undo its mean/std scaling to
# get values in [0, 1]; otherwise imshow clips everything outside that range.
# NOTE(review): assumes the feature extractor normalizes with image_mean/image_std — confirm.
pixel_mean = torch.tensor(feature_extractor.image_mean).view(-1, 1, 1)
pixel_std = torch.tensor(feature_extractor.image_std).view(-1, 1, 1)
plt.axis('off')
plt.imshow(torch.permute((inputs * pixel_std + pixel_mean).clamp(0, 1), (1, 2, 0)));

In [None]:
# Generate and print a caption for test item 89, then show the image.
inputs = test_ds[89]['pixel_values']
model.eval()
with torch.no_grad():
    # model prediction
    out = model.generate(
        inputs.unsqueeze(0).to(model.device),  # follow the model's device instead of assuming CUDA
        num_beams=4,
        max_length=max_length
        )
# convert token ids to string format
decoded_out = tokenizer.decode(out[0], skip_special_tokens=True)

print(decoded_out)

# The tensor comes out of feature_extractor, so undo its mean/std scaling to
# get values in [0, 1]; otherwise imshow clips everything outside that range.
# NOTE(review): assumes the feature extractor normalizes with image_mean/image_std — confirm.
pixel_mean = torch.tensor(feature_extractor.image_mean).view(-1, 1, 1)
pixel_std = torch.tensor(feature_extractor.image_std).view(-1, 1, 1)
plt.axis('off')
plt.imshow(torch.permute((inputs * pixel_std + pixel_mean).clamp(0, 1), (1, 2, 0)));

In [None]:
# Save the model weights (state_dict) to disk. The target folder is created
# first so the save does not depend on the trainer having already written there.
weights_path = Path('/kaggle/working/image-caption-generator/dense-caption-generator_pro.pt')
weights_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), weights_path)

In [None]:
# Number of items available for the inference cells below.
len(test_ds)

## Inference Block

In [None]:
import numpy as np

In [None]:
from tqdm import tqdm

In [None]:
# Generate and print a caption for test item 56, then show the image.
inputs = test_ds[56]['pixel_values']
model.eval()
with torch.no_grad():
    # model prediction
    out = model.generate(
        inputs.unsqueeze(0).to(model.device),  # follow the model's device instead of assuming CUDA
        num_beams=4,
        max_length=max_length
        )
# convert token ids to string format
decoded_out = tokenizer.decode(out[0], skip_special_tokens=True)

print(decoded_out)

# The tensor comes out of feature_extractor, so undo its mean/std scaling to
# get values in [0, 1]; otherwise imshow clips everything outside that range.
# NOTE(review): assumes the feature extractor normalizes with image_mean/image_std — confirm.
pixel_mean = torch.tensor(feature_extractor.image_mean).view(-1, 1, 1)
pixel_std = torch.tensor(feature_extractor.image_std).view(-1, 1, 1)
plt.axis('off')
plt.imshow(torch.permute((inputs * pixel_std + pixel_mean).clamp(0, 1), (1, 2, 0)));

In [None]:
# Generate and print a caption for test item 12, then show the image.
inputs = test_ds[12]['pixel_values']
model.eval()
with torch.no_grad():
    # model prediction
    out = model.generate(
        inputs.unsqueeze(0).to(model.device),  # follow the model's device instead of assuming CUDA
        num_beams=4,
        max_length=max_length
        )
# convert token ids to string format
decoded_out = tokenizer.decode(out[0], skip_special_tokens=True)

print(decoded_out)

# The tensor comes out of feature_extractor, so undo its mean/std scaling to
# get values in [0, 1]; otherwise imshow clips everything outside that range.
# NOTE(review): assumes the feature extractor normalizes with image_mean/image_std — confirm.
pixel_mean = torch.tensor(feature_extractor.image_mean).view(-1, 1, 1)
pixel_std = torch.tensor(feature_extractor.image_std).view(-1, 1, 1)
plt.axis('off')
plt.imshow(torch.permute((inputs * pixel_std + pixel_mean).clamp(0, 1), (1, 2, 0)));

In [None]:
# Collect reference captions (DS) and generated captions (GPT) for the first
# test items so they can be scored against each other afterwards.
DS = []   # reference captions decoded from the dataset labels
GPT = []  # captions generated by the model
model.eval()

# Evaluate at most 250 items, but never index past the end of a smaller test set.
num_eval_samples = min(250, len(test_ds))
for i in tqdm(range(num_eval_samples)):
    # Fetch the item once: every test_ds[...] access re-opens and re-processes the image.
    sample = test_ds[i]
    inputs = sample['pixel_values']

    with torch.no_grad():
        # model prediction
        out = model.generate(
            inputs.unsqueeze(0).to(model.device),  # follow the model's device instead of assuming CUDA
            num_beams=4,
            max_length=max_length
            )

    # convert token ids to string format
    y_true = tokenizer.decode(sample['labels'], skip_special_tokens=True)
    DS.append(y_true)

    y_pred = tokenizer.decode(out[0], skip_special_tokens=True)
    GPT.append(y_pred)




In [None]:
!pip install evaluate

In [None]:
import evaluate

In [None]:
!pip install transformers
!pip install bert-score

In [None]:
from transformers import BertTokenizer, BertModel
from bert_score import BERTScorer

In [None]:
# Score the generated captions against the dataset captions with BERTScore.
reference, candidate = DS, GPT
scorer = BERTScorer(model_type='bert-base-uncased')
# score() returns precision, recall and F1, in that order.
P, R, F1 = scorer.score(cands=candidate, refs=reference)

In [None]:
# Report the mean precision, recall and F1 over all scored caption pairs.
print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")