In [1]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from Data import CS10KDataset, MixDataset
import torch
import matplotlib.pyplot as plt
import re
from rouge_score import rouge_scorer
from tqdm import tqdm
import numpy as np
import pickle

In [2]:
# Evaluation-data configuration. The five constants below are passed
# positionally, in this order, to CS10KDataset. Their exact meaning is inferred
# from their names only — TODO confirm against the CS10KDataset signature in
# the local `Data` module.
# Relative path to the extracted CS10K data.
CS10KDATA_DIR = "../Data/ExtractedData/CS10K/"
NUM_BATCHES = 400
# None presumably means "do not resize" — verify against CS10KDataset.
RESIZE_SHAPE = None
VERBOSE = False
NUM_WORKERS = 10

cs10kdataset = CS10KDataset(CS10KDATA_DIR, NUM_BATCHES, RESIZE_SHAPE, VERBOSE, NUM_WORKERS)
# Scorer producing ROUGE-1, ROUGE-2 and ROUGE-L results, with stemming enabled.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

In [None]:
# Option A: BLIP-2 (OPT-2.7b) captioner, weights loaded as float16.
# Binds the notebook-wide `processor` and `model` names; running another
# model-loading cell afterwards rebinds them.
# NOTE(review): unlike the BLIP cell, this cell does not move the model to a
# GPU — confirm device/dtype handling before pairing it with code that sends
# inputs to "cuda".
BLIP2_CHECKPOINT = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(BLIP2_CHECKPOINT)
model = Blip2ForConditionalGeneration.from_pretrained(
    BLIP2_CHECKPOINT,
    torch_dtype=torch.float16,
)

In [3]:
# Option B: BLIP large captioning model, placed on the GPU.
# Binds the notebook-wide `processor` and `model` names. The processor is
# created with truncation_side="left".
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT, truncation_side="left")
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
model = model.cuda()

In [None]:
# Option C: ViT encoder + GPT-2 decoder captioner.
# Rebinds `model` and defines `feature_extractor` / `tokenizer`; it does not
# define `processor`, and the model is not moved to a GPU in this cell.
VIT_GPT2_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(VIT_GPT2_CHECKPOINT)
feature_extractor = ViTImageProcessor.from_pretrained(VIT_GPT2_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(VIT_GPT2_CHECKPOINT)

In [4]:
def show_img(img):
    """Draw ``img`` with ``plt.imshow`` and display the result with ``plt.show``.

    Args:
        img: Image data in any form accepted by ``plt.imshow``.
    """
    plt.imshow(img)
    plt.show()

def get_prefix(label):
    """Return the leading figure marker of a caption, e.g. ``"Figure 3"``.

    A marker is recognised only at the very start of ``label``: one of
    ``Fig``, ``Fig.``, ``Figure``, ``Figure.`` or ``Figure:``, then optional
    whitespace, then one or more digits.

    Args:
        label: Caption text to inspect.

    Returns:
        The matched marker including its number (``"Fig. 12"``), an empty
        string when the caption does not start with a marker, or ``"Fig"``
        when ``label`` is not text at all (e.g. ``None``).
    """
    regex = r"^(?:Fig(?:ure)?\.?|Figure:?|Fig\.?|Figure)\s*(\d+)"
    try:
        match = re.search(regex, label)
    except TypeError:
        # Non-text label: keep the generic fallback prompt instead of
        # swallowing every possible exception with a bare ``except``.
        return "Fig"
    # Always return a string so callers never receive an implicit None.
    return match.group(0) if match else ""

def avg(l):
    """Return the arithmetic mean of the values in ``l``.

    Raises:
        ZeroDivisionError: If ``l`` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count



In [None]:
# Caption the figures in the last NUM_EVAL_BATCHES batches of `cs10kdataset`
# with the currently loaded `processor`/`model`, and collect per-item ROUGE
# F-measures against the reference labels.
NUM_EVAL_BATCHES = 60

rouge1 = []
rouge2 = []
rougeL = []
# ROUGE-1 F-measure of the most recently scored item; only shown in the progress bar.
f1score = 0
for batch_idx in range(NUM_EVAL_BATCHES):
    print(f'Batch {batch_idx}')
    # Walk the dataset backwards, starting from its last batch.
    batch = cs10kdataset[len(cs10kdataset) - batch_idx - 1]
    loop = tqdm(batch)
    for item in loop:
        try:
            loop.set_description(f'Preparing Inputs\tRouge Score: {f1score}')
            image = item['figure']
            if len(image.shape) == 2:
                # Two-dimensional (single-channel) image: replicate it into three channels.
                image = np.dstack((image, image, image))
            label = item['label']
            # Prompt the model with the caption's own "Figure N" marker when it has one.
            text = get_prefix(label) or ''
            inputs = processor(image, text, return_tensors="pt").to("cuda")
            loop.set_description(f'Running Model\tRouge Score: {f1score}')
            # NOTE(review): len(label) is a character count, whereas generate()'s
            # max_length is measured in tokens (prompt included) — confirm this cap
            # is intended. early_stopping is documented as a beam-search control and
            # no num_beams is set here — confirm it has the intended effect.
            generated = model.generate(**inputs, max_length=len(label), early_stopping=True)
            output = processor.decode(generated[0], skip_special_tokens=True)
            loop.set_description(f'Calculating Score\tRouge Score: {f1score}')
            score = scorer.score(label.lower(), output.lower())
            rouge1.append(score['rouge1'].fmeasure)
            rouge2.append(score['rouge2'].fmeasure)
            rougeL.append(score['rougeL'].fmeasure)
            f1score = rouge1[-1]
        except Exception as exc:
            # Best-effort: skip the failing item but report why it failed.
            # Catching Exception (not a bare except) lets KeyboardInterrupt stop the run.
            tqdm.write(f'Error: {type(exc).__name__}: {exc}')