# ViLT model

First we need to install the Transformers library (optionally, also Accelerate):

In [None]:
!pip install transformers
!pip install accelerate

In [None]:
from transformers import ViltProcessor, ViltForQuestionAnswering
import requests
from PIL import Image
import math
import os
import numpy as np
import shutil
import pandas as pd

# Run the ViLT VQA model on every image in `data_path`, asking how many items
# the image contains, and report every answer that is not an integer in 0..20.

# prepare ViLT model:
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# define paths to the directories:
data_path = 'numerosity_naming_images/'
categories = ['apples', 'butterflies', 'dots', 'people', 'fastcards']
n_categories = len(categories)

# Get a list of all the filenames in this directory
filenames = os.listdir(data_path)

# prepare target question (identical for every image, so it is built once):
question = "How many things are there?" # NB: this might be replaced with a category-specific question

# Loop through the filenames
i = 0
for filename in filenames:

  # prepare input for the VQA model; the context manager closes the file
  # handle as soon as the pixel data has been copied by convert()
  with Image.open(os.path.join(data_path, filename)) as raw_image:
    image = raw_image.convert('RGB')
  encoding = processor(image, question, return_tensors="pt")

  # forward pass
  outputs = model(**encoding)
  logits = outputs.logits
  idx = logits.argmax(-1).item()
  label = model.config.id2label[idx]

  # The answer vocabulary is not limited to numerals, so the top label may be
  # a word; report it instead of letting int() abort the whole run.
  try:
    response = int(label)
  except ValueError:
    print(filename)
    print('Unexpected response:', label)
    continue

  # Only counts in the range 0..20 are expected for this stimulus set.
  if response < 0 or response > 20:
    print(filename)
    print('Unexpected response:', response)


# BLIP-2 model

In [None]:
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
import requests
from PIL import Image
import math
import os
import numpy as np
import shutil
import pandas as pd
import re
from word2number import w2n

# Run a BLIP-2 model on every image in `data_path`, asking how many items the
# image contains, and parse the generated text into a single integer
# (`response_num`), reporting every answer that cannot be parsed.

# Pick the device first so the precision can be matched to it: half precision
# is only used on GPU, because float16 kernel coverage on CPU is limited.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# prepare BLIP-2 model:
model_names = ["blip2-opt-6.7b", "blip2-flan-t5-xl", "blip2-flan-t5-xl-coco"]
processor = AutoProcessor.from_pretrained("Salesforce/" + model_names[1])
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/" + model_names[1], torch_dtype=dtype)

model.to(device)
print(device)

# define paths to the directories:
root_path = 'numerosity_naming_images/'
# The code below reads `data_path`; define it here so this cell does not
# depend on a variable left over from another cell.
data_path = root_path
categories = ['apples', 'butterflies', 'dots', 'people', 'fastcards']
n_categories = len(categories)

# Get a list of all the filenames in this directory
filenames = os.listdir(data_path)

# prepare target question (identical for every image, so it is built once):
question = "How many things are there?" # NB: this might be replaced with a category-specific question

# Runs of digits in the generated answer; compiled once outside the loop.
digits_pattern = re.compile(r'\d+')

# Loop through the filenames
i = 0
for filename in filenames:

  # prepare input for the VQA model; the context manager closes the file
  # handle as soon as the pixel data has been copied by convert()
  with Image.open(os.path.join(data_path, filename)) as raw_image:
    image = raw_image.convert('RGB')
  encoding = processor(image, text=question, return_tensors="pt").to(device, dtype)

  # forward pass
  generated_ids = model.generate(**encoding, max_new_tokens=10)
  response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

  # Parse the answer: `response_num` is an int when exactly one number could
  # be extracted, and None otherwise.
  response_num = None
  digit_matches = digits_pattern.findall(response)
  if len(digit_matches) == 1:
    response_num = int(digit_matches[0])
  elif len(digit_matches) > 1:
    print(filename)
    print('More than one number returned: ', response)
  else:
    # No digits: the model may have spelled the number out in words.
    try:
      response_num = w2n.word_to_num(response)
    except ValueError:
      print(filename)
      print('Unexpected response: ', response)

  i = i + 1
