In [None]:
import requests
from PIL import Image
import pandas as pd
import torch

from transformers import BitsAndBytesConfig

# Quantization settings handed to the model loader: 4-bit loading is switched
# on and torch.float16 is given as the 4-bit compute dtype.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

In [None]:
from transformers import pipeline

# Checkpoint identifier passed to the pipeline factory below.
model_id = "llava-hf/llava-v1.6-vicuna-13b-hf"

# Build the "image-to-text" pipeline; the quantization settings are forwarded
# to the model through model_kwargs.
pipe = pipeline(
    task="image-to-text",
    model=model_id,
    model_kwargs={"quantization_config": quantization_config},
)

In [3]:
# Candidate material names offered to the model. The list is interpolated
# into the prompt text as-is, so both the spelling and the order matter.
materials_list = [
    'rock', 'leaf', 'water', 'wood', 'plastic-bag', 'ceramic',
    'metal', 'dirt', 'cloth', 'plastic', 'tile', 'gravel',
    'paper', 'drywall', 'glass', 'grass', 'carpet',
]
# Read the CSV and keep its rows as a plain list of lists (one inner list per
# row, values in column order).
url_csv = pd.read_csv('./extracted_VGGSound.csv').values.tolist()

In [None]:
import csv

# Gather every value of the 'label' column, then reduce to the distinct labels.
column = []
with open('./extracted_VGGSound.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        column.append(row['label'])

unique_sounds = set(column)
# Show the distinct labels followed by how many there are.
print(unique_sounds, "\n", len(unique_sounds))

In [None]:
# Generation budget used for both prompts below.
max_new_tokens = 200
# Sort the labels so the prompt text is the same on every run: iteration order
# of a set of strings is not stable across interpreter sessions.
unique_sounds = sorted(unique_sounds)
print(unique_sounds)

predicted_materials = []
predicted_sounds = []


def _extract_answer(generated_text):
    """Return the text that follows the first "Answer:" marker, stripped.

    Both prompts end with "Answer:". Splitting on "Answer: " (with a trailing
    space) and indexing [1] raises IndexError whenever the marker is followed
    by anything other than a space, so the marker is matched without the
    space and surrounding whitespace is stripped instead. If the marker is
    missing altogether, the whole text is returned stripped.
    """
    _, marker, answer = generated_text.partition("Answer:")
    return answer.strip() if marker else generated_text.strip()


for row in url_csv:
    # Rows are unpacked by position; only the label and the image URL are used.
    # Underscore-prefixed names keep the builtins `id` and `type` from being
    # shadowed in the notebook namespace.
    _video_id, _, label, _kind, image_url = row
    torch.cuda.empty_cache()

    try:
        # The timeout keeps one unresponsive host from hanging the whole run.
        with requests.get(image_url, stream=True, timeout=30) as response:
            response.raise_for_status()
            image = Image.open(response.raw)
            # Force the decode while the response is still open so that a
            # truncated or invalid image fails here rather than in the model.
            image.load()
    except (requests.RequestException, OSError) as error:
        # Keep both prediction lists aligned with the rows of url_csv.
        print("Skipping", image_url, "-", error)
        predicted_materials.append(None)
        predicted_sounds.append(None)
        continue

    prompt = f"Question: <image>\nWhat is the main material of this video? Please choose from the ones on the {materials_list} and tell me. If there are no materials in {materials_list}, say None.\nAnswer:"
    outputs1 = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})
    material_answer = _extract_answer(outputs1[0]["generated_text"])
    print("Predicted:", material_answer, image_url)
    predicted_materials.append(material_answer)

    prompt = f"Question: <image>\nThis is a video thumbnail, what do you think this video will make? You should choose from the ones on the {unique_sounds}.\nAnswer:"
    outputs2 = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})
    print(outputs2)
    sound_answer = _extract_answer(outputs2[0]["generated_text"])
    print(label, "|", sound_answer, end="\n\n")
    predicted_sounds.append(sound_answer)

In [10]:
# Reload the source table so the predictions can be attached as new columns.
df = pd.read_csv("./extracted_VGGSound.csv")


def create(list):
    """Return `list` followed by enough None entries to reach len(df).

    The length is taken from the module-level `df`. When `list` already has
    len(df) items or more, nothing is appended and a copy is returned.
    """
    missing = len(df) - len(list)
    return list + [None] * missing


# Attach both prediction columns, padded to the table length, then write the
# result without the index column.
df = df.assign(
    predicted_materials=create(predicted_materials),
    predicted_sounds=create(predicted_sounds),
)

df.to_csv("updated.csv", index=False)

In [7]:
import re

def contains_any_word(sentence1, sentence2):
    """Return True when the two sentences share at least one word.

    Each input is lower-cased, stripped of every character that is neither a
    word character nor whitespace, and split on whitespace. The result is True
    if the two resulting word sets have any element in common.

    Non-string inputs (for example None, or the float NaN that pandas yields
    for an empty CSV cell) are treated as containing no words, so the function
    returns False for them instead of raising AttributeError.
    """
    def clean_and_split(sentence):
        # A missing value has no words to compare.
        if not isinstance(sentence, str):
            return set()
        cleaned_sentence = re.sub(r'[^\w\s]', '', sentence.lower())
        return set(cleaned_sentence.split())

    set1 = clean_and_split(sentence1)
    set2 = clean_and_split(sentence2)

    return not set1.isdisjoint(set2)

# Sanity check on two phrases that share the words "bell" and "ringing".
sentence1, sentence2 = "A bell ringing", "church bell ringing"
result = contains_any_word(sentence1, sentence2)
print("Is there any common word?", result)

Is there any common word? True


## Test accuracy (ACC)

In [11]:
import pandas as pd
import csv

# Load the table of predictions as a plain list of rows.
updated = pd.read_csv('./updated.csv').values.tolist()

print(len(updated))

success_count = 0

for row in updated:
    # Assumes updated.csv holds exactly these seven columns in this order.
    # Underscore-prefixed names keep the builtins `id` and `type` from being
    # shadowed in the notebook namespace.
    _video_id, _time, label, _kind, _url, _predicted_material, predicted_sound = row
    # An empty CSV cell is read back as float NaN; such rows count as misses
    # instead of crashing the word comparison.
    if not isinstance(label, str) or not isinstance(predicted_sound, str):
        continue
    if contains_any_word(label, predicted_sound):
        success_count += 1
print(success_count)

# Share of rows whose prediction shares at least one word with the label;
# 0.0 for an empty table rather than a ZeroDivisionError.
acc = success_count / len(updated) if updated else 0.0

print(acc)


40713
17649
0.4334978999336821
