# Running Llava: a large multi-modal model on Google Colab

In [1]:
import requests
from PIL import Image
import pandas as pd
import torch

from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype = torch.float16
)

  from .autonotebook import tqdm as notebook_tqdm


## Preparing the quantization config to load the model in 4bit precision

In order to load the model in 4-bit precision, we need to pass a `quantization_config` to our model. Let's do that in the cells below

## Load the model using `pipeline`

We will leverage the `image-to-text` pipeline from transformers !

In [2]:
from transformers import pipeline

model_id = "llava-hf/llava-1.5-7b-hf"

pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 3/3 [00:09<00:00,  3.02s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
materials_list = [
    'rock',
    'leaf',
    'water',
    'wood',
    'plastic-bag',
    'ceramic',
    'metal',
    'dirt',
    'cloth',
    'plastic',
    'tile',
    'gravel',
    'paper',
    'drywall',
    'glass',
    'grass',
    'carpet'
]
url_csv = pd.read_csv('./extracted_VGGSound.csv')
url_csv = url_csv.values.tolist()

In [4]:
import csv

# Open the CSV file
with open('./extracted_VGGSound.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    # Extract a specific column, e.g., 'ColumnName'
    column = [row['label'] for row in reader]

# Now 'column' contains all the data from 'ColumnName' column
# print(column)
unique_sounds = set(column)
print(unique_sounds, "\n", len(unique_sounds))

{'stream burbling', 'wind rustling leaves', 'helicopter', 'playing snare drum', 'fire crackling', 'people whistling', 'pumping water', 'subway, metro, underground', 'race car, auto racing', 'wind noise', 'lighting firecrackers', 'sailing', 'car engine knocking', 'church bell ringing', 'firing cannon', 'typing on typewriter', 'waterfall burbling', 'railroad car, train wagon', 'people clapping', 'using sewing machines', 'sliding door', 'hair dryer drying', 'people running', 'car passing by', 'opening or closing drawers', 'people burping', 'cap gun shooting', 'horse clip-clop', 'motorboat, speedboat acceleration', 'typing on computer keyboard', 'splashing water', 'lawn mowing', 'car engine starting', 'bird wings flapping', 'engine accelerating, revving, vroom', 'skateboarding', 'door slamming', 'fireworks banging', 'people farting', 'driving buses', 'people finger snapping', 'toilet flushing', 'eating with cutlery', 'machine gun shooting', 'raining', 'thunder', 'driving motorcycle', 'vacu

In [9]:
max_new_tokens = 200
unique_sounds = list(unique_sounds)
print(unique_sounds)

predicted_materials = []
predicted_sounds = []

for url in url_csv:
  # print(url)
  id, _, label, type, url = url
  torch.cuda.empty_cache()
  image = Image.open(requests.get(url, stream=True).raw)

  prompt = f"Question: <image>\nWhat is the main material of this video? Please choose from the ones on the {materials_list} and tell me. If there are no materials in {materials_list}, say None.\nAnswer:"
  outputs1 = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
  print("Predicted:", outputs1[0]["generated_text"].split("Answer: ")[1], url)
  predicted_materials.append(outputs1[0]["generated_text"].split("Answer: ")[1])

  prompt = f"Question: <image>\nThis is a video thumbnail, what do you think this video will make? You should choose from the ones on the {unique_sounds}.\nAnswer:"
  outputs2 = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
  print(outputs2)
  print(label, "|", outputs2[0]["generated_text"].split("Answer: ")[1], end="\n\n")
  predicted_sounds.append(outputs2[0]["generated_text"].split("Answer: ")[1])
  # material_extract_list.append(outputs1[0]["generated_text"])


['stream burbling', 'wind rustling leaves', 'helicopter', 'playing snare drum', 'fire crackling', 'people whistling', 'pumping water', 'subway, metro, underground', 'race car, auto racing', 'wind noise', 'lighting firecrackers', 'sailing', 'car engine knocking', 'church bell ringing', 'firing cannon', 'typing on typewriter', 'waterfall burbling', 'railroad car, train wagon', 'people clapping', 'using sewing machines', 'sliding door', 'hair dryer drying', 'people running', 'car passing by', 'opening or closing drawers', 'people burping', 'cap gun shooting', 'horse clip-clop', 'motorboat, speedboat acceleration', 'typing on computer keyboard', 'splashing water', 'lawn mowing', 'car engine starting', 'bird wings flapping', 'engine accelerating, revving, vroom', 'skateboarding', 'door slamming', 'fireworks banging', 'people farting', 'driving buses', 'people finger snapping', 'toilet flushing', 'eating with cutlery', 'machine gun shooting', 'raining', 'thunder', 'driving motorcycle', 'vacu

KeyboardInterrupt: 

In [10]:
df = pd.read_csv("./extracted_VGGSound.csv")

def create(list):
    # Calculate the number of missing items
    required_length = len(df) - len(list)

    # Extend the list with None for the missing items
    extended_list = list + [None] * required_length
    return list

df["predicted_materials"] = create(predicted_materials)
df["predicted_sounds"] = create(predicted_sounds)

df.to_csv("updated.csv", index=False)

It is important to prompt the model wth a specific format, which is:
```bash
USER: <image>\n<prompt>\nASSISTANT:
```

In [None]:
  # print(outputs[0]["generated_text"])

The model has managed to successfully describe the image with accurate result ! We also support other variants of Llava, such as [`bakLlava`](https://huggingface.co/llava-hf/bakLlava-v1-hf) which should be all posted inside the [`llava-hf`](https://huggingface.co/llava-hf) organization on 🤗 Hub