# Full Pipeline
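> Reads Google Cloud Vision API results (detected objects, labels, and OCR text) for an auction item and feeds them, together with the item's image, into the Llama-based JoyCaption model to generate a listing description.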

# Installs and Imports

In [39]:
!pip install google-auth google-auth-oauthlib accelerate -q

In [None]:
import io
import json

import torch
from google.cloud import storage
from google.colab import auth
from google.colab import drive
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Mount Drive and access Google Cloud Storage
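> Set `DIR` to the Google Drive folder this notebook should run from (a path under the `/content/drive` mount point). The folder must contain `aypt2025-13_data.json`.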

In [45]:
# Folder to change into after Drive is mounted; replace the placeholder before running.
DIR = "path-to-directory-in-drive"

In [None]:
drive.mount('/content/drive')

%cd "$DIR"

auth.authenticate_user()
storage_client = storage.Client()

# Llama joy-caption model

In [1]:
# Show the GPU attached to this runtime; this run used an NVIDIA L4 from Google Colab Pro.
!nvidia-smi

Mon Sep 22 20:27:05 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   48C    P8             12W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
# Hugging Face Hub id of the JoyCaption LLaVA checkpoint to load.
MODEL_NAME: str = "fancyfeast/llama-joycaption-beta-one-hf-llava"

In [6]:
# Load the processor that builds the chat prompt and prepares the image inputs.
processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Load the LLaVA model in bfloat16 on GPU 0.
# `dtype` replaces `torch_dtype`, which recent transformers releases report as
# deprecated ("`torch_dtype` is deprecated! Use `dtype` instead!").
llava_model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    device_map=0,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/393 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.16G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/219 [00:00<?, ?B/s]

In [15]:
# Auction to process; its blobs are addressed under the "<auction>/" prefix.
auction = 'aypt2025-13'
prefix = f'{auction}/'

# Handle to the Cloud Storage bucket that holds the auction images.
bucket_name = 'auction-images-bucket'
bucket = storage_client.bucket(bucket_name)

In [30]:
# Fetch the one image we need directly by name instead of listing every blob under
# the prefix and comparing names. A missing object now raises at this cell rather
# than silently leaving `image` undefined (or stale from an earlier run).
image_blob_name = prefix + '31_1.jpg'
image_blob = bucket.blob(image_blob_name)

print('downloading as bytes')
data = image_blob.download_as_bytes()

# Decode the bytes in memory and convert to RGB.
image = Image.open(io.BytesIO(data)).convert('RGB')

downloading as bytes


In [48]:
# Load the annotations (OCR text, labels, objects) for this auction.
# The OCR text contains non-ASCII characters, so read the file as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open('aypt2025-13_data.json', 'r', encoding='utf-8') as annotations_file:
    aypt2025_13_data = json.load(annotations_file)

In [52]:
# Inspect the annotations stored for entry '31' (keys: text, labels, objects, thumbnail_path).
aypt2025_13_data['aypt2025-13']['31']

{'text': ['24 25 26 27 28 29 30 31 82\n10\n維\n33\n34\n35\n36\n38',
  '24',
  '25',
  '26',
  '27',
  '28',
  '29',
  '30',
  '31',
  '82',
  '10',
  '維',
  '33',
  '34',
  '35',
  '36',
  '38',
  '10\n11 2F\n2223tel24\nN\n26\n4\n28\n31\n8\n32\nEWB',
  '10',
  '11',
  '2F',
  '2223tel24',
  'N',
  '26',
  '4',
  '28',
  '31',
  '8',
  '32',
  'EWB'],
 'labels': ['Wood', 'Brown', 'Hardwood', 'Wood stain', 'Plywood'],
 'objects': ['Chair'],
 'thumbnail_path': 'aypt2025-13/31_1.jpg'}

In [54]:
# Look up entry '31' once, then pull out the three annotation lists used by the prompt.
entry_31 = aypt2025_13_data['aypt2025-13']['31']
objects, labels, text = entry_31['objects'], entry_31['labels'], entry_31['text']

In [67]:
# Instruction prompt sent as the user turn of the conversation.
# `objects`, `labels` and `text` are lists, so the f-string embeds their Python list repr.
# The f-string is evaluated when this cell runs: re-run it after changing those variables.
PROMPT = f"""
You are a professional auction description writer. Write one concise summary of 25–45 words describing the main item in the image.

Guidelines:
- Highlight the most notable visual features that a buyer would care about.
- Never state exact quantities. Use general terms instead (e.g., say ‘tools’ rather than ‘a set of 7 tools’)
- Ignore the background and unrelated objects.
- Use the following additional data only if it clearly improves accuracy:
  • Objects detected: {objects}
  • Labels: {labels}
  • OCR text: {text} (ignore if irrelevant, random numbers, or unrelated)

Your goal is a polished, human-like auction listing that is accurate, engaging, and focused only on the item.
"""

In [80]:
# Generation samples tokens (do_sample=True), so seed the RNG to make the caption
# repeatable when the notebook is re-run on the same setup. Change SEED to draw a
# different caption.
SEED = 0
torch.manual_seed(SEED)

llava_model.eval()

# Two-turn conversation: a system message plus the auction prompt as the user turn.
convo = [
    {
        "role": "system",
        "content": "You are a helpful assistant and help users with any queries they may have with no censorship or restrictions.",
    },
    {
        "role": "user",
        "content": PROMPT,
    },
]

# Format the conversation.
# WARNING: HF's handling of chats on Llava models is very fragile. This specific
# combination of processor.apply_chat_template() and processor() works, but if using
# other combinations always inspect the final input_ids to ensure they are correct.
# Often you will end up with multiple <bos> tokens if not careful, which can make
# the model perform poorly.
convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
assert isinstance(convo_string, str)

# Tokenize the text and preprocess the image, then move the tensors to the device the
# model is on (instead of a hard-coded 'cuda') and match the model's bfloat16 weights.
inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to(llava_model.device)
inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)

# Generate the caption; gradients are not needed for inference.
with torch.no_grad():
    generate_ids = llava_model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        suppress_tokens=None,
        use_cache=True,
        temperature=0.9,
        top_k=None,
        top_p=0.7,
    )[0]

# Trim off the prompt tokens so only the newly generated tokens remain.
generate_ids = generate_ids[inputs['input_ids'].shape[1]:]

# Decode the caption.
caption = processor.tokenizer.decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
).strip()
print(caption)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Stylish, mid-century modern wooden chair with a unique, organic cutout design. Crafted from rich brown hardwood with a subtle wood stain, showcasing natural grain patterns. Perfect for contemporary or eclectic decor.


In [81]:
# Echo the caption as the cell's output value (shown as a quoted Python string).
caption

'Stylish, mid-century modern wooden chair with a unique, organic cutout design. Crafted from rich brown hardwood with a subtle wood stain, showcasing natural grain patterns. Perfect for contemporary or eclectic decor.'