In [1]:
# Google Drive helper shipped with the google.colab package.
from google.colab import drive
# Mount Drive at /content/drive; the dataset paths used later in this notebook
# live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!pip install -U bitsandbytes



In [9]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig
from transformers.image_utils import load_image
import os
import json
from tqdm import tqdm
import pickle

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Anchor the dataset locations on the Drive mount point (/content/drive) instead of
# relying on the kernel's current working directory, so the paths resolve no matter
# where the notebook process was started from.
DRIVE_ROOT = "/content/drive/MyDrive/smolvlm"
images_path = f"{DRIVE_ROOT}/archive/food_images/food_images"
pickle_path = f"{DRIVE_ROOT}/test_titles.pkl"

class Inference:
    """Describe a folder of images with a SmolVLM checkpoint and save the results.

    Only images whose file stem is listed in the titles pickle are processed.
    Results are written as a JSON list of ``[generated_text, file_name]`` pairs.
    """

    def __init__(self, images_path, pickle_path, model_id="HuggingFaceTB/SmolVLM-Instruct"):
        """Load the titles pickle, the processor and the 8-bit quantized model.

        Args:
            images_path: Directory containing the image files.
            pickle_path: Path to the pickled titles object.
            model_id: Checkpoint used for BOTH the processor and the model, so the
                chat template and tokenizer always match the weights being run.

        Raises:
            ImportError: Propagated from the model loader when 8-bit loading is not
                possible (the notebook's recorded output shows this being raised
                for bitsandbytes).
        """
        self.images_path = images_path
        self.pickle_path = pickle_path
        # SECURITY: pickle.load can execute arbitrary code while deserializing.
        # Only point pickle_path at files you created or fully trust.
        with open(pickle_path, 'rb') as f:
            # Described upstream as a dictionary with keys 'image_name' and 'title'.
            self.titles = pickle.load(f)

        # Processor and model come from the same checkpoint; previously the
        # processor was taken from the Base checkpoint and the model from Instruct.
        self.processor = AutoProcessor.from_pretrained(model_id)
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        self.model = AutoModelForVision2Seq.from_pretrained(
            model_id,
            quantization_config=quantization_config,
        )

    def load_image(self, image_path):
        """Load the image at ``image_path`` via the imported ``load_image`` helper."""
        return load_image(image_path)

    def generate_description(self, image_path, prompt="Can you describe the image?",
                             max_new_tokens=50):
        """Generate text for one image.

        Args:
            image_path: Path of the image to describe.
            prompt: User text placed after the image in the chat message.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            A ``(generated_text, file_name)`` tuple, where ``file_name`` is the
            base name of ``image_path``.
        """
        image = self.load_image(image_path)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": prompt},
                ],
            },
        ]
        chat_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(text=chat_prompt, images=[image], return_tensors="pt")
        # Follow the device the model was actually loaded on rather than a
        # module-level guess, so inputs and weights cannot end up on different devices.
        inputs = inputs.to(self.model.device)
        generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # NOTE(review): the full generated sequence is decoded, which presumably
        # includes the prompt text as well as the answer - confirm whether consumers
        # of the JSON expect that before stripping it.
        generated_texts = self.processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )
        return (generated_texts[0], os.path.basename(image_path))

    def save_to_json(self, descriptions, output_file='descriptions.json'):
        """Write ``descriptions`` to ``output_file`` as JSON (tuples become lists)."""
        with open(output_file, 'w') as f:
            json.dump(descriptions, f)

    def _known_image_stems(self):
        """Return the container that image file stems are matched against.

        If the titles object exposes a non-string collection under the
        ``'image_name'`` key (a dict of columns, or a table-like object), the names
        in that collection are used. Otherwise the titles object itself is
        returned, which keeps the plain ``stem in self.titles`` behavior.
        NOTE(review): the layout of the pickle is not visible here - confirm it.
        """
        titles = self.titles
        try:
            column = titles["image_name"]
        except (KeyError, IndexError, TypeError):
            column = None
        if column is None or isinstance(column, (str, bytes)):
            return titles
        return set(column)

    def process_images(self, output_file='descriptions.json'):
        """Describe every listed image in ``images_path`` and save the results.

        Args:
            output_file: Destination JSON file for the collected descriptions.
        """
        known_stems = self._known_image_stems()
        # Sorted so the output order is reproducible; os.listdir order is arbitrary.
        # Non-file entries are dropped up front so the progress bar total is accurate.
        image_names = sorted(
            name for name in os.listdir(self.images_path)
            if os.path.splitext(name)[0] in known_stems
            and os.path.isfile(os.path.join(self.images_path, name))
        )

        descriptions = []
        for image_name in tqdm(image_names, desc="Processing Images", unit="image"):
            image_path = os.path.join(self.images_path, image_name)
            descriptions.append(self.generate_description(image_path))
        self.save_to_json(descriptions, output_file)

# Build the pipeline: reads the titles pickle and loads the processor and model
inference = Inference(images_path, pickle_path)

# Describe every matching image and write the results to disk
output_file = "descriptions.json"  # Destination for the generated descriptions
inference.process_images(output_file=output_file)

# Read the saved descriptions back in
with open(output_file) as f:
    descriptions = json.load(f)

# Preview the first five entries
print(descriptions[0:5])


Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`