In [None]:
!pip install -q -U transformers==4.37.2
!pip install -q bitsandbytes==0.41.3 accelerate==0.25.0

In [2]:
import os
import torch
from transformers import BitsAndBytesConfig, pipeline
from PIL import Image

In [None]:
# Colab-only: `google.colab` is imported here, so this cell is expected to run
# inside a Google Colab runtime.
from google.colab import drive
# Mount the user's Google Drive at /content/drive so files stored there can be
# read through the regular filesystem (the image folder used below lives under
# /content/drive/MyDrive).
drive.mount('/content/drive')

In [None]:
# Caption every JPEG in a Drive folder with the LLaVA 1.5 image-to-text pipeline.
#
# Steps: detect GPU -> (optionally) build a 4-bit quantization config ->
# list images -> load the pipeline -> run the prompt on each image and print
# the generated text.

# Check whether a CUDA GPU is available
use_gpu = torch.cuda.is_available()

# Quantization config, built only when torch reports a GPU; otherwise the
# model is loaded without quantization.
if use_gpu:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )
else:
    quantization_config = None

# Folder where the images are stored
folder_path = '/content/drive/MyDrive/img_test'

# List the JPEG files in the folder.
# - The extension is matched case-insensitively so files such as "IMG_01.JPG"
#   are not silently skipped.
# - os.listdir returns entries in arbitrary order; sorting makes the output
#   order reproducible across runs.
image_files = sorted(
    f for f in os.listdir(folder_path)
    if f.lower().endswith(('.jpg', '.jpeg'))
)

# Model identifier
model_id = "llava-hf/llava-1.5-7b-hf"

# Pipeline initialization: a single call, with the quantization config passed
# through model_kwargs only when one was built above.
model_kwargs = {}
if quantization_config is not None:
    model_kwargs["quantization_config"] = quantization_config
pipe = pipeline("image-to-text", model=model_id, model_kwargs=model_kwargs)

# Prompt used for text generation
prompt = "USER: <image>\nIs this child biting nails or using a pacifier or with their finger in their mouth?\nASSISTANT:"

# Process each image in the folder
for file_name in image_files:
    image_path = os.path.join(folder_path, file_name)
    # Open inside a context manager so the underlying file handle is closed
    # right away; convert() returns a new, fully loaded RGB image that remains
    # usable after the file is closed.
    with Image.open(image_path) as img:
        image = img.convert('RGB')

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 20})
    print(f'Caption for {file_name}:', outputs[0]["generated_text"])
