In [None]:
# Ask PyTorch's CUDA caching allocator to use expandable segments, which is
# meant to reduce memory fragmentation while the large model is loaded.
# This is set here, before torch is imported in the next cell, so the
# allocator sees the setting when it starts up.
import os

os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

In [None]:
from transformers import pipeline
import torch
# Build the image+text -> text pipeline around the instruction-tuned
# Gemma 3 4B checkpoint. The weights are loaded in bfloat16 and the whole
# pipeline is placed on the CUDA device.
pipe = pipeline(
    task="image-text-to-text",
    model="google/gemma-3-4b-it",
    torch_dtype=torch.bfloat16,
    device="cuda",
)


In [None]:
import json
def save_labels(labels_list, file, output_dir='/home/jovyan/Evaluation/Data'):
    """Write the labels of one image to ``<output_dir>/<file>.json``.

    The file contains a single JSON object of the form ``{"Labels": [...]}``
    written with an indentation of 4 spaces. An existing file with the same
    name is overwritten.

    Args:
        labels_list: JSON-serialisable list of labels to store.
        file: Base name of the output file. ``.json`` is appended verbatim,
            so ``"cat.jpg"`` is written as ``"cat.jpg.json"``.
        output_dir: Directory the JSON file is written to. It is created
            (including missing parents) when it does not exist yet.
            Defaults to the location this notebook has always used.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
        TypeError: If ``labels_list`` is not JSON-serialisable.
    """
    data = {'Labels': labels_list}

    if not os.path.isdir(output_dir):
        # exist_ok=True keeps this safe if the directory appears between the
        # check above and the call below (e.g. a parallel run).
        os.makedirs(output_dir, exist_ok=True)
        print(f"{output_dir} successfully created")

    json_filename = f'{file}.json'
    file_name = os.path.join(output_dir, json_filename)
    with open(file_name, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)
    print(f"File : {json_filename} successfully created!")

In [None]:

# Directory that holds the images to be labelled.
directory_path = "/home/jovyan/images"

# File extensions that are treated as images. Anything else found in the
# directory is skipped (and reported) so that a stray non-image file cannot
# abort the run part-way through.
image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp", ".tif", ".tiff")

# Collect the image file names. os.listdir() returns entries in arbitrary
# order, so they are sorted to make the processing order (and the progress
# counter) reproducible between runs. Sub-directories are ignored.
files = []
for entry in sorted(os.listdir(directory_path)):
    if not os.path.isfile(os.path.join(directory_path, entry)):
        continue
    if entry.lower().endswith(image_extensions):
        files.append(entry)
    else:
        print(f"Skipping non-image file {entry}")

def _parse_labels(text):
    """Turn the model's comma-separated answer into a clean list of labels.

    The text is lower-cased, split on commas, and every item is stripped of
    surrounding whitespace; empty items are dropped. This copes with answers
    such as ``"Dog , Woman, ball\n"`` or ``"dog,woman"`` and returns ``[]``
    for an empty answer instead of ``['']``.
    """
    return [label.strip() for label in text.lower().split(',') if label.strip()]


# Prompt for Google's Gemma. It does not depend on the image, so it is built
# once outside the loop. The text (including its indentation) is kept exactly
# as before so the model receives the same input.
prompt = """
    Detect all the generic objects in the image and list them out,
    Only Include the name of the objects.

    Do not include the header.
    The output should be in this format: "dog , woman, ball"
    """

# Run the model on every image and store the detected labels as JSON.
for index, file in enumerate(files):
    # Release cached GPU memory left over from the previous image.
    torch.cuda.empty_cache()
    print(f"Processing image {file}")

    image_path = os.path.join(directory_path, file)

    # Chat-style input: a system message plus one user turn that carries the
    # image (by path) and the text prompt.
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}]
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    output = pipe(text=messages, max_new_tokens=200)
    # The last message of the returned conversation is the model's reply.
    output_text = output[0]["generated_text"][-1]["content"]

    labels_list = _parse_labels(output_text)
    if not labels_list:
        # Still write a JSON file (with an empty list) so that every image
        # has a matching label file.
        print(f"No objects detected in {file}; saving an empty label list")

    save_labels(labels_list, file)

    print(f"Successfully processed Image {file} ,{index+1}/{len(files)} Images Processed")

print(f"All {len(files)} have been processed.")