In [None]:
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import os
import pandas as pd
from tqdm import tqdm

from description_generator_helper import custom_prompt

In [None]:
# Read the master metadata CSV that sits in the training data folder.
directory = r"D:\LLM_Project\Multimodel Chatbot\Data\train"
master_csv_file_path = os.path.join(directory, "master_csv.csv")

# image_id is read as str instead of letting pandas infer a dtype for it.
master_df = pd.read_csv(
    master_csv_file_path,
    dtype=dict(image_id=str),
)

In [None]:
# Load the multimodal reasoning model together with its matching processor.
model_name = "Qwen/Qwen2-VL-2B-Instruct"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"

model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto")
model = model.to(device)
model.eval()  # evaluation mode: the model is only used for generation here

processor = AutoProcessor.from_pretrained(model_name)

In [None]:
# Prompt helper from description_generator_helper, bound to the loaded model,
# its processor and the target device. It is used below to describe each image
# and to derive attributes from that description.
custom_prmpt = custom_prompt(
    model,
    processor,
    device,
)

In [None]:
# Generate a description and a set of attributes for every image listed in
# master_df, so that both can be used for a better embedding.
# The results are collected in lists and attached to master_df as the
# "description" and "attributes" columns once the loop has finished.

# The image folder does not change between iterations, so resolve it once.
# It lives under the training `directory` defined in the metadata cell.
image_directory = os.path.join(directory, "cropped_image_unique")

all_descriptions = []
all_attributes = []

# Only image_id and item_id are needed, so iterate over those two columns
# directly instead of building a full Series per row with iterrows().
id_pairs = zip(master_df["image_id"], master_df["item_id"])

for image_id, item_id in tqdm(id_pairs, total=len(master_df)):
    # Cropped images are stored as "<image_id>_<item_id>.jpg".
    image_path = os.path.join(image_directory, f"{image_id}_{item_id}.jpg")

    # The context manager closes the file handle deterministically.
    # convert() loads the pixel data and returns an independent image, so
    # `image` stays usable after the file has been closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Only the first element of the helper's result is kept as the description.
    description = custom_prmpt.description_of_image(image)[0]
    all_descriptions.append(description)

    # Attributes are derived from the generated description, not from the image.
    all_attributes.append(custom_prmpt.attributes_of_image(description))

master_df["description"] = all_descriptions
master_df["attributes"] = all_attributes

In [None]:
# Persist master_df (without its row index) as a new metadata CSV file.
new_master_csv_path = r"D:\LLM_Project\Multimodel Chatbot\Data\train\new_master_csv.csv"
master_df.to_csv(new_master_csv_path, index=False)