In [None]:
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoModel
import os
import pandas as pd

from multimodal_reasoning_helper import chatbot

In [None]:
# Load the metadata CSV into a DataFrame.
# `image_id` is forced to str (presumably so IDs keep their exact text form,
# e.g. leading zeros — TODO confirm against the CSV).
master_df = pd.read_csv(
    r"../CSVs/new_master_csv.csv",
    dtype={"image_id": str},
)

In [None]:
# Load the reasoning model and its processor.
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model_name = "Qwen/Qwen2-VL-2B-Instruct"

# Load the pretrained weights, move them to the chosen device, then switch the
# module to evaluation mode (this notebook only runs inference).
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto")
model = model.to(device)
model.eval()

# Processor that pairs with the same checkpoint.
processor = AutoProcessor.from_pretrained(model_name)

In [None]:
# Load the retrieval model and its processor.
# NOTE: the `retreival_*` spelling is kept because other cells use these names.
retreival_model_name = 'google/siglip2-base-patch16-224'

retreival_model = AutoModel.from_pretrained(
    retreival_model_name,
    torch_dtype="auto",
)
retreival_model = retreival_model.to(device)  # same device as the reasoning model
retreival_model.eval()                        # evaluation mode: inference only

retreival_processor = AutoProcessor.from_pretrained(retreival_model_name)

In [None]:
# Initialization of the state shared by the test cells.
faiss_index_directory = r"../Data"   # directory handed to the chatbot as its FAISS index location
conversation = []                    # running conversation history; starts empty

# Wrap both models and their processors in the helper's chatbot object.
chtbot = chatbot(
    model,
    processor,
    retreival_model,
    retreival_processor,
)

In [None]:
# Test 1: a hand-written shopping list.
image_directory = r"../Data/Example"
image_filename = "hand_written_list_low_res.jpg"
image_path = os.path.join(image_directory, image_filename)

# Open the file and convert it to RGB before handing it to the models.
image = Image.open(image_path).convert("RGB")

# Bare last expression so the notebook renders the image.
image

In [None]:
%%time

# Running the chatbot on the image loaded in the previous cell.

# First, detect which class the image belongs to.
class_ = chtbot.classifier(image)
print(class_)

if class_[0] == 'text':   # it's an OCR application
    item_list = chtbot.process_shopping_list(image)
    print(item_list)

    # Each item on the list is sent as its own query with an empty history.
    for item in item_list:
        result = chtbot.comapct_chatbot(
            [], faiss_index_directory, master_df,
            user_query=item, user_image=None, top_k=[10, 3], alpha=0.4,
        )
        # Overwritten on every iteration, so only the last item's conversation
        # is kept for a follow-up query.
        conversation = result['conversation']

else:
    # The image was not classified as text, so run an image-based query.
    # The query is defined here because `user_query` is only assigned in a
    # later cell: relying on it raises NameError on a fresh kernel and picks
    # up a stale value otherwise. The image itself is forwarded so the query
    # has something to refer to.
    fallback_query = "Do you have anything like this?"
    result = chtbot.comapct_chatbot(
        conversation, faiss_index_directory, master_df,
        user_query=fallback_query, user_image=image, top_k=[10, 6], alpha=0.4,
    )
    conversation = result['conversation']

In [None]:
# Start the next test with an empty conversation history.
conversation = list()

In [None]:
# Test 2: a clothing image together with a text query.
user_query = "Do you have any dress like this?"

image_directory = r"../Data/Example"
image_filename = "internet_example13.jpg"
image_path = os.path.join(image_directory, image_filename)

# Open the file and convert it to RGB before handing it to the models.
image = Image.open(image_path).convert("RGB")

# Bare last expression so the notebook renders the image.
image

In [None]:
%%time

# Classify the image, then dispatch on the predicted class.
class_ = chtbot.classifier(image)
print(class_)

if class_[0] != 'text':
    # Per the original author's note, this is the branch this test goes through:
    # the query and the image are sent together with the current history.
    result = chtbot.comapct_chatbot(
        conversation,
        faiss_index_directory,
        master_df,
        user_query=user_query,
        user_image=image,
        top_k=[10, 6],
        alpha=0.4,
    )
    conversation = result['conversation']

else:
    # Text image: extract the list items and query each one with an empty history.
    item_list = chtbot.process_shopping_list(image)
    print(item_list)

    for entry in item_list:
        result = chtbot.comapct_chatbot(
            [],
            faiss_index_directory,
            master_df,
            user_query=entry,
            user_image=None,
            top_k=[10, 3],
            alpha=0.4,
        )
        conversation = result['conversation']

In [None]:
# Follow-up query, sent with the stored conversation in the next cell.
user_query = (
    "Do you have the dress in blue color?"
)

In [None]:
%%time

# Follow-up turn: text-only query sent with the stored conversation history.
# Uses `master_df`, the metadata frame loaded at the top of the notebook;
# `new_master_df` is never defined and raised NameError.
result = chtbot.comapct_chatbot(
    conversation,
    faiss_index_directory,
    master_df,
    user_query=user_query,
    user_image=None,
    top_k=[10, 6],
    alpha=0.4,
)
# Keep the updated history for the next follow-up.
conversation = result['conversation']

In [None]:
# More follow-up: a second query on the same conversation.
user_query = (
    "Do you have any t-shirt that matches with this blue dress?"
)

In [None]:
%%time

# Text-only follow-up turn: send the current query with the stored history.
result = chtbot.comapct_chatbot(
    conversation,
    faiss_index_directory,
    master_df,
    user_query=user_query,
    user_image=None,
    top_k=[10, 6],
    alpha=0.4,
)
# Keep the updated history for the next turn.
conversation = result['conversation']

In [None]:
# Third follow-up query (the trailing space in the text is kept as written).
user_query = (
    "I want the t-shirt in red color. "
)

In [None]:
%%time

# Text-only follow-up turn: send the current query with the stored history.
result = chtbot.comapct_chatbot(
    conversation,
    faiss_index_directory,
    master_df,
    user_query=user_query,
    user_image=None,
    top_k=[10, 6],
    alpha=0.4,
)
# Keep the updated history in case further turns are added.
conversation = result['conversation']