In [None]:
import torch
import requests
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, AutoModel
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import faiss
import warnings

from multimodal_reasoning_helper import chatbot

In [None]:
# Load the master product table; the image_id column is read as str.
new_master_df = pd.read_csv(
    r"../CSVs/new_master_csv.csv",
    dtype={"image_id": str},
)

In [None]:
# Run on the GPU when torch reports one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Vision-language model used by the chatbot, plus its matching processor.
model_name = "Qwen/Qwen2-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto")
model = model.to(device)
model.eval()

processor = AutoProcessor.from_pretrained(model_name)

In [None]:
# Second model/processor pair, handed to the chatbot as its retrieval model.
retreival_model_name = 'google/siglip2-base-patch16-224'
retreival_model = AutoModel.from_pretrained(
    retreival_model_name,
    torch_dtype="auto",
)
retreival_model = retreival_model.to(device)
retreival_model.eval()
retreival_processor = AutoProcessor.from_pretrained(retreival_model_name)

In [None]:
# Bundle the generation and retrieval models (with their processors) into one helper object.
chtbot = chatbot(
    model,
    processor,
    retreival_model,
    retreival_processor,
)

In [None]:
# Opening question for the conversation.
user_query = "Do you have any dress like this?"

# Example picture that accompanies the question, loaded as an RGB image.
image_directory = r"../Data/Example"
image_path = os.path.join(image_directory, "internet_example13.jpg")
image = (
    Image.open(image_path)
    .convert("RGB")
)
# Bare expression on the last line so the notebook renders the picture.
image

In [None]:
%%time
conversation = []
output_text = chtbot.generate_description(conversation=conversation, user_query=user_query, user_image=image)
print(output_text[0])

faiss_index_directory = r"../Data"
dist_img, idx_img, dist_text, idx_text = chtbot.retreive_index(faiss_index_directory, output_text, user_query=user_query, user_image=image, top_k=10)

combined_scores = chtbot.calculate_combined_score(idx_img, dist_img, idx_text, dist_text, alpha=0.4)
print(combined_scores)

top_k_keys, suggested_images = chtbot.retreive_top_products(combined_scores, top_k=5)

conversation.append({
    "role": "user",
    "content": [{"type": "text", "text": user_query}]
})
conversation.append({
    "role": "assistant",
    "content": [{"type": "text", "text": output_text[0]}]
})/

In [None]:
%%time
user_query = "Do you have the dress in blue color?"
user_image = suggested_images[4]

output_text = chtbot.generate_description(conversation=conversation, user_query=user_query, user_image=None)
print(output_text[0])

faiss_index_directory =  r"../Data"
dist_img, idx_img, dist_text, idx_text = chtbot.retreive_index(faiss_index_directory, output_text, user_query=user_query, user_image=None, top_k=10)

combined_scores = chtbot.calculate_combined_score(idx_img, dist_img, idx_text, dist_text, alpha=0.4)
print(combined_scores)

top_k_keys, suggested_images = chtbot.retreive_top_products(combined_scores, new_master_df, top_k=6)

conversation.append({
    "role": "user",
    "content": [{"type": "text", "text": user_query}]
})
conversation.append({
    "role": "assistant",
    "content": [{"type": "text", "text": output_text[0]}]
})

In [None]:
%%time
user_query = "Do you have any t-shirt that matches with this blue dress?"
user_image = suggested_images[1]

output_text = chtbot.generate_description(conversation=conversation, user_query=user_query, user_image=None)
print(output_text[0])

faiss_index_directory =  r"../Data"
dist_img, idx_img, dist_text, idx_text = chtbot.retreive_index(faiss_index_directory, output_text, user_query=user_query, user_image=None, top_k=10)

combined_scores = chtbot.calculate_combined_score(idx_img, dist_img, idx_text, dist_text, alpha=0.4)
print(combined_scores)

top_k_keys, suggested_images = chtbot.retreive_top_products(combined_scores, new_master_df, top_k=6)

conversation.append({
    "role": "user",
    "content": [{"type": "text", "text": user_query}]
})
conversation.append({
    "role": "assistant",
    "content": [{"type": "text", "text": output_text[0]}]
})

In [None]:
%%time
user_query = "I want the t-shirt in red color. "
user_image = suggested_images[2]

output_text = chtbot.generate_description(conversation=conversation, user_query=user_query, user_image=None)
print(output_text[0])

faiss_index_directory =  r"../Data"
dist_img, idx_img, dist_text, idx_text = chtbot.retreive_index(faiss_index_directory, output_text, user_query=user_query, user_image=None, top_k=10)

combined_scores = chtbot.calculate_combined_score(idx_img, dist_img, idx_text, dist_text, alpha=0.4)
print(combined_scores)

top_k_keys, suggested_images = chtbot.retreive_top_products(combined_scores, new_master_df, top_k=6)

conversation.append({
    "role": "user",
    "content": [{"type": "text", "text": user_query}]
})
conversation.append({
    "role": "assistant",
    "content": [{"type": "text", "text": output_text[0]}]
})