In [13]:
import torch
import pandas as pd
import sys
import os

In [14]:
sys.path.append('../src')
from embedding import initialize_clip_model, generate_embedding
from retrieval import hybrid_retrieval, PostgresVectorRetrieval, TextSearchRetrieval, FaissVectorRetrieval

In [15]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
print(f"Device is {device}")

Device is mps


In [16]:
df = pd.read_csv('benchmark.csv')
df.head()

Unnamed: 0,Pid,Name,Query
0,230765.156074.66EA81FE0710E584.26DCA022C55D334...,TEMU 26pcs Silicone Daisy Beads Set,silicone daisy beads
1,230765.156074.66EA81FE0710E584.26DCA022C55D334...,TEMU 26pcs Silicone Daisy Beads Set,daisy bead making kit for keychains and bracelets
2,230765.156074.66EA81FE0710E584.26DCA022C55D334...,TEMU 26pcs Silicone Daisy Beads Set,TEMU 26pcs silicone daisy crafting beads set f...
3,178866.156074.820F1205554371C6.94435B3E5252BCD...,This Annoying Home Life: A Mindless Coloring B...,annoying life coloring book
4,178866.156074.820F1205554371C6.94435B3E5252BCD...,This Annoying Home Life: A Mindless Coloring B...,stress relief coloring book for adults


In [17]:
# Models to test
clip_model = "openai/clip-vit-base-patch32"
initialize_clip_model(clip_model)

(CLIPProcessor:
 - image_processor: CLIPImageProcessor {
   "crop_size": {
     "height": 224,
     "width": 224
   },
   "do_center_crop": true,
   "do_convert_rgb": true,
   "do_normalize": true,
   "do_rescale": true,
   "do_resize": true,
   "image_mean": [
     0.48145466,
     0.4578275,
     0.40821073
   ],
   "image_processor_type": "CLIPImageProcessor",
   "image_std": [
     0.26862954,
     0.26130258,
     0.27577711
   ],
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {
     "shortest_edge": 224
   }
 }
 
 - tokenizer: CLIPTokenizerFast(name_or_path='openai/clip-vit-base-patch32', vocab_size=49408, model_max_length=77, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
 	49406: AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_wo

In [21]:
# Database configuration
DB_CONFIG = {
    'dbname': os.getenv('PGDATABASE', 'finly'),
    'user': os.getenv('PGUSER', 'postgres'),
    'password': os.getenv('PGPASSWORD', 'postgres'),
    'host': os.getenv('PGHOST', 'localhost'),
    'port': os.getenv('PGPORT', '5432')
}
TOP_K = 5

# Create components
components = [
    #PostgresVectorRetrieval('text_embedding', DB_CONFIG),
    FaissVectorRetrieval(index_type='text'),
    PostgresVectorRetrieval('image_embedding', DB_CONFIG),
    TextSearchRetrieval('ts_rank_cd', DB_CONFIG)
]

weights = [1, 0, 0]  # Must sum to 1

# Initialize counters
hits = 0
total = len(df)

# Process each query individually
for i, row in df.iterrows():
    query = row['Query']
    target_name = row['Name']
    query_embedding = generate_embedding(query_text=query)
    
    # Run hybrid search for single query
    pids, scores = hybrid_retrieval(
        query=query,
        query_embedding=query_embedding,
        components=components,
        weights=weights,
        top_k=TOP_K
    )
    
    # Get unique Pids and their corresponding names
    unique_pids = list(dict.fromkeys(pids))  # Remove duplicates while preserving order
    retrieved_names = df[df['Pid'].isin(unique_pids)]['Name'].drop_duplicates().tolist()
    
    # Check if the ground truth Pid is in the results
    hit = row['Pid'] in pids
    if hit:
        hits += 1
    
    # Print results for this query
    print(f"\nQuery: {query}")
    print(f"Target: {target_name}")
    print(f"Hit: {hit}")
    print("Retrieved items:")
    for name in retrieved_names:
        print(f"- {name}")
    print("-" * 80)

recall_at_k = hits / total
print(f"\nOverall Recall@{TOP_K}: {recall_at_k:.4f}")


Query: silicone daisy beads
Target: TEMU 26pcs Silicone Daisy Beads Set
Hit: False
Retrieved items:
--------------------------------------------------------------------------------

Query: daisy bead making kit for keychains and bracelets
Target: TEMU 26pcs Silicone Daisy Beads Set
Hit: False
Retrieved items:
--------------------------------------------------------------------------------

Query: TEMU 26pcs silicone daisy crafting beads set for handmade accessories and jewelry making
Target: TEMU 26pcs Silicone Daisy Beads Set
Hit: False
Retrieved items:
--------------------------------------------------------------------------------

Query: annoying life coloring book
Target: This Annoying Home Life: A Mindless Coloring Book for the Super Stressed
Hit: True
Retrieved items:
- This Annoying Home Life: A Mindless Coloring Book for the Super Stressed
--------------------------------------------------------------------------------

Query: stress relief coloring book for adults
Target: Th