In [29]:
import torch
import pandas as pd
import sys
import os
import mlflow
import mlflow.pytorch
from datetime import datetime

In [30]:
sys.path.append('../src')
from embedding import initialize_clip_model, generate_embedding
from retrieval import hybrid_retrieval, PostgresVectorRetrieval, TextSearchRetrieval, FaissVectorRetrieval

In [31]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
print(f"Device is {device}")

Device is mps


In [32]:
df = pd.read_csv('benchmark_query.csv')
df.head()

Unnamed: 0,Pid,Name,Description,Category,Basic_query,Attribute_query,Natural_query
0,230765.156074.8EA6270009853D26.9B6C065E0E0C70F...,"TEMU 3pcs Cat Toy Set, Hemp Rope And Feather M...",Faster shipping. Better service,Animals & Pet Supplies,3pcs hemp,temu 3pcs hemp rope,TEMU 3pcs Cat Toy Set with fast delivery
1,230765.156074.8EA6270009853D26.5997EEBEEF688B9...,TEMU High-density Aquarium Biochemical Filter ...,Faster shipping. Better service,Animals & Pet Supplies,high density,temu high density aquarium,TEMU High density Aquarium Biochemical for bus...
2,159496.2.E9BF3C1C3B82E113.98C4E18825C4C542.737...,Life Extension Florassist Daily Bowel Regulari...,Defend Against Occasional Constipation. Gluten...,Animals & Pet Supplies,florassist daily,life extension florassist daily,Life Extension Florassist Daily Bowel that wor...
3,159496.2.E9BF3C1C3B82E113.57057A47B3BB8BF4.088...,Carlson Co-Q10 100 mg - 60 Softgels,100 mg of CoEnzyme Q10. Promotes Normal Energy...,Animals & Pet Supplies,softgels coenzyme,carlson softgels coenzyme promotes,Carlson Co Q10 100 mg for busy households
4,159496.2.E9BF3C1C3B82E113.1AC1A04CB4B08FD2.853...,Youtheory Collagen 6000 mg - 290 Tablets,"Skin, Hair & Nail Formula. Enhanced with Vitam...",Animals & Pet Supplies,collagen 6000,youtheory collagen 6000 tablets,Youtheory Collagen 6000 mg 290 that works well...


In [33]:
# Models to test
clip_model = "openai/clip-vit-base-patch32"
initialize_clip_model(clip_model)

(CLIPProcessor:
 - image_processor: CLIPImageProcessor {
   "crop_size": {
     "height": 224,
     "width": 224
   },
   "do_center_crop": true,
   "do_convert_rgb": true,
   "do_normalize": true,
   "do_rescale": true,
   "do_resize": true,
   "image_mean": [
     0.48145466,
     0.4578275,
     0.40821073
   ],
   "image_processor_type": "CLIPImageProcessor",
   "image_std": [
     0.26862954,
     0.26130258,
     0.27577711
   ],
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {
     "shortest_edge": 224
   }
 }
 
 - tokenizer: CLIPTokenizerFast(name_or_path='openai/clip-vit-base-patch32', vocab_size=49408, model_max_length=77, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
 	49406: AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_wo

In [34]:
# Initialize MLflow
mlflow.set_tracking_uri(uri="http://35.209.59.178:8591")
mlflow.set_experiment("finly_search_experiments")

<Experiment: artifact_location='mlflow-artifacts:/879865724907041499', creation_time=1746677508382, experiment_id='879865724907041499', last_update_time=1746677508382, lifecycle_stage='active', name='finly_search_experiments', tags={}>

In [35]:
# Database configuration
DB_CONFIG = {
    'dbname': os.getenv('PGDATABASE', 'finly'),
    'user': os.getenv('PGUSER', 'postgres'),
    'password': os.getenv('PGPASSWORD', 'postgres'),
    'host': os.getenv('PGHOST', 'localhost'),
    'port': os.getenv('PGPORT', '5432')
}
TOP_K = 5

# Create components
components = [
    PostgresVectorRetrieval('text_embedding', DB_CONFIG),
    #FaissVectorRetrieval(index_type='text'),
    PostgresVectorRetrieval('image_embedding', DB_CONFIG),
    TextSearchRetrieval('ts_rank_cd', DB_CONFIG)
]

weights = [0, 0, 1]  # Must sum to 1

In [36]:
# Wrap the experiment in MLflow
with mlflow.start_run(run_name=f"experiment_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    # Log parameters
    component_details = []
    for c in components:
        if isinstance(c, PostgresVectorRetrieval):
            component_details.append(f"PostgresVectorRetrieval({c.column_name})")
        elif isinstance(c, FaissVectorRetrieval):
            component_details.append(f"FaissVectorRetrieval({c.index_type})")
        else:
            component_details.append(type(c).__name__)
    
    mlflow.log_params({
        "clip_model": clip_model,
        "top_k": TOP_K,
        "weights": weights,
        "components": component_details
    })
    
    # Initialize counters for each query type
    query_types = ['Basic_query', 'Attribute_query', 'Natural_query']
    results = {
        'overall': {'hits': 0, 'total': 0},
        'Basic_query': {'hits': 0, 'total': 0},
        'Attribute_query': {'hits': 0, 'total': 0},
        'Natural_query': {'hits': 0, 'total': 0}
    }
    
    # Process each query type
    for query_type in query_types:
        print(f"\nProcessing {query_type}...")
        print("-" * 80)
        
        # Process each query of this type
        for i, row in df.iterrows():
            query = row[query_type]
            target_name = row['Name']
            target_pid = row['Pid']
            
            # Generate embedding and run hybrid search
            query_embedding = generate_embedding(query_text=query)
            pids, scores = hybrid_retrieval(
                query=query,
                query_embedding=query_embedding,
                components=components,
                weights=weights,
                top_k=TOP_K
            )
            
            # Get unique Pids and their corresponding names
            unique_pids = list(dict.fromkeys(pids))
            retrieved_names = df[df['Pid'].isin(unique_pids)]['Name'].drop_duplicates().tolist()
            
            # Check if the ground truth Pid is in the results
            hit = target_pid in pids
            
            # Update counters
            results[query_type]['total'] += 1
            results['overall']['total'] += 1
            if hit:
                results[query_type]['hits'] += 1
                results['overall']['hits'] += 1
    
    # Calculate and log recall@K for each category
    for category in results:
        recall = results[category]['hits'] / results[category]['total']
        print(f"\n{category} Recall@{TOP_K}: {recall:.4f}")
        mlflow.log_metric(f"{category}_recall_at_k", recall)
        mlflow.log_metric(f"{category}_total_queries", results[category]['total'])
        mlflow.log_metric(f"{category}_total_hits", results[category]['hits'])

🏃 View run experiment_20250507_211604 at: http://35.209.59.178:8591/#/experiments/879865724907041499/runs/00aea0fa4bc149248650c4181fd629ff
🧪 View experiment at: http://35.209.59.178:8591/#/experiments/879865724907041499


AttributeError: 'PostgresVectorRetrieval' object has no attribute 'embedding_type'