In [2]:
import torch
import pandas as pd
import numpy as np
from PIL import Image
from transformers import AutoModel, AutoProcessor
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
print(f"Device is {device}")

Device is cuda


In [3]:
# Load data
df = pd.read_parquet('../data/merged_output_sample_100k.parquet')
df.shape

(100000, 30)

In [4]:
df.columns

Index(['Pid', 'Name', 'ShortDescription', 'Description', 'CategoryId',
       'Category', 'ImageURL', 'Price', 'PriceCurrency', 'SalePrice',
       'FinalPrice', 'Discount', 'isOnSale', 'IsInStock', 'Keywords', 'Brand',
       'Manufacturer', 'MPN', 'UPCorEAN', 'SKU', 'Color', 'Gender', 'Size',
       'Condition', 'AlternateImageUrl', 'AlternateImageUrl2',
       'AlternateImageUrl3', 'AlternateImageUrl4', 'DeepLinkURL', 'LinkUrl'],
      dtype='object')

In [12]:
SAMPLE_SIZE = 1000

if SAMPLE_SIZE < len(df):
    df = df.sample(n=SAMPLE_SIZE, random_state=42)
    print(f"Using {SAMPLE_SIZE} samples for evaluation")
else:
    print(f"Using all {len(df)} samples for evaluation")

# Models to test
models = {
    "CLIP-512": "openai/clip-vit-base-patch32",
    "CLIP-768": "openai/clip-vit-large-patch14",
    "SigLIP-512": "google/siglip-base-patch16-512",
    "SigLIP-1024": "google/siglip-so400m-patch14-384"
}

model_results = {
    'Model': [],
    'Accuracy (%)': [],
    'Avg Similarity': [],
    'Std Similarity': [],
    'Correct Predictions': [],
    'Total Predictions': []

}

for name, model_id in models.items():
    print(f"\nTesting: {name} — {model_id}")
    
    # Load model and processor
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id).to(device)
    model.eval()
    
    correct_predictions = 0
    total_predictions = 0
    similarities_matrix = []
    
    batch_size = 10
    num_batches = len(df) // batch_size + (1 if len(df) % batch_size != 0 else 0)
    
    for batch_start in tqdm(range(0, len(df), batch_size), total=num_batches, desc=f"Processing {name}"):
        batch_end = min(batch_start + batch_size, len(df))
        batch_df = df.iloc[batch_start:batch_end]
        
        # Process all texts in the batch
        descriptions = batch_df['Name'].tolist()
        text_inputs = processor(text=descriptions, padding=True, truncation=True, return_tensors="pt").to(device)
        
        with torch.no_grad():
            text_embeddings = model.get_text_features(**text_inputs)
            text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
        
        # Process all images in the batch
        images = []
        for pid in batch_df['Pid']:
            image_path = f"../data/images/{pid}.jpeg"
            try:
                image = Image.open(image_path).convert("RGB")
                images.append(image)
            except Exception as e:
                print(f"Error loading image {pid}: {e}")
                continue
        
        if not images:
            continue
            
        image_inputs = processor(images=images, return_tensors="pt", padding=True).to(device)
        
        with torch.no_grad():
            image_embeddings = model.get_image_features(**image_inputs)
            image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)
        
        # Calculate similarity matrix for the batch
        similarity_matrix = torch.mm(image_embeddings, text_embeddings.t())
        
        # For each image, find the most similar text
        predicted_indices = torch.argmax(similarity_matrix, dim=1)
        
        # Count correct predictions (diagonal elements should be highest)
        correct_predictions += (predicted_indices == torch.arange(len(images)).to(device)).sum().item()
        total_predictions += len(images)
        
        # Store similarities for analysis
        similarities_matrix.extend(similarity_matrix.cpu().numpy())
    
    # Calculate metrics
    similarities_array = np.array(similarities_matrix)
    accuracy = (correct_predictions / total_predictions) * 100 if total_predictions > 0 else 0
    
    # Store results in the dictionary
    model_results['Model'].append(name)
    model_results['Accuracy (%)'].append(round(accuracy, 2))
    model_results['Avg Similarity'].append(round(np.mean(similarities_array), 4))
    model_results['Std Similarity'].append(round(np.std(similarities_array), 4))
    model_results['Correct Predictions'].append(correct_predictions)
    model_results['Total Predictions'].append(total_predictions)

results_df = pd.DataFrame(model_results)
results_df.set_index('Model', inplace=True)

print("\nModel Evaluation Results:")
print(results_df)

Using 1000 samples for evaluation

Testing: CLIP-512 — openai/clip-vit-base-patch32


Processing CLIP-512:   0%|          | 0/100 [00:00<?, ?it/s]


Testing: CLIP-768 — openai/clip-vit-large-patch14


Processing CLIP-768:   0%|          | 0/100 [00:00<?, ?it/s]


Testing: SigLIP-512 — google/siglip-base-patch16-512


Processing SigLIP-512:   0%|          | 0/100 [00:00<?, ?it/s]


Testing: SigLIP-1024 — google/siglip-so400m-patch14-384


Processing SigLIP-1024:   0%|          | 0/100 [00:00<?, ?it/s]


Model Evaluation Results:
             Accuracy (%)  Avg Similarity  Std Similarity  \
Model                                                       
CLIP-512             88.8          0.1754          0.0644   
CLIP-768             91.3          0.1097          0.0719   
SigLIP-512           68.7         -0.0459          0.0706   
SigLIP-1024          94.3         -0.0296          0.0675   

             Correct Predictions  Total Predictions  
Model                                                
CLIP-512                     888               1000  
CLIP-768                     913               1000  
SigLIP-512                   687               1000  
SigLIP-1024                  943               1000  
