In [3]:
import torch
import pandas as pd
import numpy as np
import shutil
from pathlib import Path
import tempfile
from PIL import Image
from transformers import AutoModel, AutoProcessor
from tqdm.auto import tqdm
import pyarrow.parquet as pq

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def calculate_embeddings(df, save_path='embeddings.npz', batch_size=100, image_dir="../data/images"):
    """
    Computes L2-normalised text and image embeddings for every product whose image can be
    loaded and embedded, then saves them to an ``.npz`` archive.

    Images are embedded first. Rows whose image cannot be opened, or whose whole image batch
    fails inside the model, are dropped, so the text embeddings are only computed for the
    surviving rows and the three outputs stay aligned element by element.

    Relies on the notebook-level globals ``processor``, ``model`` and ``device``; they must
    be defined before this function is called.

    Args:
        df (pd.DataFrame): DataFrame containing the 'Pid' and 'Name' columns.
        save_path (str): Destination passed to ``np.savez``.
        batch_size (int): Number of samples sent through the model per forward pass.
        image_dir (str): Directory holding one ``<Pid>.jpeg`` file per product.

    Returns:
        tuple: ``(text_embeddings, image_embeddings, product_ids)`` as three lists of equal
            length; the embedding lists hold one NumPy vector per product.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    text_embeddings = []
    image_embeddings = []
    product_ids = []

    # Positional (iloc) indices of the rows whose image was embedded successfully
    valid_indices = []

    # Batch image embedding first to determine which samples are valid
    image_paths = [f"{image_dir}/{pid}.jpeg" for pid in df['Pid'].tolist()]
    total_image_batches = (len(image_paths) + batch_size - 1) // batch_size

    for batch_num, i in enumerate(range(0, len(image_paths), batch_size), 1):
        batch_images = []
        batch_valid_indices = []

        for idx, path in enumerate(image_paths[i:i+batch_size]):
            try:
                # The context manager releases the file handle even when decoding fails;
                # convert() returns an independent in-memory copy that outlives it.
                with Image.open(path) as raw_image:
                    image = raw_image.convert("RGB")
                batch_images.append(image)
                batch_valid_indices.append(i + idx)  # Store the global index
            except Exception as e:
                print(f"Skipping problematic image {path}: {e}")

        if batch_images:
            try:
                # Process images using the CLIP processor
                inputs = processor(
                    images=batch_images,
                    return_tensors="pt",
                    padding=True
                ).to(device)

                with torch.no_grad():
                    batch_features = model.get_image_features(**inputs)
                    batch_features /= batch_features.norm(dim=-1, keepdim=True)

                # Record the indices only after the embeddings were stored, so both
                # lists always have the same length.
                image_embeddings.extend(batch_features.cpu().numpy())
                valid_indices.extend(batch_valid_indices)

            except Exception as e:
                print(f"Error processing batch {batch_num}: {e}")
                # Skip the problematic batch
                continue

        print(f"\rImage embedding batch {batch_num}/{total_image_batches} processed", end='', flush=True)

    print(f"\nProcessed {len(valid_indices)} valid images out of {len(image_paths)} total images")

    # Now process text only for valid indices
    # NOTE(review): assumes 'Name' is a non-null string for every surviving row - confirm upstream.
    texts = df['Name'].iloc[valid_indices].tolist()
    ids = df['Pid'].iloc[valid_indices].tolist()

    total_text_batches = (len(texts) + batch_size - 1) // batch_size
    for batch_num, i in enumerate(range(0, len(texts), batch_size), 1):
        batch_texts = texts[i:i+batch_size]
        inputs = processor(text=batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            batch_features = model.get_text_features(**inputs)
            batch_features /= batch_features.norm(dim=-1, keepdim=True)

        text_embeddings.extend(batch_features.cpu().numpy())
        product_ids.extend(ids[i:i+batch_size])
        print(f"\rText embedding batch {batch_num}/{total_text_batches} processed", end='', flush=True)

    print(f"\nFinal dataset size: {len(text_embeddings)} pairs")

    # Save
    np.savez(save_path,
             text_embeddings=np.array(text_embeddings),
             image_embeddings=np.array(image_embeddings),
             product_ids=np.array(product_ids))

    return text_embeddings, image_embeddings, product_ids

In [7]:
def create_description(row, column_names):
    """
    Combines values from specified columns of a Pandas DataFrame row into a descriptive text,
    excluding columns with NaN values for that row.

    The sentence order is fixed (name with color/brand, price, gender, size) and does not
    depend on the order of ``column_names``. Only the columns 'Name', 'Color', 'Brand' /
    'MergedBrand', 'Price', 'PriceCurrency', 'Gender' and 'Size' contribute text. The price
    is mentioned only when both 'Price' and 'PriceCurrency' are present, and color/brand are
    mentioned only together with the name.

    Args:
        row (pd.Series): A single row from a Pandas DataFrame.
        column_names (list): Column names that may be used in the description. Every name
            must exist in ``row``.

    Returns:
        str: The sentences joined by ". " (without a trailing period), or an empty string if
            none of the requested columns yields any text.
    """
    available = {col for col in column_names if pd.notna(row[col])}
    if not available:
        return ""  # Return empty string if all columns are NaN

    # The brand may be stored under either column name; "Brand" wins when both are usable.
    brand_col = next((col for col in ("Brand", "MergedBrand") if col in available), None)

    sentences = []

    if "Name" in available:
        sentence = f"The product name is {row['Name']}"
        if "Color" in available:
            sentence += f" with color {row['Color']}"
        if brand_col is not None:
            sentence += f" from brand {row[brand_col]}"
        sentences.append(sentence)

    if "Price" in available and "PriceCurrency" in available:
        sentences.append(f"The price is {row['Price']} {row['PriceCurrency']}")

    if "Gender" in available:
        sentences.append(f"It is designed for {row['Gender']}")

    if "Size" in available:
        sentences.append(f"The size is {row['Size']}")

    # Joining (instead of prefixing every sentence with ". ") avoids a stray leading
    # separator when the name is missing.
    return ". ".join(sentences)



In [None]:
def zip_product_images(df, output_zip_path="product_images.zip", image_dir="../data/images"):
    """
    Creates a zip file containing all product images that exist in the image directory.

    Every ``<Pid>.jpeg`` that is found is copied into a temporary staging directory, which
    is then archived; Pids without an image file are skipped and only counted as missing.

    Args:
        df (pd.DataFrame): DataFrame containing the 'Pid' column
        output_zip_path (str): Path where the zip file should be saved. A trailing ".zip"
            is optional; the archive is always written with that extension.
        image_dir (str): Directory that holds the ``<Pid>.jpeg`` files
    """
    output_zip_path = str(output_zip_path)
    # make_archive appends ".zip" itself, so strip it - but only as the final extension,
    # never from the middle of the path.
    if output_zip_path.lower().endswith(".zip"):
        archive_base = output_zip_path[:-len(".zip")]
    else:
        archive_base = output_zip_path

    source_dir = Path(image_dir)

    # Get the list of Pids from the DataFrame
    pids = df['Pid'].tolist()

    with tempfile.TemporaryDirectory() as temp_dir:
        staging_dir = Path(temp_dir)

        # Counter for existing images
        existing_count = 0

        # Copy existing images to temp directory
        for pid in pids:
            file_name = f"{pid}.jpeg"
            src_path = source_dir / file_name
            if src_path.is_file():
                shutil.copy2(src_path, staging_dir / file_name)
                existing_count += 1

        print(f"Found {existing_count} existing images out of {len(pids)} Pids")

        # Create zip file
        shutil.make_archive(archive_base, 'zip', temp_dir)

        print(f"Created zip file: {archive_base}.zip")

In [5]:
# Set device: prefer CUDA, then Apple's MPS backend, otherwise fall back to the CPU.
# torch.backends.mps.is_available() is the documented availability check for MPS.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Device is {device}")

SAMPLE_SIZE = 100000

# Load data
df = pd.read_parquet("../data/filtered_data.parquet", engine="pyarrow")
# Randomly sample up to SAMPLE_SIZE rows with a fixed seed. The count is capped at the
# table size because DataFrame.sample raises when asked for more rows than exist
# (sampling without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
print(f"Loaded {len(df)} rows of data")



Device is mps
Loaded 100000 rows of data


In [None]:
# Columns that may contribute to the natural-language description of each product.
column_names = ['Name', 'Price', 'PriceCurrency', 'MergedBrand', 'Color', 'Gender', 'Size']
# Build one description per row; column_names is forwarded to create_description via args.
df["combined"] = df.apply(create_description, axis=1, args=(column_names,))
df.to_csv('samples.csv')
# The image archive is built by the dedicated zip_product_images(df) cell that follows;
# calling it here as well would create the same archive twice on "Run All".

In [None]:
# Bundle the images of the sampled products into a zip archive
# (written to the function's default output path, product_images.zip).
zip_product_images(df)

In [None]:
# Pretrained CLIP checkpoint: the processor prepares the text/image inputs,
# the model produces the embeddings.
model_id = "openai/clip-vit-base-patch32"
processor = AutoProcessor.from_pretrained(model_id)
# Move the weights to the selected device and switch to inference mode in one chain
# (eval() returns the module itself).
model = AutoModel.from_pretrained(model_id).to(device).eval()

# Embed every product of the sampled frame; results are also saved by calculate_embeddings
text_embeddings, image_embeddings, product_ids = calculate_embeddings(df)