# SEM INDEX


In [2]:
import duckdb
import pandas as pd
import os
from lotus.dtype_extensions import ImageArray
from lotus.types import CascadeArgs, ProxyModel
import pandas as pd
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Root directory of the fashion dataset. os.getenv returns None when the variable
# is missing, which would otherwise surface as an opaque TypeError inside
# os.path.join below, so fail early with an actionable message instead.
FASHION_DATASET_DIR = os.getenv("FASHION_DATASET_DIR")
if FASHION_DATASET_DIR is None:
    raise EnvironmentError(
        "FASHION_DATASET_DIR is not set; define it in the environment or in a .env file."
    )

# Paths derived from the dataset root.
FASHION_PARQUET = os.path.join(FASHION_DATASET_DIR, "styles.parquet")
FASHION_DETAILS_PARQUET = os.path.join(FASHION_DATASET_DIR, "styles_details.parquet")
FASHION_IMAGES_DIR = os.path.join(FASHION_DATASET_DIR, "images")
DATASET_CAPTION_DB = os.path.join(FASHION_DATASET_DIR, "fashion_dataset_caps_blip-image-captioning-large.db")

# Sampling controls for the styles table. The percentage is also embedded in the
# index directory names used by the indexing cell further down.
sample_size_percentage = 100
seed = 80

# Sample the styles table, join it with the details table on id, and keep the
# columns needed by the rest of the notebook, ordered by id.
df = duckdb.query(f"""
with images as (
    SELECT *
    FROM parquet_scan('{FASHION_PARQUET}')
    USING SAMPLE {sample_size_percentage} PERCENT (reservoir, {seed})
    )
    select
     images.id ,images.subcategory, images.articletype, images.basecolour, details.price, images.productDisplayName,
     -- styleimages.default.resolutions."360X480"  as imageURL
     styleimages.default.imageURL  as imageURL
    -- *
    from images, parquet_scan('{FASHION_DETAILS_PARQUET}') details
    where images.id = details.id
    -- and details.price <1000
    order by images.id
""").to_df()

# "image": local file path <images dir>/<id>.jpg for every row, wrapped as an ImageArray.
df["image"] = ImageArray(df["id"].apply(lambda i: os.path.join(FASHION_IMAGES_DIR, f"{int(i)}.jpg")))
# "image_url": the imageURL column from the details table, wrapped the same way.
df["image_url"] = ImageArray(df["imageURL"])




#### Creating the index

In [3]:
from lotus.fts_store.db_fts_store import SQLiteFTSStore
from lotus.vector_store import FaissVS
import lotus
from lotus.models import LM, SentenceTransformersRM

# Language models: gpt-4o is registered as the main LM, gpt-4o-mini as the helper.
gpt_4o = LM("gpt-4o")
gpt_4o_mini = LM("gpt-4o-mini")

# CLIP embedding model – works for both text & image.
# Alternative kept for reference: SentenceTransformersRM(model="clip-ViT-B-32")
rm = SentenceTransformersRM(model="clip-ViT-L-14", max_batch_size=32)

# Storage backends handed to lotus: FAISS vector store and SQLite FTS store.
vector_store = FaissVS()
caption_store = SQLiteFTSStore()

lotus.settings.configure(
    lm=gpt_4o,
    helper_lm=gpt_4o_mini,
    rm=rm,
    vs=vector_store,
    cs=caption_store,
)

2025-08-25 17:28:58,410 - INFO - Load pretrained SentenceTransformer: clip-ViT-L-14
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
# Build one semantic index per column; each index directory is named
# "<column>_<sample_size_percentage>_index".
for indexed_column in ("image", "productDisplayName"):
    df = df.sem_index(indexed_column, index_dir=f"{indexed_column}_{sample_size_percentage}_index")





In [6]:
# Alternative kept for reference (presumably re-attaches already built indexes
# instead of rebuilding them — confirm against the lotus fork in use):
#   df = df.load_sem_index("image", index_dir=f"image_{sample_size_percentage}_index")
#   df = df.load_sem_index("image_url", index_dir=f"image_{sample_size_percentage}_index")
#   df = df.load_sem_index("productDisplayName", index_dir=f"productDisplayName_{sample_size_percentage}_index")

# Attach the caption database to the "image" column, then load that caption index.
df = df.sem_captions_index.attach_index("image", index_dir=DATASET_CAPTION_DB)
df = df.sem_captions_index.load("image")

# Two chained caption searches: the second one runs on the result of the first.
CAPTION_TOP_K = 500
shirt_hits = df.sem_captions_index.search("white man shirt", "image", K=CAPTION_TOP_K)
df_f = shirt_hits.sem_captions_index.search("jeans", "image", K=CAPTION_TOP_K)


# Full LLM calls

In [9]:
# Other full-LLM filters kept for reference (not executed):
# merged_df_filtered_white_tshirts = df.sem_filter("{_image} is a product of white T-shirt", return_stats=False)
# merged_df_filtered_socks = df.sem_filter("{_image} is a product of a sock", return_stats=False)
# merged_df_filtered_wallet = df.sem_filter("{_image} shows a wallet", return_stats=False)

# Filter without cascade arguments, using the "image_url" column.
# NOTE(review): the result name says "black_footwear" but the prompt is "tshirt";
# the name is kept because the evaluation cell below reads it.
full_llm_prompt = "tshirt"
merged_df_filtered_black_footwear = df.sem_filter(
    full_llm_prompt,
    col_li=["image_url"],
    return_stats=False,
)


Filtering: 100%|██████████ 50/50 LM calls [00:07<00:00,  6.46it/s]


# Binary search filter

In [8]:

# Cascade configuration: embedding model as proxy, 10% sampling,
# recall and precision targets both at 0.9.
cascade_args = CascadeArgs(
    proxy_model=ProxyModel.EMBEDDING_MODEL,
    sampling_percentage=0.1,
    recall_target=0.9,
    precision_target=0.9,
)

# Same "tshirt" prompt as the full-LLM filter, now with the cascade and find_top_k enabled.
# NOTE(review): later cells call DataFrame accessors on merged_df_filtered; confirm that
# sem_filter with return_stats=True and find_top_k=True returns a DataFrame here rather
# than a (df, stats) tuple.
merged_df_filtered = df.sem_filter(
    "tshirt",
    col_li=["image_url"],
    cascade_args=cascade_args,
    return_stats=True,
    find_top_k=True,
)


Filtering: 100%|██████████ 1/1 LM calls [00:04<00:00,  4.78s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:03<00:00,  3.38s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:03<00:00,  3.66s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:04<00:00,  4.31s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:08<00:00,  8.60s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:03<00:00,  3.54s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:06<00:00,  6.16s/it]


In [11]:

from join_optimizer.lotus.evaluate import evaluate_filter

# Compare the cascade result against the full-LLM filter result
# (presumably treated as the reference set by evaluate_filter — confirm).
evaluation_inputs = dict(
    dataset_df=merged_df_filtered_black_footwear,
    filtered_df=merged_df_filtered,
    article_type=None,
    base_colour=None,
)
metrics, FP, FN = evaluate_filter(**evaluation_inputs)
print(metrics)

{'TP': 11, 'FP': 1, 'FN': 1, 'precision': 0.9166666666666666, 'recall': 0.9166666666666666, 'f1': 0.9166666666666666}


#### Sampling

In [None]:
import sys
import importlib
%load_ext autoreload
%autoreload 2

importlib.reload(sys.modules['lotus.sem_ops.cascade_utils'])
importlib.reload(sys.modules['lotus.sem_ops.sem_filter'])
importlib.reload(sys.modules['lotus'])
importlib.reload(sys.modules['pandas'])
import lotus.sem_ops.sem_filter
import lotus

# Ensure you import the module (not just the function) so autoreload can update it:
import lotus.sem_ops.sem_filter as sem_filter_mod


In [None]:
# Cascade settings for the importance-sampling variant of the filter.
sampling_cascade_settings = dict(
    recall_target=0.95,
    precision_target=0.9,
    sampling_percentage=0.1,
    proxy_model=ProxyModel.EMBEDDING_MODEL,
    cascade_IS_weight=1,
    cascade_num_calibration_quantiles=100,
    failure_probability=0.1,
    cascade_IS_random_seed=114,
    cascade_IS_max_sample_range=444,
)
cascade_args = CascadeArgs(**sampling_cascade_settings)

# Wallet filter with the cascade enabled and find_top_k switched off.
merged_df_filtered_big = df.sem_filter(
    "{_image} shows a wallet",
    cascade_args=cascade_args,
    return_stats=False,
    find_top_k=False,
)


In [None]:
# Explicit import of the one helper this cell uses, from the same module as the
# earlier evaluation cell (replaces a star import from a different module path).
# NOTE(review): confirm no other name from the star import was relied on.
from join_optimizer.lotus.evaluate import evaluate_filter

# Reference result for the wallet predicate: the same prompt as the cascade run,
# but without cascade arguments. Defined here because no executed cell assigns
# merged_df_filtered_wallet, so a fresh kernel would otherwise hit a NameError.
merged_df_filtered_wallet = df.sem_filter("{_image} shows a wallet", return_stats=False)

# Compare the cascade result (merged_df_filtered_big) against that reference.
metrics, FP, FN = evaluate_filter(
    dataset_df=merged_df_filtered_wallet,
    filtered_df=merged_df_filtered_big,
    article_type=None,
    base_colour=None,
)
print(metrics)

#### Just sem_search (similarity search)

In [None]:
# Similarity search over productDisplayName, returning the scores as well.
# Longer alternative query kept for reference:
# "You will receive an image of a product. Determine whether the product can be worn on the feet, like shoes, sandals, flip-flops, ... The predominant color of the depicted product should be black. If there are multiple products in the picture, always refer to the most promiment one."
search_query = "black wallet"
sim_df_with_scores = df.sem_search(
    "productDisplayName",
    search_query,
    K=10,
    return_scores=True,
    suffix="_similarity_score",
)


#### Joins

In [None]:
# Similarity join of the filtered rows against the full frame: left '_image'
# against right 'productDisplayName', one match per left row (K=1).
sim_join_options = dict(left_on='_image', right_on='productDisplayName', K=1, keep_index=True)
res_sim_join = merged_df_filtered.sem_sim_join(df, **sim_join_options)

In [None]:
from lotus.types import CascadeArgs, ProxyModel

# Natural-language join predicate over the two column placeholders.
expr = "Does {productDisplayName} exactly and precisely match the {_image}?"

# Join cascade: embedding proxy, 4% sampling, recall 0.9 / precision 0.75.
cascade_args = CascadeArgs(
    proxy_model=ProxyModel.EMBEDDING_MODEL,
    sampling_percentage=0.04,
    precision_target=0.75,
    recall_target=0.9,
)

# With return_stats=True the result is indexed as res[0] by the top-k cell below.
res = merged_df_filtered.sem_join(
    df,
    expr,
    cascade_args=cascade_args,
    return_stats=True,
)

# Reporting snippets kept for reference (they use names not defined in this notebook):
# print(f"Joined {df.shape[0]} rows from df1 with {filtered_df2.shape[0]} rows from df2")
# print(f"    Join cascade took {stats['join_resolved_by_large_model']} LM calls")
# print(f"    Helper resolved {stats['join_resolved_by_helper_model']} LM calls")
# print(f"Join cascade used {stats['total_LM_calls']} LM calls in total")
# print(f"Naive join would require {df.shape[0]*df2.shape[0]} LM calls")


In [None]:
# Do not reset `res` here: the next cell indexes it as res[0] to run sem_topk,
# which fails with a TypeError once `res` is an int. Preview the joined frame instead.
res[0].head()

In [None]:

# Ranking prompt applied within each productDisplayName group.
expr = "given {productDisplayName}, which {_image} does more exactly and precisely match?"

# First element of the sem_join result is the frame to rank.
joined_frame = res[0]
topk_options = dict(
    K=1,
    group_by=["productDisplayName"],
    method="quick",
    return_stats=True,
)
ranked, stats = joined_frame.sem_topk(expr, **topk_options)
ranked


In [None]:
# Keep only rows whose `_id` equals `id`, for both the top-k join and the similarity join.
same_id_filter = '_id == id'
join_topk = ranked.query(same_id_filter)
sim_1 = res_sim_join.query(same_id_filter)