# SEM INDEX


In [20]:
import duckdb
import pandas as pd
import os
from lotus.dtype_extensions import ImageArray
from lotus.types import CascadeArgs, ProxyModel
import pandas as pd
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

OFF_DATASET_DIR = os.getenv("OFF_DATASET_DIR")
if OFF_DATASET_DIR is None:
    # Fail early with an actionable message; otherwise os.path.join(None, ...)
    # below raises an opaque TypeError.
    raise RuntimeError(
        "OFF_DATASET_DIR is not set; define it in the environment or in a .env file."
    )

# All dataset artefacts live under OFF_DATASET_DIR.
OFF_PARQUET = os.path.join(OFF_DATASET_DIR, "products.parquet")
OFF_IMAGES_DIR = os.path.join(OFF_DATASET_DIR, "images")
INDEX_DB = os.path.join(OFF_DATASET_DIR, "off_uk_top2000_with_images_caps_instructblip-flan-t5-xl.db")

# Reservoir sampling with a fixed seed keeps the sample reproducible; the
# percentage is also embedded in the semantic index directory name further down.
sample_size_percentage = 100
seed = 80
df = duckdb.query(f"""
    SELECT *
    FROM parquet_scan('{OFF_PARQUET}')

    USING SAMPLE {sample_size_percentage} PERCENT (reservoir, {seed})
    ORDER BY code ASC

""").to_df()

# "image": local JPEG path derived from the product code.
# "image_url": the front image URL that ships with the dataset.
# Both are wrapped in lotus' ImageArray extension type.
df["image"] = ImageArray(df["code"].apply(lambda code: os.path.join(OFF_IMAGES_DIR, f"{code}.jpg")))
df["image_url"] = ImageArray(df["image_front_url"])




#### Creating the index

In [21]:
from lotus.fts_store.db_fts_store import SQLiteFTSStore
from lotus.vector_store import FaissVS
import lotus
from lotus.models import LM, SentenceTransformersRM

# Language models: gpt-4o is registered as the main LM, gpt-4o-mini as the helper LM.
gpt_4o_mini = LM("gpt-4o-mini")
gpt_4o = LM("gpt-4o")

# CLIP embedding model - works for both text & image.
# Smaller alternative kept for reference:
# rm  = SentenceTransformersRM(model="clip-ViT-B-32")
rm = SentenceTransformersRM(model="clip-ViT-L-14", max_batch_size=16)

# Storage backends: a Faiss vector store and an SQLite full-text store.
vector_store = FaissVS()
fts_store = SQLiteFTSStore()

lotus.settings.configure(
    lm=gpt_4o,
    helper_lm=gpt_4o_mini,
    rm=rm,
    vs=vector_store,
    cs=fts_store,
)

2025-08-25 20:24:51,117 - INFO - Load pretrained SentenceTransformer: clip-ViT-L-14


In [None]:
# Build the semantic index for the "image" column and store it inside the
# dataset directory; the sample percentage is part of the directory name.
image_index_dir = f"{OFF_DATASET_DIR}/image{sample_size_percentage}_index"
df = df.sem_index("image", index_dir=image_index_dir)



In [24]:
# Re-attach the semantic index that was written for the "image" column.
image_index_dir = f"{OFF_DATASET_DIR}/image{sample_size_percentage}_index"
df = df.load_sem_index("image", index_dir=image_index_dir)
# df = df.load_sem_index("image_url", index_dir=f"image_{sample_size_percentage}_index")
# df = df.load_sem_index("productDisplayName", index_dir=f"productDisplayName_{sample_size_percentage}_index")

# Attach the captions database to the "image" column, then load it.
# The accessor is looked up again on each freshly returned frame.
df = df.sem_captions_index.attach_index("image", index_dir=INDEX_DB)
df = df.sem_captions_index.load("image")

# Caption search: keep up to K=200 rows for the term "vegetarian".
df_f = df.sem_captions_index.search("vegetarian", "image", K=200)


# Full LLM calls

In [None]:
# Earlier experiments, kept for reference:
# merged_df_filtered_white_tshirts = df.sem_filter("{_image} is a product of white T-shirt", return_stats=False)
# merged_df_filtered_socks = df.sem_filter("{_image} is a product of a sock", return_stats=False)
# merged_df_filtered_wallet = df.sem_filter("{_image} shows a wallet", return_stats=False)

# No cascade_args are passed, so presumably every row is judged by the LM.
# NOTE(review): the variable name says "black footwear" but the predicate is
# "dark chocolate"; the name is kept because the evaluation cell refers to it.
merged_df_filtered_black_footwear = df.sem_filter(
    "dark chocolate",
    col_li=["image_url"],
    return_stats=False,
)


# Binary search filter

In [None]:

# Cascade configuration: the embedding model acts as the cheap proxy, with
# 90% recall and precision targets and a 0.1 sampling percentage.
cascade_args = CascadeArgs(
    recall_target=0.9,
    precision_target=0.9,
    sampling_percentage=0.1,
    proxy_model=ProxyModel.EMBEDDING_MODEL,
)

# With return_stats=True, sem_filter returns a (filtered_df, stats) pair.
# Unpack it so that merged_df_filtered is a DataFrame: the evaluation and join
# cells call DataFrame methods on it and pass it as `filtered_df`.
# NOTE(review): the predicate here ("tshirt") differs from the full-LM run it
# is evaluated against ("dark chocolate") - confirm this is intended.
merged_df_filtered, filter_stats = df.sem_filter(
    "tshirt",
    col_li=["image_url"],
    cascade_args=cascade_args,
    return_stats=True,
    find_top_k=True,
)


In [None]:

from join_optimizer.lotus.evaluate import evaluate_filter

# Score the cascade-filtered rows against the full-LM filter result.
# evaluate_filter yields three values, unpacked as metrics, FP and FN.
evaluation = evaluate_filter(
    dataset_df=merged_df_filtered_black_footwear,
    filtered_df=merged_df_filtered,
    article_type=None,
    base_colour=None,
)
metrics, FP, FN = evaluation
print(metrics)

#### Sampling

In [None]:
# Development helper cell: pick up local edits to the lotus sources without
# restarting the kernel.
import sys
import importlib
# IPython magics: enable autoreload in mode 2.
%load_ext autoreload
%autoreload 2

# Force-reload modules that are already imported. Each lookup raises KeyError
# if the module has not been imported yet in this kernel.
importlib.reload(sys.modules['lotus.sem_ops.cascade_utils'])
importlib.reload(sys.modules['lotus.sem_ops.sem_filter'])
importlib.reload(sys.modules['lotus'])
# NOTE(review): reloading pandas itself is unusual - confirm it is needed.
importlib.reload(sys.modules['pandas'])
import lotus.sem_ops.sem_filter
import lotus

# Ensure you import the module (not just the function) so autoreload can update it:
import lotus.sem_ops.sem_filter as sem_filter_mod


In [None]:
# Cascade configuration for the sampling experiment: stricter recall target
# plus explicit importance-sampling (IS) and calibration settings.
cascade_args = CascadeArgs(
    recall_target=0.95,
    precision_target=0.9,
    sampling_percentage=0.1,
    proxy_model=ProxyModel.EMBEDDING_MODEL,
    cascade_IS_weight=1,
    cascade_num_calibration_quantiles=100,
    failure_probability=0.1,
    cascade_IS_random_seed=114,
    cascade_IS_max_sample_range=444,
)

# NOTE(review): the predicate refers to an "_image" column, while the setup
# cell only creates "image" and "image_url" - confirm the column exists.
merged_df_filtered_big = df.sem_filter(
    "{_image} shows a wallet",
    cascade_args=cascade_args,
    return_stats=False,
    find_top_k=False,
)


In [None]:
# Import only the name this cell uses; a wildcard import would silently
# overwrite any notebook variable that shares a name with the module's contents.
from join_optimizer.join_optimizer.productDisplayName_index.evaluate import evaluate_filter

# Score the sampled cascade filter against the full-LM "wallet" filter result.
# NOTE(review): merged_df_filtered_wallet is only assigned in a commented-out
# line of the "Full LLM calls" cell; run that filter first or this raises NameError.
metrics, FP, FN = evaluate_filter(
    dataset_df=merged_df_filtered_wallet,
    filtered_df=merged_df_filtered_big,
    article_type=None,
    base_colour=None,
)
print(metrics)

#### Just sim_search

In [25]:
# sem_search with similarity scores returned
# Semantic search over the "image" column of df_f: keep the 10 best matches
# for the text query and return their similarity scores alongside, using the
# "_similarity_score" suffix.
search_query = "sausage"
# Earlier, longer prompt kept for reference:
# "You will receive an image of a product. Determine whether the product can be worn on the feet, like shoes, sandals, flip-flops, ... The predominant color of the depicted product should be black. If there are multiple products in the picture, always refer to the most prominent one."
sim_df_with_scores = df_f.sem_search(
    "image",
    search_query,
    K=10,
    return_scores=True,
    suffix="_similarity_score",
)


#### Joins

In [None]:
# Similarity join: for each row of the filtered frame, take its single closest
# match (K=1) in df, comparing "_image" on the left with "productDisplayName"
# on the right.
# NOTE(review): neither column is created in the visible cells - confirm both
# exist in the loaded data.
res_sim_join = merged_df_filtered.sem_sim_join(
    df,
    left_on="_image",
    right_on="productDisplayName",
    K=1,
    keep_index=True,
)

In [None]:
from lotus.types import CascadeArgs, ProxyModel

# Join predicate evaluated for candidate (productDisplayName, _image) pairs.
expr = "Does {productDisplayName} exactly and precisely match the {_image}?"

# Looser precision target and a smaller sample than the filter cascades.
cascade_args = CascadeArgs(
    recall_target=0.9,
    precision_target=0.75,
    sampling_percentage=0.04,
    proxy_model=ProxyModel.EMBEDDING_MODEL,
)

# return_stats=True is requested; the top-k cell reads the joined frame as res[0].
res = merged_df_filtered.sem_join(
    df,
    expr,
    cascade_args=cascade_args,
    return_stats=True,
)

# print(f"Joined {df.shape[0]} rows from df1 with {filtered_df2.shape[0]} rows from df2")
# print(f"    Join cascade took {stats['join_resolved_by_large_model']} LM calls")
# print(f"    Helper resolved {stats['join_resolved_by_helper_model']} LM calls")
# print(f"Join cascade used {stats['total_LM_calls']} LM calls in total")
# print(f"Naive join would require {df.shape[0]*df2.shape[0]} LM calls")


In [None]:
# Rebind `res` to a placeholder, dropping the reference to the join result.
# NOTE(review): the following top-k cell reads res[0]; after this cell runs,
# the sem_join cell has to be re-run first, otherwise res[0] raises TypeError.
res = 0

In [None]:

# Ranking prompt: for each product name, pick the best-matching image.
expr = "given {productDisplayName}, which {_image} does more exactly and precisely match?"

# res[0] is the joined frame; keep the single best row (K=1) per
# productDisplayName group using the "quick" method, and collect the stats.
topk_result = res[0].sem_topk(
    expr,
    K=1,
    group_by=["productDisplayName"],
    method="quick",
    return_stats=True,
)
ranked, stats = topk_result
ranked


In [None]:
# Keep only the rows whose "_id" and "id" columns agree, for both the
# join + top-k result and the plain similarity join.
same_id = "_id == id"
join_topk = ranked.query(same_id)
sim_1 = res_sim_join.query(same_id)