# SEM INDEX


In [24]:
import os

import duckdb
from dotenv import load_dotenv
import pandas as pd
from lotus.dtype_extensions import ImageArray
from lotus.types import CascadeArgs, ProxyModel

# Pull FASHION_DATASET_DIR (and any other settings) from a local .env file, if present.
load_dotenv()

# When False, the semantic indexes are (re)built before they are loaded; when True they are
# only loaded from disk.
LOAD_INDEX = True

FASHION_DATASET_DIR = os.getenv("FASHION_DATASET_DIR")
if not FASHION_DATASET_DIR:
    # Fail fast: os.path.join(None, ...) would otherwise raise an unhelpful TypeError.
    raise RuntimeError(
        "FASHION_DATASET_DIR is not set; define it in the environment or in the .env file."
    )

# Every dataset artefact lives under FASHION_DATASET_DIR.
FASHION_PARQUET = os.path.join(FASHION_DATASET_DIR, "styles.parquet")
FASHION_DETAILS_PARQUET = os.path.join(FASHION_DATASET_DIR, "styles_details.parquet")
FASHION_IMAGES_DIR = os.path.join(FASHION_DATASET_DIR, "images")
DATASET_CAPTION_DB_BLIP = os.path.join(FASHION_DATASET_DIR, "fashion_dataset_caps_blip-image-captioning-large.db")
DATASET_CAPTION_DB_INSTRUCTBLIP = os.path.join(FASHION_DATASET_DIR, "fashion_dataset_caps_instructblip-flan-t5-xl.db")

# Percentage of styles.parquet to sample, and the seed shared by the SQL reservoir sample
# and the later pandas sub-sample.
sample_size_percentage = 100
seed = 80

# Sample the styles table, join it with the details table on id, and keep only the columns
# used by the experiments, ordered by id.
df = duckdb.query(f"""
with images as (
    SELECT *
    FROM parquet_scan('{FASHION_PARQUET}')
    USING SAMPLE {sample_size_percentage} PERCENT (reservoir, {seed})
    )
    select
     images.id ,images.subcategory, images.articletype, images.basecolour, details.price, images.productDisplayName,
     -- styleimages.default.resolutions."360X480"  as imageURL
     styleimages.default.imageURL  as imageURL
    -- *
    from images, parquet_scan('{FASHION_DETAILS_PARQUET}') details
    where images.id = details.id
    -- and details.price <1000
    order by images.id
""").to_df()

# "image" points at the local <id>.jpg file, "image_url" at the remote URL from the details table.
df["image"] = ImageArray(df["id"].apply(lambda i: os.path.join(FASHION_IMAGES_DIR, f"{int(i)}.jpg")))
df["image_url"] = ImageArray(df["imageURL"])




#### Creating the index

In [25]:
from lotus.fts_store.db_fts_store import SQLiteFTSStore
from lotus.vector_store import FaissVS
import lotus
from lotus.models import LM, SentenceTransformersRM

# Two OpenAI chat models: gpt-4o is the main LM, gpt-4o-mini is registered as the helper LM.
gpt_4o = LM("gpt-4o")
gpt_4o_mini = LM("gpt-4o-mini")

# CLIP embedding model, usable for both text and images.
# Alternative checkpoint tried earlier: clip-ViT-B-32.
rm = SentenceTransformersRM(model="clip-ViT-L-14", max_batch_size=32)

# Register models and stores globally so the sem_* DataFrame operators pick them up.
lotus.settings.configure(
    lm=gpt_4o,
    helper_lm=gpt_4o_mini,
    rm=rm,
    vs=FaissVS(),
    cs=SQLiteFTSStore(),
)

2025-08-26 16:16:01,934 - INFO - Load pretrained SentenceTransformer: clip-ViT-L-14


In [26]:
if not LOAD_INDEX:
    # Build the semantic indexes from scratch; the directory name encodes the sample percentage.
    image_index_dir = f"{FASHION_DATASET_DIR}/image_{sample_size_percentage}_index"
    name_index_dir = f"{FASHION_DATASET_DIR}/productDisplayName_{sample_size_percentage}_index"

    df = df.sem_index("image", index_dir=image_index_dir)
    df = df.sem_index("productDisplayName", index_dir=name_index_dir)



In [27]:
# Attach the on-disk indexes to their columns.
image_index_dir = f"{FASHION_DATASET_DIR}/image_{sample_size_percentage}_index"
name_index_dir = f"{FASHION_DATASET_DIR}/productDisplayName_{sample_size_percentage}_index"

df = df.load_sem_index("image", index_dir=image_index_dir)
# NOTE(review): image_url reuses the index built for "image" — presumably because both
# columns refer to the same pictures; confirm.
df = df.load_sem_index("image_url", index_dir=image_index_dir)
df = df.load_sem_index("productDisplayName", index_dir=name_index_dir)

Take just a few samples for testing purposes.

In [28]:
# Keep at most 1000 rows for the experiments. Capping at len(df) avoids the ValueError that
# DataFrame.sample raises when asked for more rows than exist (e.g. with a low
# sample_size_percentage).
df = df.sample(n=min(1000, len(df)), random_state=seed)

# Prompt

In [29]:
# Natural-language predicate used by every filter and caption-search variant in this notebook.
prompt = "dark formal clothes"

# Full LLM calls

In [30]:
# Reference run: filter on image_url without cascade arguments. The result is passed as
# dataset_df to every evaluate_filter call in this notebook.
df_res_llm = df.sem_filter(prompt, col_li=["image_url"], return_stats=False)


Filtering:   0%|           0/1000 LM calls [00:00<?, ?it/s]2025-08-26 16:16:04,008 - INFO - Retrying request to /chat/completions in 0.475701 seconds
Filtering: 100%|██████████ 1000/1000 LM calls [00:32<00:00, 30.63it/s]


# Binary search filter

In [31]:

# Cascade configuration for the binary-search variant, with the embedding model as proxy.
cascade_args = CascadeArgs(
    recall_target=0.9,
    precision_target=0.9,
    sampling_percentage=0.1,
    proxy_model=ProxyModel.EMBEDDING_MODEL,
)

binary_search_result = df.sem_filter(prompt, col_li=["image_url"], cascade_args=cascade_args,
                                     return_stats=True, find_top_k=True)

# With return_stats=True upstream lotus returns (DataFrame, stats). Unpack defensively so
# df_res_binary_s is always the DataFrame that evaluate_filter expects, whichever shape this
# lotus build returns.
if isinstance(binary_search_result, tuple):
    df_res_binary_s, binary_search_stats = binary_search_result
else:
    df_res_binary_s, binary_search_stats = binary_search_result, None


Filtering: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.67s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:00<00:00,  1.10it/s]
Filtering: 100%|██████████ 1/1 LM calls [00:01<00:00,  1.97s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:01<00:00,  1.04s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.96s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.00s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.73s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.13s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.54s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.16s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:01<00:00,  1.45s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:01<00:00,  1.77s/it]


In [None]:

from join_optimizer.lotus.evaluate import evaluate_filter

# Score the binary-search cascade against the full-LLM reference result.
metrics, FP, FN = evaluate_filter(dataset_df=df_res_llm, filtered_df=df_res_binary_s)
print(metrics)

# Sampling

In [None]:
# Cascade configuration for the sampling-based variant (no top-k binary search).
cascade_args = CascadeArgs(
    proxy_model=ProxyModel.EMBEDDING_MODEL,
    # Quality targets and failure budget.
    recall_target=0.95,
    precision_target=0.9,
    failure_probability=0.1,
    # Sampling / calibration settings.
    sampling_percentage=0.1,
    cascade_IS_weight=1,
    cascade_num_calibration_quantiles=100,
    cascade_IS_random_seed=114,
)

df_res_lotus = df.sem_filter(
    prompt,
    col_li=["image_url"],
    cascade_args=cascade_args,
    return_stats=False,
    find_top_k=False,
)


In [None]:
# Score the sampling-based cascade against the full-LLM reference result.
metrics, FP, FN = evaluate_filter(dataset_df=df_res_llm, filtered_df=df_res_lotus)
print(metrics)

# Caption Search BLIP


In [None]:
# Attach the BLIP caption database to the "image" column, load it, then search the captions
# with the raw prompt.
# NOTE(review): presumably load() is what adds the caption column used later — confirm.
df = df.sem_captions_index.attach_index("image", index_dir=DATASET_CAPTION_DB_BLIP)
df = df.sem_captions_index.load("image")
df_res_blip = df.sem_captions_index.search(prompt, "image")

In [None]:
# Score the BLIP caption search (raw prompt) against the full-LLM reference result.
metrics, FP, FN = evaluate_filter(dataset_df=df_res_llm, filtered_df=df_res_blip)
print(metrics)


### With prompt augmentation

In [None]:
def augment_prompt(prompt, augmentation_prompt):
    """Rewrite ``prompt`` through ``sem_map`` and sanitise the answer.

    Args:
        prompt: The user query; it is exposed to the instruction as the ``query`` column.
        augmentation_prompt: The ``sem_map`` instruction describing how to rewrite the query.

    Returns:
        The first mapped answer, with apostrophes and hyphens replaced by spaces.
    """
    single_row = pd.DataFrame({"query": [prompt]})
    mapped = single_row.sem_map(augmentation_prompt, suffix="augmented_prompt")
    answer = mapped["augmented_prompt"][0]
    # Same order as before: apostrophes first, then hyphens.
    for unwanted in ("'", "-"):
        answer = answer.replace(unwanted, " ")
    return answer


In [None]:
# Instruction used by augment_prompt to turn the plain prompt into an FTS5 MATCH expression.
# An earlier keyword-expansion prompt used to be assigned here and was overwritten on the next
# statement; it was dead code, referenced a {prompt} column that augment_prompt never creates,
# and contained an invalid escape sequence, so it has been removed.
prompt_augmentation_prompt = """
You will receive a plain-language search {query} and must return a SINGLE valid SQLite FTS5 MATCH expression (the right-hand side of `... MATCH <expr>`). Return ONLY the expression, with no quotes around the whole thing, no SQL, no code fences, and no explanations.




REQUIREMENTS
1) Output must be valid FTS5 boolean syntax using ONLY: parentheses `()`, `AND`, `OR`, `NOT`, double-quoted phrases, and (optionally) `NEAR` when allowed below. Do NOT use field qualifiers, weights, or other SQL.
2) Group synonyms/near-lexicon with OR inside parentheses. Use AND between concept buckets.
   - Example shape: `(concept1_a OR concept1_b OR "concept1 phrase") AND (concept2_a OR concept2_b) ...`
3) Expand the user_query into 2–5 concept buckets (meaningful facets like style, color/tone, item types, descriptors, etc.). Inside each bucket, include common synonyms, close lexical variants, and singular/plural irregulars. The database already handles case, diacritics, and stemming—only add explicit variants when helpful (e.g., "tuxedo OR tuxedos", "black-tie OR \"black tie\"").
4) If must_include is provided, ensure each term/phrase is present by adding extra AND groups for them (quoted as needed).
5) If exclude is provided, append `AND NOT (...)` with OR-joined terms/phrases to filter them out.
6) Phrases must use double quotes (e.g., "black tie"). Do NOT wrap the entire output in quotes.
7) Avoid `*` wildcards unless the input explicitly asks for prefix search.
8) Proximity:
   - If require_proximity = true, use NEAR **only in properly nested binary form** and at most to tie TWO buckets: `((bucketA) NEAR (bucketB)) AND (bucketC) ...`. Never chain `A NEAR B NEAR C` without nesting.
   - If require_proximity = false (default), do NOT use NEAR.
9) Keep the expression concise (<1000 characters).

OUTPUT
- Only the MATCH expression, For example:
  `(elegant OR upscale OR formal) AND ("black tie" OR "black-tie" OR dressy) AND (suit OR suits OR tuxedo OR tuxedos OR dress OR dresses OR gown OR gowns OR tie OR ties OR "bow tie" OR "bow-tie" OR blazer OR blazers OR jacket OR jackets OR trousers OR pants OR slacks OR skirt OR skirts) AND (dark OR black OR charcoal OR onyx OR midnight OR navy OR indigo OR burgundy OR maroon)`

"""

# Generate the augmented query once; it is reused for both caption databases.
augmented_prompt = augment_prompt(prompt, prompt_augmentation_prompt)
print(augmented_prompt)

In [None]:
# Same BLIP caption search as before, but driven by the LM-augmented query instead of the raw prompt.
df_res_blip_augmented = df.sem_captions_index.search(augmented_prompt, "image")


In [None]:
# Score the *augmented* BLIP caption search against the full-LLM reference result.
# (Previously this cell re-evaluated df_res_blip, so the augmented run was never measured.)
metrics, FP, FN = evaluate_filter(
    dataset_df=df_res_llm,
    filtered_df=df_res_blip_augmented,
)
print(metrics)


# Caption Search INSTRUCTBLIP

In [None]:
# Switch the "image" column to the InstructBLIP caption database, load it, then search the
# captions with the raw prompt.
# NOTE(review): presumably this replaces the BLIP captions attached earlier — confirm.
df = df.sem_captions_index.attach_index("image", index_dir=DATASET_CAPTION_DB_INSTRUCTBLIP)
df = df.sem_captions_index.load("image")
df_res_instructblip = df.sem_captions_index.search(prompt, "image")

In [None]:
# Score the InstructBLIP caption search (raw prompt) against the full-LLM reference result.
metrics, FP, FN = evaluate_filter(dataset_df=df_res_llm, filtered_df=df_res_instructblip)
print(metrics)


### Augmented

In [None]:
# InstructBLIP caption search driven by the LM-augmented query instead of the raw prompt.
df_res_instructblip_augmented = df.sem_captions_index.search(augmented_prompt, "image")

In [None]:
# Score the augmented InstructBLIP caption search against the full-LLM reference result.
metrics, FP, FN = evaluate_filter(dataset_df=df_res_llm, filtered_df=df_res_instructblip_augmented)
print(metrics)


### Pass through small model

In [45]:
# Run the filter over the text captions ("image_cap") instead of the images themselves.
# NOTE(review): the heading says "small model", but no LM switch is visible in this notebook;
# lotus.settings was configured with lm=gpt_4o — confirm which model actually answers here.
# NOTE(review): "image_cap" is presumably added by sem_captions_index.load("image") — confirm.
df_res_captions_small_model = df.sem_filter(
    prompt, col_li=["image_cap"])

Filtering: 100%|██████████ 1000/1000 LM calls [00:20<00:00, 47.99it/s]


In [46]:
# Score the caption-based filter against the full-LLM reference result.
metrics, FP, FN = evaluate_filter(dataset_df=df_res_llm, filtered_df=df_res_captions_small_model)
print(metrics)


{'TP': 11, 'FP': 3, 'FN': 24, 'precision': 0.7857142857142857, 'recall': 0.3142857142857143, 'f1': 0.44897959183673464}


##### FN example

"The image features a woman wearing a green dress and a black dress. She is posing in front of a white background, with her hands placed on her hips. The woman is holding a handbag in her right hand, while her left hand is positioned closer to the camera. The handbag can be seen"

dark formal clothes ? yes or no

Answer:

No — the description does not suggest **dark formal clothes**.

* The **green dress** is not typically considered dark formal attire.
* The mention of a **black dress** could be, but since the description is ambiguous (she is said to be wearing *both* a green and a black dress), it’s unclear whether the outfit is actually formal.
* The **context** (posing with a handbag, hands on hips, white background) suggests more of a **fashion photo shoot** rather than formal wear.

✅ So the answer is **No** — this does not clearly depict dark formal clothes.

http://assets.myntassets.com/v1/images/style/properties/Femella-Women-Green---Black-Dress_599cf4fdf7051ee5aed2629240ecdc5b_images.jpg