In [1]:
import os
from typing import Any, Optional

import duckdb
import pandas as pd
from dotenv import load_dotenv

import lotus
from join_optimizer.lotus.evaluate import evaluate_filter
from lotus.dtype_extensions import ImageArray
from lotus.fts_store.db_fts_store import SQLiteFTSStore
from lotus.models import LM, SentenceTransformersRM
from lotus.types import BayesStoppingArgs
from lotus.types import CascadeArgs, ProxyModel
from lotus.vector_store import FaissVS

# Pull variables from a local .env file into the process environment.
load_dotenv()

# When True the index-building cell is skipped and the notebook relies on an
# index that already exists on disk.
LOAD_INDEX = True

# Root directory of the dataset; every other path below is derived from it.
OFF_DATASET_DIR = os.getenv("OFF_DATASET_DIR")
if OFF_DATASET_DIR is None:
    # Fail here with an actionable message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise on the next statement.
    raise RuntimeError(
        "Environment variable OFF_DATASET_DIR is not set; "
        "export it or add it to the .env file read by load_dotenv()."
    )

OFF_PARQUET = os.path.join(OFF_DATASET_DIR, "products.parquet")
OFF_IMAGES_DIR = os.path.join(OFF_DATASET_DIR, "images")
DATASET_CAPTION_DB_BLIP = os.path.join(
    OFF_DATASET_DIR, "off_uk_top2000_with_images_caps_blip-image-captioning-large.db"
)
DATASET_CAPTION_DB_INSTRUCTBLIP = os.path.join(
    OFF_DATASET_DIR, "off_uk_top2000_with_images_caps_instructblip-flan-t5-xl.db"
)
DATASET_CAPTION_DB_GPT_5_NANO = os.path.join(OFF_DATASET_DIR, "llm.db")

# method name -> metrics/usage dict; filled in by add_stats().
stats = {}

sample_size_percentage = 100
seed = 80
# Seeded reservoir sample of the parquet file, ordered by product code.
df = duckdb.query(f"""
    SELECT *
    FROM parquet_scan('{OFF_PARQUET}')

    USING SAMPLE {sample_size_percentage} PERCENT (reservoir, {seed})
    ORDER BY code ASC

""").to_df()

# "image": path of the local JPEG named after the product code.
df["image"] = ImageArray(df["code"].apply(lambda code: os.path.join(OFF_IMAGES_DIR, f"{code}.jpg")))
# "image_url": the front-image URL exactly as stored in the dataset.
df["image_url"] = ImageArray(df["image_front_url"])


  from tqdm.autonotebook import tqdm, trange


#### Creating the index

In [2]:
# Candidate language models. Several of them are not referenced by the other
# cells visible in this notebook; they are kept so the model can be swapped quickly.
gpt_5_nano = LM("gpt-5-nano")
gpt_5_mini = LM("gpt-5-mini")

gpt_5_____sure = LM("gpt-5")

gpt_4o_mini = LM("gpt-4o-mini")
gpt_4o = LM("gpt-4o")

# Model whose usage counters add_stats() reads and resets. It is also the model
# handed to lotus.settings below, so the reported cost always belongs to the
# model that actually ran the filters.
filter_llm = gpt_5_mini

# CLIP embedding model – works for both text & image
rm = SentenceTransformersRM(model="clip-ViT-L-14", max_batch_size=32)

lotus.settings.configure(lm=filter_llm, helper_lm=gpt_5_nano, rm=rm, vs=FaissVS(), cs=SQLiteFTSStore())

2025-09-06 12:39:33,846 - INFO - Load pretrained SentenceTransformer: clip-ViT-L-14
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
# Build the semantic index for the "image" column only when an existing index
# is not being reused (LOAD_INDEX is False).
if not LOAD_INDEX:
    df = df.sem_index("image", index_dir=f"{OFF_DATASET_DIR}/image{sample_size_percentage}_index")



In [4]:
# Attach the on-disk index directory to both image columns.
# NOTE(review): "image_url" is pointed at the same index_dir that is built for
# "image" — presumably intentional because both columns describe the same
# product pictures; confirm.
df = df.load_sem_index("image", index_dir=f"{OFF_DATASET_DIR}/image{sample_size_percentage}_index")
df = df.load_sem_index("image_url", index_dir=f"{OFF_DATASET_DIR}/image{sample_size_percentage}_index")

In [5]:
# Keep a seeded random subset of 100 rows for the experiments below.
# NOTE: `df` is overwritten, so re-running this cell on its own samples from the
# already-reduced frame rather than from the full dataset.
df = df.sample(n=100, random_state=seed)


# Prompt

In [6]:
# Natural-language predicate used by the filtering and search cells below.
prompt = "sugar free dairy product"

# Stats helper

In [7]:
def add_stats(method: str, _metrics: Optional[dict[str, Any]] = None):
    """Store the metrics and LLM usage of one method in the global ``stats`` dict.

    Args:
        method: Key under which the entry is stored in ``stats``.
        _metrics: Metrics to store, e.g. the dict returned by ``evaluate_filter``.
            ``None`` is treated as an empty dict. The mapping is copied, so the
            caller's object is not modified.

    Side effects:
        Replaces ``stats[method]`` with the copied metrics plus the
        ``"Virtual Cost"`` and ``"Virtual Tokens"`` values read from
        ``filter_llm.stats.virtual_usage``, then calls ``filter_llm.reset_stats()``.
    """
    # Copy so that adding the usage keys never leaks into the caller's dict.
    entry = dict(_metrics) if _metrics is not None else {}
    usage = filter_llm.stats.virtual_usage
    entry["Virtual Cost"] = usage.total_cost
    entry["Virtual Tokens"] = usage.total_tokens
    stats[method] = entry
    filter_llm.reset_stats()


# Full LLM calls

In [8]:
# Baseline: semantic filter on the "image" column with no cascade arguments.
method_name = "full LLM calls"

df_resd_llm = df.sem_filter(prompt, col_li=["image"], return_stats=False)

# The baseline result is passed as both dataset_df and filtered_df here; the
# later evaluation cells reuse df_resd_llm as dataset_df for the other methods.
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_llm,
    id_column='code'
)
# Store metrics together with the LLM usage accumulated so far, then show them.
add_stats(method_name, metrics)
print(stats[method_name])

Filtering: 100%|██████████ 100/100 LM calls [00:11<00:00,  8.92it/s]

{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.04542360000000001, 'Virtual Tokens': 94472}





# Binary search filter

In [9]:
# Cascade filter with the embedding model as proxy and find_top_k enabled.
method_name = "binary search filter"

cascade_args = CascadeArgs(
    recall_target=0.9,
    precision_target=0.9,
    sampling_percentage=0.1,
    proxy_model=ProxyModel.EMBEDDING_MODEL,
)
# NOTE(review): this is the only sem_filter call with return_stats=True, yet the
# result is later handed to evaluate_filter like the plain results of the other
# cells. Upstream LOTUS documents a (DataFrame, stats) tuple for
# return_stats=True — TODO confirm what this call returns with find_top_k=True.
df_resd_binary_s = df.sem_filter(prompt, col_li=["image"], cascade_args=cascade_args,
                                 return_stats=True, find_top_k=True)


Filtering: 100%|██████████ 1/1 LM calls [00:04<00:00,  4.49s/it]


In [10]:
# Evaluate the binary-search result with df_resd_llm as dataset_df, then record
# metrics plus LLM usage under method_name and print the entry.
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_binary_s,
    id_column='code'
)
add_stats(method_name, metrics)
print(stats[method_name])

{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.00058775, 'Virtual Tokens': 1371}


# Sampling

In [11]:
# Cascade filter with the embedding model as proxy and find_top_k disabled.
# NOTE: the `cascade_args` defined here is also the object the Bayesian-stopping
# cell further down passes to sem_filter.
method_name = "Lotus sampling"
cascade_args = CascadeArgs(
    recall_target=0.95,
    precision_target=0.9,
    sampling_percentage=0.1,
    proxy_model=ProxyModel.EMBEDDING_MODEL,
    cascade_IS_weight=1,
    cascade_num_calibration_quantiles=100,
    failure_probability=0.1,
    cascade_IS_random_seed=114,
)
# Reset the usage counters before the run and print them right after it.
filter_llm.reset_stats()
df_resd_lotus = df.sem_filter(prompt, col_li=["image"], cascade_args=cascade_args, return_stats=False,
                              find_top_k=False)
filter_llm.print_total_usage()



Running oracle for threshold learning: 100%|██████████ 10/10 LM calls [00:04<00:00,  2.23it/s]
2025-09-06 12:39:56,047 - INFO - Sample recall: 0.0
2025-09-06 12:39:56,048 - INFO - Sample precision: 0.0
2025-09-06 12:39:56,048 - INFO - Learned cascade thresholds: (1.0, 0)
2025-09-06 12:39:56,048 - INFO - Num routed to smaller model: 0
Running predicate evals with oracle LM: 100%|██████████ 100/100 LM calls [00:16<00:00,  6.20it/s]


=== Usage Statistics ===
Virtual  = Total usage if no caching was used
Physical = Actual usage with caching applied

Virtual Cost:     $0.049806
Physical Cost:    $0.049806
Virtual Tokens:   103,522
Physical Tokens:  103,522
Cache Hits:       0






In [12]:
# Evaluate the cascade result with df_resd_llm as dataset_df, then record
# metrics plus LLM usage under method_name and print the entry.
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_lotus,
    id_column='code'
)
add_stats(method_name, metrics)
print(stats[method_name])

{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.04980649999999999, 'Virtual Tokens': 103522}


# Caption Search BLIP


In [13]:
# Attach the BLIP caption database to the "image" column, load it, and search
# it with the raw prompt.
# NOTE(review): sem_captions_index is not defined in this notebook; what
# attach_index/load add to `df` cannot be seen from here — confirm.
method_name = "Caption blip"
df = df.sem_captions_index.attach_index("image", index_dir=DATASET_CAPTION_DB_BLIP)
df = df.sem_captions_index.load("image")
df_resd_blip = df.sem_captions_index.search(prompt, "image")

In [14]:
# Evaluate the BLIP caption search with df_resd_llm as dataset_df, then record
# metrics plus LLM usage under method_name and print the entry.
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_blip,
    id_column='code'
)
add_stats(method_name, metrics)
print(stats[method_name])


{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.0, 'Virtual Tokens': 0}


### With prompt augmentation

In [15]:
def augment_prompt(prompt, augmentation_prompt):
    """Rewrite ``prompt`` through ``sem_map`` and return the cleaned-up answer.

    The prompt is wrapped in a one-row DataFrame (column ``query``) and mapped
    with ``augmentation_prompt``. The value found at label ``0`` of the
    ``augmented_prompt`` column is returned with every apostrophe and hyphen
    replaced by a space.
    """
    single_row = pd.DataFrame({"query": [prompt]})
    mapped = single_row.sem_map(augmentation_prompt, suffix="augmented_prompt")
    answer = mapped["augmented_prompt"][0]
    # Blank out the two characters, apostrophe first and hyphen second.
    for unwanted in ("'", "-"):
        answer = answer.replace(unwanted, " ")
    return answer


In [16]:
# Instruction given to sem_map: turn the plain-language query into one SQLite
# FTS5 MATCH expression. `{query}` refers to the "query" column of the one-row
# DataFrame built inside augment_prompt().
# NOTE: the numbering of the requirements jumps from 3 to 6 and item 3 says
# "user_query" rather than "{query}"; the text is left untouched because it is
# what the model receives.
prompt_augmentation_prompt = """
You will receive a plain-language search {query} and must return a SINGLE valid SQLite FTS5 MATCH expression (the right-hand side of `... MATCH <expr>`). Return ONLY the expression, with no quotes around the whole thing, no SQL, no code fences, and no explanations.

REQUIREMENTS
1) Output must be valid FTS5 boolean syntax using ONLY: parentheses `()`, `AND`, `OR`, `NOT`, double-quoted phrases, and (optionally) `NEAR` when allowed below. Do NOT use field qualifiers, weights, or other SQL.
2) Group synonyms/near-lexicon with OR inside parentheses. Use AND between concept buckets.
   - Example shape: `(concept1_a OR concept1_b OR "concept1 phrase") AND (concept2_a OR concept2_b) ...`
3) Expand the user_query into 2–5 concept buckets (meaningful facets like style, color/tone, item types, descriptors, etc.). Inside each bucket, include common synonyms, close lexical variants, and singular/plural irregulars. The database already handles case, diacritics, and stemming—only add explicit variants when helpful (e.g., "tuxedo OR tuxedos", "black-tie OR \"black tie\"").
6) Phrases must use double quotes (e.g., "black tie"). Do NOT wrap the entire output in quotes.
7) Avoid `*` wildcards unless the input explicitly asks for prefix search.

OUTPUT
- Only the MATCH expression
"""

# One LLM call; the returned expression is reused by every "augmented" search below.
augmented_prompt = augment_prompt(prompt, prompt_augmentation_prompt)
print(augmented_prompt)

Mapping: 100%|██████████ 1/1 LM calls [00:13<00:00, 13.36s/it]

("sugar free" OR "sugar free" OR sugarfree OR "no sugar" OR "no sugar added" OR "no sugar" OR unsweetened OR sugarless OR "low sugar" OR "reduced sugar") AND (dairy OR "dairy product" OR "dairy products" OR milk OR "milk product" OR cheese OR cheeses OR yogurt OR yoghurt OR yogurts OR yoghurts OR butter OR cream OR "ice cream" OR icecream OR "cream cheese" OR kefir OR buttermilk)





In [17]:
# Same caption search as before, but with the LLM-generated MATCH expression
# instead of the raw prompt.
method_name = "caption blip with prompt augmentation"
df_resd_blip_augmented = df.sem_captions_index.search(augmented_prompt, "image")

In [18]:
# Evaluate the augmented-prompt BLIP search. filtered_df must be the augmented
# result (df_resd_blip_augmented); passing the plain-prompt df_resd_blip would
# only repeat the numbers of the "Caption blip" method.
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_blip_augmented,
    id_column='code'
)
# Record metrics plus LLM usage under method_name and print the entry.
add_stats(method_name, metrics)
print(stats[method_name])


{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.0021237499999999998, 'Virtual Tokens': 1376}


# Caption Search INSTRUCTBLIP

In [19]:
# Swap the attached caption database for the InstructBLIP one, load it, and
# search it with the raw prompt.
# NOTE: `df` is reassigned, so later cells see the InstructBLIP index attached.
method_name = "instruct blip"
df = df.sem_captions_index.attach_index("image", index_dir=DATASET_CAPTION_DB_INSTRUCTBLIP)
df = df.sem_captions_index.load("image")
df_resd_instructblip = df.sem_captions_index.search(prompt, "image")

In [20]:
# Evaluate the InstructBLIP caption search with df_resd_llm as dataset_df, then
# record metrics plus LLM usage under method_name and print the entry.
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_instructblip,
    id_column='code'
)
add_stats(method_name, metrics)
print(stats[method_name])


{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.0, 'Virtual Tokens': 0}


### Augmented

In [21]:
# InstructBLIP caption search driven by the LLM-generated MATCH expression.
method_name = "instruct blip + prompt augmentation"
df_resd_instructblip_augmented = df.sem_captions_index.search(augmented_prompt, "image")

In [22]:
# Evaluate the augmented InstructBLIP search with df_resd_llm as dataset_df,
# then record metrics plus LLM usage under method_name and print the entry.
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_instructblip_augmented,
    id_column='code'
)
add_stats(method_name, metrics)
print(stats[method_name])


{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.0, 'Virtual Tokens': 0}


### Pass through small model

In [23]:
# Run the semantic filter over the "image_cap" column instead of "image".
# NOTE(review): "image_cap" is not created by any code visible in this notebook —
# presumably added by sem_captions_index.load("image"); confirm. When the cells
# run in order, the caption database attached at this point is the InstructBLIP one.
method_name = "call LLM on instruct blip captions"
df_resd_captions_small_model = df.sem_filter(
    prompt, col_li=["image_cap"])


Filtering: 100%|██████████ 100/100 LM calls [00:12<00:00,  7.91it/s]


In [24]:
# Evaluate the caption-based LLM filter with df_resd_llm as dataset_df, then
# record metrics plus LLM usage under method_name and print the entry.
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_captions_small_model,
    id_column='code'
)
add_stats(method_name, metrics)
print(stats[method_name])

{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.03265499999999997, 'Virtual Tokens': 28588}


# gpt-5-nano Captions

In [25]:
# Swap the attached caption database for DATASET_CAPTION_DB_GPT_5_NANO, load
# it, and search it with the raw prompt.
# NOTE: `df` is reassigned, so later cells see this caption index attached.
method_name = "Gpt-5-nano captions"
df = df.sem_captions_index.attach_index("image", index_dir=DATASET_CAPTION_DB_GPT_5_NANO)
df = df.sem_captions_index.load("image")
df_resd_llm_captions = df.sem_captions_index.search(prompt, "image")

In [26]:
# Evaluate the gpt-5-nano caption search with df_resd_llm as dataset_df, then
# record metrics plus LLM usage under method_name and print the entry.
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_llm_captions,
    id_column='code'
)
add_stats(method_name, metrics)
print(stats[method_name])


{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.0, 'Virtual Tokens': 0}


#### Augmented

In [27]:
# Caption search driven by the LLM-generated MATCH expression, followed by its
# evaluation with df_resd_llm as dataset_df.
method_name = "Gpt-5-nano captions + prompt augmentation"
df_resd_llm_captions_augmented = df.sem_captions_index.search(augmented_prompt, "image")
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_llm_captions_augmented,
    id_column='code'
)
# Record metrics plus LLM usage under method_name and print the entry.
add_stats(method_name, metrics)
print(stats[method_name])


{'TP': 0, 'FP': 1, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.0, 'Virtual Tokens': 0}


#### Pass captions through LLM

In [28]:
# Run the semantic filter over the "image_cap" column instead of "image".
# NOTE(review): "image_cap" is presumably populated by the preceding
# sem_captions_index.load("image") call — confirm.
method_name = "pass Gpt-5-nano captions through model"
df_resd_llm_captions_small_model = df.sem_filter(
    prompt, col_li=["image_cap"])

# Evaluate with df_resd_llm as dataset_df, then record metrics plus LLM usage
# under method_name and print the entry.
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_llm_captions_small_model,
    id_column='code'
)
add_stats(method_name, metrics)
print(stats[method_name])

Filtering: 100%|██████████ 100/100 LM calls [00:08<00:00, 11.90it/s]

{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.03495475000000001, 'Virtual Tokens': 34651}





# Bayesian stopping

In [29]:
# Semantic filter with bayesian_scan enabled. Per the scan log printed by this
# cell, scanning continues while P(next block has >= 1 positive) >= delta.
method_name = "rolling beta posterior"
bayes_args = BayesStoppingArgs(
    block_size=40,
    min_depth=20,
    max_depth=None,
    delta=0.9,
    discount=0.9,
    alpha0=10.0,
    beta0=1.0,
    verifier_recall=1.0,
    default=True,
    verbose=True)

# NOTE: `cascade_args` is not defined in this cell; it is whatever an earlier
# cell last assigned (the "Lotus sampling" cell when the notebook runs in order).
df_resd_bayes = df.sem_filter(prompt, col_li=["image"], cascade_args=cascade_args, find_top_k=False, bayesian_scan=True,
                              bayes_stopping_args=bayes_args)
filter_llm.print_total_usage()

[scan] Starting scan_with_bayesian_stopping (no bins)
[scan] n_docs=100, block_size=40, min_depth=20, max_depth=None
[scan] delta=0.9 (continue while P(next block ≥1 positive) ≥ delta)
[scan] discount=0.9, alpha0=10.0, beta0=1.0
[scan] verifier_recall=1.0, default=True

[scan] ── Processing block ─────────────────────────────────────────
[scan] Block indices: [0:40) (size=40)
[scan] Calling sem_filter(...) on this block


Filtering: 100%|██████████ 40/40 LM calls [00:06<00:00,  6.25it/s]


[scan] Discounting prior evidence by factor 0.9
[scan] Before discount | succ=0.000 fail=0.000
[scan] After  discount | succ=0.000 fail=0.000
[scan] sem_filter tallies in this block: pos=0, neg=40
[scan] Updated posteriors    | succ=0.000 fail=40.000
[scan] Look-ahead next block indices: [40:80) (size=40)
[scan] Posterior alpha=10.000, beta=41.000, P(next block ≥1 pos) = 0.998204
[scan] Continuing: 0.998204 ≥ delta=0.9

[scan] ── Processing block ─────────────────────────────────────────
[scan] Block indices: [40:80) (size=40)
[scan] Calling sem_filter(...) on this block


Filtering: 100%|██████████ 40/40 LM calls [00:07<00:00,  5.51it/s]
2025-09-06 12:41:01,561 - INFO - Bayesian scan stopped after 80 of 100 items
2025-09-06 12:41:01,561 - INFO - Found 0 positives


[scan] Discounting prior evidence by factor 0.9
[scan] Before discount | succ=0.000 fail=40.000
[scan] After  discount | succ=0.000 fail=36.000
[scan] sem_filter tallies in this block: pos=0, neg=40
[scan] Updated posteriors    | succ=0.000 fail=76.000
[scan] Look-ahead next block indices: [80:100) (size=20)
[scan] Posterior alpha=10.000, beta=77.000, P(next block ≥1 pos) = 0.888838
[scan] Stopping: 0.888838 < delta=0.9

[scan] ── Final summary ───────────────────────────────────────────
[scan] Scanned stop_index=80 docs out of n=100
[scan] Positives found: 0 | Negatives (scanned prefix): 80
[scan] P(any positive in next block) at stop: 0.888838

=== Usage Statistics ===
Virtual  = Total usage if no caching was used
Physical = Actual usage with caching applied

Virtual Cost:     $0.038481
Physical Cost:    $0.038481
Virtual Tokens:   77,316
Physical Tokens:  77,316
Cache Hits:       0



In [30]:
# Evaluate the Bayesian-stopping result with df_resd_llm as dataset_df, then
# record metrics plus LLM usage under method_name and print the entry.
metrics, FP, FN = evaluate_filter(
    dataset_df=df_resd_llm,
    filtered_df=df_resd_bayes,
    id_column='code'
)
add_stats(method_name, metrics)
print(stats[method_name])


{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.038480999999999994, 'Virtual Tokens': 77316}


In [32]:
# Dump every recorded method name followed by its metrics/usage dict,
# one item per line, in insertion order.
for _method_name, _stats in stats.items():
    print(_method_name, _stats, sep="\n")

full LLM calls
{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.04542360000000001, 'Virtual Tokens': 94472}
binary search filter
{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.00058775, 'Virtual Tokens': 1371}
Lotus sampling
{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.04980649999999999, 'Virtual Tokens': 103522}
Caption blip
{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.0, 'Virtual Tokens': 0}
caption blip with prompt augmentation
{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.0021237499999999998, 'Virtual Tokens': 1376}
instruct blip
{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.0, 'Virtual Tokens': 0}
instruct blip + prompt augmentation
{'TP': 0, 'FP': 0, 'FN': 0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'Virtual Cost': 0.0,