In [8]:
import duckdb
import pandas as pd
import os
from lotus.dtype_extensions import ImageArray

# Locations of the styles parquet file and the per-style details parquet file.
# NOTE(review): `parquet_path_sampeled` holds the same path as `parquet_path`
# and is not referenced in this cell — confirm whether it is still needed.
parquet_path_sampeled = 'fashion_product_images_001/styles.parquet'
parquet_path = 'fashion_product_images_001/styles.parquet'
details_path = 'fashion_product_images_001/styles_details.parquet'
# Sampling fraction; it is multiplied by 100 inside the query to form the
# PERCENT argument of the USING SAMPLE clause.
sample_size = 0.3
# Sample the styles table, join the sample to the details table on `id`, and
# return the selected columns as a pandas DataFrame.
df = duckdb.query(f"""
with images as (
    SELECT *
    FROM parquet_scan('{parquet_path}')
    USING SAMPLE {sample_size * 100} PERCENT (reservoir, 80)
    )
    select
     images.id ,images.subcategory, images.articletype, images.basecolour, details.price, images.productDisplayName, styleimages.default.imageURL
    -- *
    from images, parquet_scan('{details_path}') details
    where images.id = details.id
    -- and details.price <1000

""").to_df()



In [9]:
# Row identifiers and remote image URLs taken from the sampled frame.
image_file_names = df["id"]
image_URLs = df["imageURL"]

# Path of the resized local copy of every image: <resized dir>/<id>.jpg
image_paths = [
    os.path.join("fashion_product_images/images_resized", f"{image}.jpg")
    for image in image_file_names
]

# Image-side frame: the image itself (wrapped in ImageArray from its URL) plus
# the id, local path, URL and the two categorical attributes.
df2 = pd.DataFrame(
    {
        "image": ImageArray(image_URLs),
        "label": image_file_names,
        "image_path": image_paths,
        "image_URLs": image_URLs,
        "articleType": df["articleType"],
        "baseColour": df["baseColour"],
    }
)

# Attach the image-side columns to the metadata rows (id == label), then give
# every column of the merged frame a leading underscore.
merged_df = pd.merge(df, df2, left_on="id", right_on="label")
merged_df.columns = [f"_{col}" for col in merged_df.columns]


In [None]:
# NOTE(review): `x` is not defined in any cell shown here, so this bare
# attribute access looks like leftover scratch input — confirm and delete.
x.p


In [None]:
import pandas as pd

import lotus
from lotus.models import LM, SentenceTransformersRM
from lotus.types import CascadeArgs
from lotus.vector_store import FaissVS
# Language model, retrieval model and vector store registered with lotus for
# the semantic operators used below.
lm = LM(model="gemini/gemini-2.0-flash-lite")
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
vs = FaissVS()
lotus.settings.configure(
    lm=lm,
    rm=rm,
    vs=vs,
)

# Usage snapshot before the filter, for comparison with the one printed after.
lm.print_total_usage()

# Image-based filter over the {image} column of df2.
filtered_df2 = df2.sem_filter("the content  of {image} shows a white t-shirt")

lm.print_total_usage()


In [None]:
# Pick the rows to evaluate on the full-resolution images.  Every column of
# df2_big is derived from this single selection so the columns always have the
# same length and stay row-aligned (the DataFrame constructor rejects
# array-likes of different lengths).
# NOTE(review): the original predicate was `str(image)[10] == 1`.  That compares
# a one-character string with the int 1 (never true) and raises IndexError for
# ids of 10 characters or fewer.  It is kept here as a bounds-safe string
# comparison; confirm which character position / digit was actually intended.
big_rows = [
    (image, resized_path)
    for image, resized_path in zip(image_file_names, image_paths)
    if len(str(image)) > 10 and str(image)[10] == "1"
]
big_labels = [image for image, resized_path in big_rows]

# Full-resolution image path for each selected id.
image_paths_big = [os.path.join("fashion_product_images/images", str(image) + ".jpg") for image in big_labels]

# As in the original cell, `image_path` carries the resized-image path while
# `image` is loaded from the full-resolution path.
df2_big = pd.DataFrame({
    "image": ImageArray(image_paths_big),
    "label": big_labels,
    "image_path": [resized_path for image, resized_path in big_rows],
})

# A separate LM instance for this experiment; configure() makes lm_2 the LM
# that lotus is configured with from this point on.
lm_2 = LM(model="gemini/gemini-2.0-flash-lite")
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
vs = FaissVS()
lotus.settings.configure(lm=lm_2, rm=rm, vs=vs)

# Usage snapshot before and after the full-resolution filter.
lm_2.print_total_usage()

filtered_df2_big = (df2_big
                .sem_filter("the content  of {image} shows a white t-shirt")
                )
lm_2.print_total_usage()


In [None]:
# Show the rows of df2 kept by the image-based sem_filter.
filtered_df2


In [None]:
# Text-only filter: the instruction refers to the articleType, baseColour and
# productDisplayName columns of df instead of the image.
filtered_df = df.sem_filter(
    "The {articleType}, {baseColour} and {productDisplayName} might show a t-shirt that might be white"
)

# Report usage from the LM lotus is currently configured with.  An earlier cell
# reconfigures lotus with a different LM instance (lm_2); printing `lm` after
# that would show totals that do not include the filter that just ran.
lotus.settings.lm.print_total_usage()



In [None]:
# Show the rows of df kept by the text-based sem_filter.
filtered_df


In [None]:
# Join predicate: {productDisplayName} comes from filtered_df, {image} from
# filtered_df2.
expr = "Does {productDisplayName} exactly and precisely match the {image}?"


# NOTE(review): cascade_args is built here but is not passed to sem_join below,
# so this join runs without it — confirm whether a cascade join was intended.
cascade_args = CascadeArgs(recall_target=0.8, precision_target=0.8)

# With return_stats=True the operator returns a (joined_frame, stats) pair.
# Unpack it so that `res` is the joined DataFrame; the following cell calls
# res.sem_topk(...), which does not exist on a tuple.
res, stats = filtered_df.sem_join(filtered_df2, expr, return_stats=True, strategy="zs-cot")


# The left side of the join is filtered_df, so report its row count (the
# original message printed the size of the unfiltered df).
print(f"Joined {filtered_df.shape[0]} rows from df1 with {filtered_df2.shape[0]} rows from df2")
# print(f"    Join cascade took {stats['join_resolved_by_large_model']} LM calls")
# print(f"    Helper resolved {stats['join_resolved_by_helper_model']} LM calls")
# print(f"Join cascade used {stats['total_LM_calls']} LM calls in total")
# print(f"Naive join would require {df.shape[0]*df2.shape[0]} LM calls")
res

In [None]:
# Print usage from the LM lotus is currently configured with; `lm` is stale
# once an earlier cell has reconfigured lotus with another LM instance.
lotus.settings.lm.print_total_usage()


In [None]:
# Ranking instruction used to pick the best image per product name.
expr = "given {productDisplayName}, which {image} does more exactly and precisely match?"

# Keep the single top-ranked row (K=1) inside each productDisplayName group and
# also collect the operator's statistics.
ranked, stats = res.sem_topk(expr, K=1, group_by=["productDisplayName"], method="quick", return_stats=True)
ranked

In [None]:
# Print usage from the LM lotus is currently configured with; `lm` is stale
# once an earlier cell has reconfigured lotus with another LM instance.
lotus.settings.lm.print_total_usage()


# SEM INDEX


#### Creating the index

In [20]:
from lotus.vector_store import FaissVS
import lotus
from lotus.models import LM, SentenceTransformersRM

# gpt-4o is registered as the main LM and gpt-4o-mini as the helper LM below.
gpt_4o_mini = LM("gpt-4o-mini")
gpt_4o = LM("gpt-4o")

# CLIP embedding model – works for both text & image
# rm  = SentenceTransformersRM(model="clip-ViT-B-32")
rm = SentenceTransformersRM(
    model="clip-ViT-L-14",
    max_batch_size=32,
)

lotus.settings.configure(
    lm=gpt_4o,
    helper_lm=gpt_4o_mini,
    rm=rm,
    vs=FaissVS(),
)

# Index the image column of merged_df and the product-name column of df, each
# under its own index directory.  Both names are rebound to the indexed frames.
merged_df = merged_df.sem_index("_image", index_dir="image_index")
df = df.sem_index("productDisplayName", index_dir="productDisplayName_index")



2025-08-08 14:31:33,100 - INFO - Use pytorch device_name: cpu
2025-08-08 14:31:33,102 - INFO - Load pretrained SentenceTransformer: clip-ViT-L-14


##### Binary search filter

In [21]:
from lotus.types import CascadeArgs, ProxyModel

# Cascade configuration for the filter below: recall/precision targets, the
# sampling percentage, and the embedding model as proxy.
cascade_args = CascadeArgs(recall_target=0.9, precision_target=0.75, sampling_percentage=0.3, proxy_model=ProxyModel.EMBEDDING_MODEL)

# NOTE(review): the predicate asks for a watch, while the evaluation cells score
# the result against article_type='Tshirts' / base_colour='White' — confirm
# which of the two is the intended target.
merged_df_filtered = merged_df.sem_filter(
    "{_image} is a watch",
    cascade_args=cascade_args,
    return_stats=True,
    find_top_k=True,
)




Filtering: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.60s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.32s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:04<00:00,  4.96s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:02<00:00,  2.74s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:05<00:00,  5.37s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:03<00:00,  3.11s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:04<00:00,  4.67s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:01<00:00,  1.74s/it]
Filtering: 100%|██████████ 1/1 LM calls [00:06<00:00,  6.40s/it]


In [48]:
# Import only the helper this notebook calls; a wildcard import hides where
# names come from and can silently shadow notebook variables.
from join_optimizer.join_optimizer.productDisplayName_index.evaluate import evaluate_filter

# Score the filtered frame against merged_df for the given article type and
# base colour, then print the resulting metrics.
metrics = evaluate_filter(
    merged_df=merged_df,
    filtered_df=merged_df_filtered,
    article_type='Tshirts',
    base_colour='White'
)
print(metrics)

{'TP': 12, 'FP': 3, 'FN': 2, 'precision': 0.8, 'recall': 0.8571428571428571, 'f1': 0.8275862068965518}


#### Full LLM calls

In [24]:
# Same targets and proxy as the previous filter, with a sampling percentage of
# 0.1.
cascade_args = CascadeArgs(recall_target=0.9, precision_target=0.75, sampling_percentage=0.1, proxy_model=ProxyModel.EMBEDDING_MODEL)

# Same predicate as before, this time with find_top_k and return_stats off.
merged_df_filtered_big = merged_df.sem_filter(
    "{_image} is a watch",
    cascade_args=cascade_args,
    return_stats=False,
    find_top_k=False,
)


Running oracle for threshold learning: 100%|██████████ 13/13 LM calls [00:09<00:00,  1.32it/s]
2025-08-09 15:43:47,071 - INFO - Sample recall: 1.0
2025-08-09 15:43:47,073 - INFO - Sample precision: 1.0
2025-08-09 15:43:47,074 - INFO - Learned cascade thresholds: (1.0, 0.1390555500984192)
2025-08-09 15:43:47,075 - INFO - Num routed to smaller model: 45
Running predicate evals with oracle LM: 100%|██████████ 88/88 LM calls [00:11<00:00,  7.99it/s]


In [47]:
# Score merged_df_filtered_big with the same article type / base colour
# arguments used for the other filter result, then print the metrics.
metrics = evaluate_filter(merged_df=merged_df, filtered_df=merged_df_filtered_big, article_type='Tshirts', base_colour='White')
print(metrics)

{'TP': 8, 'FP': 1, 'FN': 6, 'precision': 0.8888888888888888, 'recall': 0.5714285714285714, 'f1': 0.6956521739130435}


#### Joins

In [55]:
# Similarity join: pair each filtered row's _image with its single closest
# (K=1) productDisplayName from df.
res_sim_join = merged_df_filtered.sem_sim_join(
    df,
    left_on='_image',
    right_on='productDisplayName',
    K=1,
    keep_index=True,
)

In [1]:
from lotus.types import CascadeArgs, ProxyModel

# Join predicate: {productDisplayName} is a column of df, {_image} a column of
# merged_df_filtered.
expr = "Does {productDisplayName} exactly and precisely match the {_image}?"

cascade_args = CascadeArgs(recall_target=0.9, precision_target=0.75, sampling_percentage=0.04, proxy_model=ProxyModel.EMBEDDING_MODEL)

# return_stats=True: a later cell reads the joined frame as res[0].
res = merged_df_filtered.sem_join(
    df,
    expr,
    cascade_args=cascade_args,
    return_stats=True,
)

# print(f"Joined {df.shape[0]} rows from df1 with {filtered_df2.shape[0]} rows from df2")
# print(f"    Join cascade took {stats['join_resolved_by_large_model']} LM calls")
# print(f"    Helper resolved {stats['join_resolved_by_helper_model']} LM calls")
# print(f"Join cascade used {stats['total_LM_calls']} LM calls in total")
# print(f"Naive join would require {df.shape[0]*df2.shape[0]} LM calls")


  from tqdm.autonotebook import tqdm, trange


NameError: name 'merged_df_filtered' is not defined

In [None]:
# Scratch assignment `res = 0` removed: it overwrote the sem_join result, and
# the following cell reads that result as `res[0]`, which fails on an int.

In [None]:

# Ranking instruction used to pick the best image per product name.
expr = "given {productDisplayName}, which {_image} does more exactly and precisely match?"

# res[0] is the joined frame.  Keep the single top-ranked row (K=1) inside each
# productDisplayName group and also collect the operator's statistics.
ranked, stats = res[0].sem_topk(expr, K=1, group_by=["productDisplayName"], method="quick", return_stats=True)
ranked


In [58]:
# Keep only the rows where the image-side id (_id) equals the product-side id,
# for the top-k join result and for the similarity join result.
join_topk = ranked.loc[ranked.eval('_id == id')]
sim_1 = res_sim_join.loc[res_sim_join.eval('_id == id')]