In [8]:
import duckdb
import pandas as pd
import os
from lotus.dtype_extensions import ImageArray

# Input locations for the fashion-product dataset, relative to the working directory.
# NOTE(review): `parquet_path_sampeled` currently points at the same file as
# `parquet_path` and is not referenced by the query below.
parquet_path_sampeled = 'fashion_product_images_001/styles.parquet'
parquet_path = 'fashion_product_images_001/styles.parquet'
details_path = 'fashion_product_images_001/styles_details.parquet'
# Sampling fraction; only interpolated into the commented-out SAMPLE clause of the query.
sample_size = 0.001

# Join the style rows with their detail rows on `id` and keep 50 products.
# The ORDER BY makes the LIMIT pick the same 50 rows on every run; without it
# the selected rows are not guaranteed to be stable between executions.
df = duckdb.query(f"""
with images as (
    SELECT *
    FROM parquet_scan('{parquet_path}')
    -- USING SAMPLE {sample_size * 100} PERCENT (reservoir, 80)
    )
    select
        images.id,
        images.subcategory,
        images.articletype,
        images.basecolour,
        details.price,
        images.productDisplayName,
        styleimages.default.imageURL
    from images
    join parquet_scan('{details_path}') details
      on images.id = details.id
    -- where details.price < 1000
    order by images.id
    limit 50
""").to_df()



In [None]:
# Build an image-centric frame (df2) keyed by product id, then attach its
# image columns to the tabular query result (df).
image_file_names = df["id"]
image_URLs = df["imageURL"]
image_paths = [
    os.path.join("fashion_product_images/images_resized", str(image) + ".jpg")
    for image in image_file_names
]

df2 = pd.DataFrame({
    "image": ImageArray(image_URLs),
    "label": image_file_names,
    "image_path": image_paths,
    "image_URLs": image_URLs,
    "articleType": df["articleType"],
    "baseColour": df["baseColour"],
})

# df2 repeats articleType/baseColour from df; merging them as-is would yield
# duplicated `_x`/`_y` suffixed columns, so only the image-specific columns
# are merged in. df2 itself is left untouched for the later cells.
merged_df = pd.merge(
    df,
    df2.drop(columns=["articleType", "baseColour"]),
    left_on="id",
    right_on="label",
)
merged_df


In [None]:
import pandas as pd

import lotus
from lotus.models import LM, SentenceTransformersRM
from lotus.types import CascadeArgs
from lotus.vector_store import FaissVS
# Register the language model, the embedding model and the vector store
# with lotus so the semantic operators below can use them.
lm = LM(model="gemini/gemini-2.0-flash-lite")
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
vs = FaissVS()
lotus.settings.configure(lm=lm, rm=rm, vs=vs)

# Usage is printed before and after the filter so the two reports can be compared.
lm.print_total_usage()

# Image-based filter: the prompt is evaluated against the `image` column of df2.
filtered_df2 = df2.sem_filter("the content  of {image} shows a white t-shirt")

lm.print_total_usage()


In [None]:
# Same image filter as on df2, but reading the images from the
# "fashion_product_images/images" directory and tracked on a separate LM
# instance (lm_2) so its usage is reported on its own.
#
# The previous `if str(image)[10] == 1` filter was removed: it compared a
# one-character string with the int 1 (never true), raised IndexError for ids
# shorter than 11 characters, and any filtering of the paths alone would leave
# `image_paths_big` shorter than `image_file_names`, which the DataFrame
# constructor rejects. One path is now built per id so the columns stay aligned.
image_paths_big = [
    os.path.join("fashion_product_images/images", str(image) + ".jpg")
    for image in image_file_names
]
df2_big = pd.DataFrame({
    "image": ImageArray(image_paths_big),
    "label": image_file_names,
    "image_path": image_paths,
})

lm_2 = LM(model="gemini/gemini-2.0-flash-lite")
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
vs = FaissVS()
# NOTE: from here on lotus uses lm_2 until the settings are configured again.
lotus.settings.configure(lm=lm_2, rm=rm, vs=vs)

lm_2.print_total_usage()

filtered_df2_big = df2_big.sem_filter("the content  of {image} shows a white t-shirt")

lm_2.print_total_usage()


In [None]:
# Display the rows of df2 that the image-based sem_filter returned.
filtered_df2


In [None]:
# Text-based filter over the tabular columns of df.
# The usage report below is taken from `lm`, so make sure `lm` is the model
# lotus actually calls: an earlier cell switches the configured model to lm_2,
# in which case the calls made here would not show up in lm's totals.
lotus.settings.configure(lm=lm)

filtered_df = df.sem_filter(
    "The {articleType}, {baseColour} and {productDisplayName} might show a t-shirt that might be white"
)
lm.print_total_usage()



In [None]:
# Display the rows of df that the text-based sem_filter returned.
filtered_df


In [None]:
# Semantic join: pair each text-filtered product row with the image-filtered
# rows whose image matches the product's display name.
expr = "Does {productDisplayName} exactly and precisely match the {image}?"

# NOTE(review): cascade_args is built but not passed to sem_join below; pass
# `cascade_args=cascade_args` to the call if the cascade variant is wanted.
cascade_args = CascadeArgs(recall_target=0.8, precision_target=0.8)

join_output = filtered_df.sem_join(filtered_df2, expr, return_stats=True, strategy="zs-cot")
# return_stats=True is requested, so the call may hand back (frame, stats)
# rather than a bare frame. Unpack defensively so `res` is always the joined
# DataFrame that the following cells operate on.
if isinstance(join_output, tuple):
    res, stats = join_output
else:
    res, stats = join_output, None

# Report the sizes of the frames that were actually joined (the filtered ones).
print(f"Joined {filtered_df.shape[0]} rows from df1 with {filtered_df2.shape[0]} rows from df2")
# print(f"    Join cascade took {stats['join_resolved_by_large_model']} LM calls")
# print(f"    Helper resolved {stats['join_resolved_by_helper_model']} LM calls")
# print(f"Join cascade used {stats['total_LM_calls']} LM calls in total")
# print(f"Naive join would require {df.shape[0]*df2.shape[0]} LM calls")
res

In [None]:
# Print the usage totals tracked by `lm` after the semantic join.
lm.print_total_usage()


In [None]:
# Rank the joined rows within each productDisplayName group and keep the
# top-1 row per group; the stats returned alongside are kept in `stats`.
expr = "given {productDisplayName}, which {image} does more exactly and precisely match?"

ranked, stats = res.sem_topk(expr, K=1, group_by=["productDisplayName"], method="quick", return_stats=True)
ranked

In [None]:
# Print the usage totals tracked by `lm` after the top-k ranking.
lm.print_total_usage()


# Independent


In [None]:
from lotus.vector_store import FaissVS
from lotus.models import SentenceTransformersRM
import lotus

# Switch the retrieval model to CLIP, which works for both text and images,
# and pair it with a fresh FAISS vector store.
rm = SentenceTransformersRM(model="clip-ViT-B-32")
lotus.settings.configure(rm=rm, vs=FaissVS())

# Index the `image` column; the index is stored under the "image_index" directory.
merged_df = merged_df.sem_index("image", index_dir="image_index")


In [None]:
# Query the indexed `image` column with the text "watch" and keep the
# five best hits together with their scores.
cands = merged_df.sem_search(
    "image",
    "watch",
    K=5,
    return_scores=True,
)


In [None]:
# Display the search hits returned for the "watch" query.
cands