In [47]:
import duckdb, pandas as pd, os
import palimpzest as pz
from dotenv import load_dotenv
load_dotenv(override=False)
import palimpzest.core.elements.records

# Fraction of the styles table to sample; multiplied by 100 where the
# query needs a percentage.
sample_size = 0.002

# Parquet inputs, given relative to the notebook's working directory.
parquet_path, details_path = (
    "../join_optimizer/fashion_product_images/styles.parquet",
    "../join_optimizer/fashion_product_images/styles_details.parquet",
)

In [48]:

# Build the SQL first, then execute it: sample the styles table, join each
# sampled row to its details record on `id`, and return the selected columns
# ordered by product id.
# NOTE(review): per the DuckDB SAMPLE syntax, `(reservoir, 80)` should mean
# reservoir sampling with 80 as the seed — confirm against the DuckDB docs.
sample_join_sql = f"""
WITH images AS (
  SELECT * FROM parquet_scan('{parquet_path}')
  USING SAMPLE {sample_size*100} PERCENT (reservoir, 80)
)
SELECT
  images.id,
  images.subcategory,
  images.articletype,
  images.basecolour,
  details.price,
  images.productDisplayName,
  styleimages.default.imageURL AS image_url
FROM images, parquet_scan('{details_path}') details
WHERE images.id = details.id
ORDER BY images.id
"""

# Materialize the query result as a pandas DataFrame.
df = duckdb.query(sample_join_sql).to_df()

In [49]:
# Field declarations passed to the reader's base class. Each entry carries the
# field's name, its Python type, and a short description.
schema = [
    {"name": field_name, "type": field_type, "desc": field_desc}
    for field_name, field_type, field_desc in (
        ("id", str, "product id"),
        ("productDisplayName", str, "display name"),
        ("price", float, "price"),
        ("image_url", str, "remote image url"),
    )
]

class DFReader(pz.DataReader):
    """Serve the rows of a pandas DataFrame as dict records.

    Every record has the keys `id`, `productDisplayName`, `price` and
    `image_url`, matching the names declared in the module-level `schema`.
    """

    def __init__(self, df):
        """Register `schema` with the base reader and store the frame.

        Args:
            df: DataFrame providing the `id`, `productDisplayName`, `price`
                and `image_url` columns read by `__getitem__`.
        """
        super().__init__(schema=schema)
        # Rows are fetched by position, so normalize to a fresh 0..n-1 index.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, i):
        """Return the record at position `i`.

        Missing values (None / NaN) in `productDisplayName`, `price` and
        `image_url` are reported as None. Without this, `str()` would turn a
        missing name into the literal text "nan"/"None", and a missing URL
        would leak through as a float NaN in a field declared as `str`.

        Args:
            i: Positional row index, as accepted by `DataFrame.iloc`.

        Returns:
            dict with keys `id` (str), `productDisplayName` (str or None),
            `price` (float or None) and `image_url` (str or None).

        Raises:
            IndexError: if `i` is out of range (raised by `DataFrame.iloc`).
        """
        row = self.df.iloc[i]
        name = row["productDisplayName"]
        price = row["price"]
        url = row["image_url"]
        return {
            "id": str(row["id"]),
            "productDisplayName": str(name) if pd.notna(name) else None,
            "price": float(price) if pd.notna(price) else None,
            "image_url": str(url) if pd.notna(url) else None,
        }

# Wrap the sampled DataFrame in the reader, then build the pz Dataset on it.
reader = DFReader(df)
ds = pz.Dataset(reader)


In [50]:
# Natural-language condition for the semantic filter, assembled from its
# sentence fragments into a single string.
predicate = "".join(
    [
        "Keep rows where the product in the image (from productDisplayName) ",
        "is topwear. ",
        "If multiple products appear, decide by the most prominent one.",
    ]
)



In [55]:
# Columns to pull out of the filter output.
result_columns = ["id", "productDisplayName", "price", "image_url"]

# Apply the semantic filter; only `productDisplayName` is declared as a
# dependency of the predicate.
filtered = ds.sem_filter(predicate, depends_on=["productDisplayName"])
out = filtered.run(min_time=True)  # config optional; defaults are fine for a quick pass
result = out.to_df(cols=result_columns)

# Leave the preview as the cell's last expression so the notebook renders it
# as a table rather than the plain-text dump that print() produces.
result.head()


Output()




Total time: 14.75s
Total cost: $0.0045


KeyboardInterrupt: 