In [2]:
!pip install -r req.txt

Collecting pandas
  Downloading pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting numpy
  Downloading numpy-2.3.4-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl (64 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Pillow
  Downloading pillow-12.0.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (7.0 MB)
[2K     [38;2;114;156

In [3]:
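# file: data_prep.py
# Run inside venv: python data_prep.py
# NOTE: the input CSV is fixed by the main(csv_path='products.csv') call at the end
# of this cell; no --csv flag is parsed here even though argparse is imported.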
import argparse
import ast
import hashlib
import json
import os
import re
from io import BytesIO
from pathlib import Path

import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm

def clean_text(s):
    """Return ``s`` as a string with whitespace runs collapsed and ends trimmed.

    Values that ``pd.isna`` reports as missing (None, NaN, ...) become "".
    Non-string values are converted with ``str`` before cleaning.
    """
    if pd.isna(s):
        return ""
    collapsed = re.sub(r"\s+", " ", str(s))
    return collapsed.strip()

def _is_missing(value):
    """Return True for missing scalars (None, NaN, NA); containers are never missing."""
    if pd.api.types.is_list_like(value):
        return False
    return bool(pd.isna(value))


def _literal_list(text):
    """Parse ``text`` as a Python literal and return it as a list, or None on failure.

    ``ast.literal_eval`` is used instead of ``eval`` so that cell contents coming
    from the CSV can never execute code. A quoted string literal becomes a
    one-element list rather than being exploded into characters.
    """
    try:
        parsed = ast.literal_eval(text.strip())
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None
    if isinstance(parsed, str):
        return [parsed]
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return None


def _split_clean(value, sep):
    """Split ``str(value)`` on ``sep`` and drop empty / whitespace-only pieces."""
    return [piece.strip() for piece in str(value).split(sep) if piece.strip()]


def normalize_row(row):
    """Return a cleaned ``dict`` copy of one product row.

    Args:
        row: Mapping-like row (e.g. a pandas Series or a dict). It is not mutated.

    Returns:
        dict with every original key plus these normalized fields:
          * ``title``, ``brand``, ``description``: whitespace-cleaned via ``clean_text``.
          * ``price``: ``float`` after stripping ``$`` and ``,``; ``None`` when
            missing, blank, or unparseable.
          * ``categories``: list. A value starting with ``[`` is parsed as a Python
            literal (falling back to a comma split); anything else is split on ``|``.
          * ``images``: list. Parsed as a Python literal when possible, otherwise
            split on commas.
          * ``uniq_id``: kept when present; otherwise ``uid_<n>`` derived from a
            SHA-256 digest of the cleaned title, so the ID is the same on every run
            (the built-in ``hash`` of a string is salted per process). Rows with
            identical titles and no ID still receive the same generated ID.
    """
    row = dict(row)
    for field in ('title', 'brand', 'description'):
        row[field] = clean_text(row.get(field, ''))

    # price: a missing cell must map to None, not to float('nan')
    raw_price = row.get('price', '')
    price = None
    if not _is_missing(raw_price):
        price_text = str(raw_price).replace('$', '').replace(',', '').strip()
        if price_text:
            try:
                price = float(price_text)
            except ValueError:
                price = None
    row['price'] = price

    # categories
    cats = row.get('categories', '')
    if pd.api.types.is_list_like(cats):
        # already a sequence (e.g. the row was normalized before): keep as a list
        row['categories'] = list(cats)
    elif _is_missing(cats) or cats == '':
        row['categories'] = []
    elif isinstance(cats, str) and cats.strip().startswith('['):
        parsed = _literal_list(cats)
        row['categories'] = parsed if parsed is not None else _split_clean(cats, ',')
    else:
        row['categories'] = _split_clean(cats, '|')

    # images
    imgs = row.get('images', '')
    if pd.api.types.is_list_like(imgs):
        row['images'] = list(imgs)
    elif _is_missing(imgs) or imgs == '':
        row['images'] = []
    else:
        parsed = _literal_list(imgs) if isinstance(imgs, str) else None
        row['images'] = parsed if parsed is not None else _split_clean(imgs, ',')

    # ensure uniq_id (NaN is truthy, so it has to be checked explicitly)
    uid = row.get('uniq_id')
    if _is_missing(uid) or not uid:
        digest = hashlib.sha256(row['title'].encode('utf-8')).hexdigest()
        row['uniq_id'] = f"uid_{int(digest, 16) % (10**9)}"
    return row

def download_and_thumb(url, dest_path, size=(512,512), timeout=10):
    """Download an image, shrink it to fit ``size`` and save it as a JPEG.

    The thumbnail is first written to a temporary sibling file
    (``<name>.part``) and then renamed onto ``dest_path``. A failed or
    interrupted save therefore never leaves a truncated file at ``dest_path``
    that a later ``dest_path.exists()`` check would mistake for a finished
    download.

    Args:
        url: Image URL; surrounding whitespace is stripped.
        dest_path: ``pathlib.Path`` of the output file. Parent directories are
            created as needed.
        size: Maximum ``(width, height)`` passed to ``Image.thumbnail``.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True when the thumbnail was written, False on any error (best effort:
        exceptions are swallowed, not raised).
    """
    tmp_path = None
    try:
        r = requests.get(url.strip(), timeout=timeout)
        r.raise_for_status()
        img = Image.open(BytesIO(r.content)).convert('RGB')
        img.thumbnail(size)
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = dest_path.with_name(dest_path.name + '.part')
        img.save(tmp_path, format='JPEG', quality=85)
        # rename only once the file is complete
        tmp_path.replace(dest_path)
        return True
    except Exception:
        return False
    finally:
        # remove a leftover temp file; after a successful rename this is a no-op
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                pass

def main(csv_path, out_dir="data_prep_output"):
    """Clean a product CSV, write it as JSONL and download image thumbnails.

    Steps:
      1. Read ``csv_path`` and normalize every row with ``normalize_row``.
      2. Write one JSON object per line to ``<out_dir>/products_clean.jsonl``.
         Missing scalars (NaN/NA) are written as ``null``; otherwise
         ``json.dumps`` would emit bare ``NaN`` tokens, which are not valid JSON.
      3. For every URL in each row's ``images`` list, save a thumbnail to
         ``<out_dir>/images/<uniq_id>_<index>.jpg``. Files that already exist
         are skipped, so the function can be re-run to resume.

    Args:
        csv_path: Path of the input CSV file.
        out_dir: Output directory; created if it does not exist.

    Prints a count (and a short sample) of image URLs that could not be
    downloaded, followed by the output locations.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    df = pd.read_csv(csv_path, engine='python')
    cleaned = df.apply(normalize_row, axis=1)
    cleaned_df = pd.DataFrame(list(cleaned))
    # save jsonl
    jsonl = out / "products_clean.jsonl"
    with open(jsonl, "w", encoding="utf-8") as f:
        for _, r in cleaned_df.iterrows():
            # list-valued fields are left alone; only missing scalars become null
            record = {
                key: (None if not pd.api.types.is_list_like(value) and pd.isna(value) else value)
                for key, value in r.to_dict().items()
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    # download images thumbnails
    img_dir = out / "images"
    failed_urls = []
    for _, r in tqdm(cleaned_df.iterrows(), total=len(cleaned_df), desc="downloading images"):
        uid = r.get('uniq_id')
        for i, url in enumerate(r.get('images') or []):
            if not url: continue
            dest = img_dir / f"{uid}_{i}.jpg"
            if dest.exists(): continue
            if not download_and_thumb(url, dest):
                failed_urls.append(url)
    if failed_urls:
        # surface failures instead of silently dropping them
        print(f"Failed to download {len(failed_urls)} image(s); first few: {failed_urls[:5]}")
    print("Saved:", jsonl, "Images at:", img_dir)


# Run the pipeline on products.csv (resolved against the current working directory),
# writing to the default out_dir, "data_prep_output".
main(csv_path='products.csv')


downloading images: 100%|██████████| 312/312 [07:50<00:00,  1.51s/it]

Saved: data_prep_output/products_clean.jsonl Images at: data_prep_output/images



