In [50]:
from convokit import Corpus
from dataclasses import dataclass
from typing import List
from functools import reduce
from pathlib import Path
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

In [64]:
model = SentenceTransformer("fashion-bert-output-v2")

In [65]:
import re

def clean_text(s: str) -> str:
    s = re.sub(r'\[deleted\]|\[removed\]', '', s, flags=re.IGNORECASE)
    s = re.sub(r'&amp;?', '', s)
    s = s.replace('\n', ' ')
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

In [77]:
import json
from pathlib import Path

def load_mercari_json(path: Path):
    """
    Load a JSON file where each element is expected to be a dict
    with at least 'id' and 'description' fields.
    """
    with path.open('r', encoding='utf-8') as f:
        return json.load(f)

def extract_texts_and_ids(data):
    p_texts, p_ids, p_metadata = [], [], []
    for item in data:
        product_id = item.get('ID')
        if not product_id:
            continue

        name = item.get('name', '')
        desc = item.get('description', '')
        clean_desc = clean_text(desc)
        if not clean_desc:
            continue

        p_texts.append(name + ': ' + clean_desc)
        p_ids.append(product_id)

    return p_texts, p_ids

json_path = Path("COMBINED-FINAL.json")
data = load_mercari_json(json_path)

texts, ids  = extract_texts_and_ids(data)
print('Size of text list: ' + str(len(texts)))

# apply your cleaning function and filter out any empty results
clean_texts = [clean_text(t) for t in texts]
clean_texts = [t for t in clean_texts if t]
print('Size of text list: ' + str(len(clean_texts)))

Size of text list: 1511
Size of text list: 1511


In [78]:
print(clean_texts)

['y2k silver square watch black: y2k silver square watch black leather - adjustable - works to tell time - 7.5” length #watch #accessories #squarewatch #y2kwatch #vintagewatch', 'silver skeleton hand belt w: Silver Skeleton Hand Belt w/ Metal Buckle #goth #grunge #emo #belt #y2k', '0648 floral square buckle 5979: 0648 - Floral Square Buckle Studded Belt Street/grunge style belt Fits 35-41 waist, 45” length Brand new Bundle deal discounts! #belt #buckle #studded #floral #square', 'twisted silver carabiner comes with 9078: Twisted Silver Carabiner (comes with keyring) (spiderman not included) #silver #carabiner #chrome #outdoor #keychain', 'these wrap around sport sunglasses 586e: These wrap around sport sunglasses are perfect for athletic activities and everyday wear. The perfect eyewear for active individuals who want to protect their eyes while enjoying outdoor activities such as running, cycling, hiking, and fishing. These glasses are not only exceptionally attractive but also very s

In [79]:
json_path = Path("COMBINED-FINAL.json")
with json_path.open("r", encoding="utf-8") as f:
    records = json.load(f)

ids = list(range(len(clean_texts))) # [r["ID"] for r in records]
print('Size of ID list: ' + str(len(ids)))
print('Size of text list: ' + str(len(clean_texts)))

Size of ID list: 1511
Size of text list: 1511


In [80]:
embs = model.encode(clean_texts, show_progress_bar=True)
print(len(embs))

Batches: 100%|██████████| 48/48 [00:15<00:00,  3.18it/s]

1511





In [81]:
import pandas as pd

embs = model.encode(clean_texts, show_progress_bar=True)
df = pd.DataFrame(embs, columns=[f"dim{i}" for i in range(len(embs[0]))])
df.insert(0, "id", ids)  # if you want ids
df.to_csv("FINAL-EMBEDDINGS.csv", index=False)

Batches: 100%|██████████| 48/48 [00:02<00:00, 17.19it/s]
