In [28]:
from convokit import Corpus
from dataclasses import dataclass
from typing import List
from functools import reduce
from pathlib import Path
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

In [29]:
# Hugging Face identifier of the sentence-embedding checkpoint used throughout the notebook.
MODEL_NAME = "sentence-transformers/msmarco-distilbert-base-dot-prod-v3"

# Instantiate the encoder once; later cells call `model.encode(...)`.
model = SentenceTransformer(MODEL_NAME)

In [30]:
import re

def clean_text(s: str) -> str:
    """Normalise a raw text snippet into a single trimmed line.

    Steps, in order:
      1. Delete the placeholders ``[deleted]`` / ``[removed]`` (any letter case).
      2. Delete the literal entity ``&amp`` with or without its trailing ``;``
         (the entity is removed, not decoded to ``&``; bare ``&`` is untouched).
      3. Turn newlines into spaces and collapse every whitespace run to one space.
      4. Strip leading and trailing whitespace.

    Args:
        s: The text to clean.

    Returns:
        The cleaned text; an empty string when nothing but noise remained.
    """
    # (pattern, replacement, regex flags) applied one after another.
    substitutions = (
        (r'\[deleted\]|\[removed\]', '', re.IGNORECASE),
        (r'&amp;?', '', 0),
    )
    for pattern, replacement, flags in substitutions:
        s = re.sub(pattern, replacement, s, flags=flags)

    flattened = s.replace('\n', ' ')
    return re.sub(r'\s+', ' ', flattened).strip()

In [31]:
import json
from pathlib import Path

def load_mercari_json(path: Path):
    """
    Read the UTF-8 encoded JSON document at ``path`` and return the parsed object.

    The rest of this notebook iterates over the result and reads the
    'name' and 'ID' keys of each element.
    """
    raw_json = path.read_text(encoding='utf-8')
    return json.loads(raw_json)

def extract_texts_and_ids(data):
    """Collect cleaned, non-empty listing names together with their IDs.

    Args:
        data: Iterable of dict records. Each record is read through
            ``.get('name')`` and ``.get('ID')``.

    Returns:
        A ``(texts, ids)`` tuple of two equally long lists. ``texts[i]`` is the
        cleaned name of a record and ``ids[i]`` is that same record's 'ID'
        (``None`` when the key is absent). Records whose name is missing,
        null, or empty after cleaning are skipped in both lists.
    """
    texts, ids = [], []
    for item in data:
        # `or ''` handles a missing key AND an explicit JSON null; the plain
        # `.get('name', '')` default only handled the former, so a null name
        # reached clean_text() as None and raised TypeError.
        text = clean_text(item.get('name') or '')
        if text:
            texts.append(text)
            ids.append(item.get('ID'))
    return texts, ids

json_path = Path("reformatted-mercari-final.json")
data = load_mercari_json(json_path)

texts, ids = extract_texts_and_ids(data)

# Apply the cleaning function again and drop empty results. Texts and ids are
# filtered as pairs: filtering `clean_texts` alone would leave `ids` longer
# than `clean_texts` whenever an entry is dropped, breaking the index
# correspondence between the two lists.
cleaned_pairs = [(clean_text(t), item_id) for t, item_id in zip(texts, ids)]
cleaned_pairs = [(t, item_id) for t, item_id in cleaned_pairs if t]
clean_texts = [t for t, _ in cleaned_pairs]
ids = [item_id for _, item_id in cleaned_pairs]

In [32]:
# Dump every cleaned listing name to the cell output (the whole list, no truncation).
print(clean_texts)

['AVA-VIV Blouse', 'Acacia pacific tides santorini top', 'Girls cheer and tumbling bundle of 7', 'HOLD for Dogs2016 Minnetonka boots', 'Cream/ Beige Front Cross Shirt', 'NWT VS ULTIMATE SPORTS BRA 34ddd', 'Black and Red Baseball Tee', 'Forever21 floral romper strapless', 'Kendra bundle', 'Black Capri Leggings w/ Flowers One size', 'Under Armour maroon and gray shirt metal', 'Victoria secret 34 c corest top', 'tropical flower print bra from pink', "Woman's north face puffer vest", 'Brandy Melville Off Shoulder Crop Top', 'Lululemon wunder under crop', "PINK by Victoria's Secret lace bandeau", 'NWT Lularoe OS French bulldog leggings', 'High waist Levi jeans', 'Highwaist Distressed Denim Shorts', "H&M Women's Long Horse Sweater", '(M) Under Armour half zip jacket!', 'Black & Burgundy Bikini Bottoms', 'Maternity top bundle', "Women's Espresso Pink Coral shirt XL", 'Silver jeans size 11', 'Size 6 Watercolor Inspire Crop', 'Muscle t-shirt', 'Floral two Piece tracksuit set', 'Boots NWT 6.5', 

In [33]:
json_path = Path("reformatted-mercari-final.json")
# Reuse the notebook's loader instead of duplicating the open/json.load logic.
records = load_mercari_json(json_path)

# Keep only the IDs of records whose name survives cleaning. `clean_texts`
# excludes records with an empty cleaned name, so taking the ID of every
# record could yield a list that is longer than (and shifted against) the texts.
ids = [r["ID"] for r in records if clean_text(r.get("name") or "")]
assert len(ids) == len(clean_texts), (
    f"ids ({len(ids)}) and clean_texts ({len(clean_texts)}) are out of sync"
)
print('Size of ID list: ' + str(len(ids)))

Size of ID list: 530


In [34]:
import pandas as pd

# Encode every cleaned name; `embs[k]` is the vector for `clean_texts[k]`.
embs = model.encode(clean_texts, show_progress_bar=True)

# One column per embedding component: dim0, dim1, ...
dim_columns = [f"dim{i}" for i, _ in enumerate(embs[0])]
df = pd.DataFrame(embs, columns=dim_columns)

# Put the listing id in the first column; `ids` must have one entry per row.
df.insert(0, "id", ids)
df.to_csv("mercari-embeddings.csv", index=False)

Batches: 100%|██████████| 17/17 [00:02<00:00,  6.18it/s]
