In [1]:
from IPython.display import HTML, display

def set_css(*args, **kwargs):
    """Emit a <style> element that makes <pre> blocks wrap long lines.

    Any positional or keyword arguments are accepted and ignored, so the
    function can be registered as an IPython event callback regardless of
    what the event passes in.
    """
    wrap_rule = HTML('''
    <style>
        pre {
            white-space: pre-wrap;
        }
    </style>
    ''')
    display(wrap_rule)


# Run set_css before each cell executes so the wrap rule is emitted every time.
get_ipython().events.register('pre_run_cell', set_css)

In [8]:
import re

def clean_text(s: str) -> str:
    """Normalise a raw text snippet for embedding.

    Steps, in order:
      1. Strip the literal placeholders ``[deleted]`` / ``[removed]``
         (case-insensitive).
      2. Remove the HTML-escaped ampersand ``&amp`` (with or without the
         trailing ``;``). The entity is deleted, not decoded.
      3. Collapse every run of whitespace (including newlines) to a single
         space and trim both ends.

    Args:
        s: Raw text. ``None`` or any non-string value is treated as empty.

    Returns:
        The cleaned string; ``''`` when nothing usable remains.
    """
    # JSON nulls and missing values reach this function via dict.get();
    # treat them as empty instead of letting re.sub raise TypeError.
    if not isinstance(s, str) or not s:
        return ''

    s = re.sub(r'\[deleted\]|\[removed\]', '', s, flags=re.IGNORECASE)
    s = re.sub(r'&amp;?', '', s)
    # '\s+' already matches newlines, so no separate newline replacement is needed.
    return re.sub(r'\s+', ' ', s).strip()



In [3]:
%pip install faiss-cpu

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [19]:
from convokit import Corpus
from dataclasses import dataclass
from typing import List
from functools import reduce
from pathlib import Path
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Sentence-embedding model shared by every encode() call in this notebook.
# NOTE(review): "fashion-bert-output-v2" is presumably a local fine-tuned
# checkpoint directory rather than a hub model name — confirm.
model = SentenceTransformer("fashion-bert-output-v2")

In [12]:
base = Path("fashion-corpora")
# Load every "*.corpus" directory found directly under the base folder.
# NOTE(review): Path.iterdir() yields entries in arbitrary order, so the row
# order of `texts` / `ids` may differ between machines — confirm before
# pairing them with embeddings cached on disk.
corpora = [Corpus(str(p))
           for p in base.iterdir()
           if p.is_dir() and p.suffix == ".corpus"]

# Raw utterance texts and their ids, kept index-aligned.
texts, ids = [], []
for c in corpora:
    for utt in c.iter_utterances():
        doc = utt.text
        # Keep only utterances with at least five whitespace-separated tokens
        # (this also rejects None, empty and whitespace-only text).
        if doc and len(doc.split()) >= 5:
            texts.append(doc)
            ids.append(utt.id)

# Clean each kept utterance and drop those that become empty. The id is
# filtered together with the text so `clean_ids[i]` always belongs to
# `clean_texts[i]`; filtering only the texts would shift every later id.
clean_ids, clean_texts = [], []
for utt_id, raw in zip(ids, texts):
    cleaned = clean_text(raw)
    if cleaned:
        clean_ids.append(utt_id)
        clean_texts.append(cleaned)

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def filter_distinctive_reviews(reviews, bottom_percentile=30):
    """Drop the documents with the lowest mean TF-IDF weight.

    A TF-IDF model (English stop words removed, terms must occur in at least
    two documents) is fitted on ``reviews``. Each document is scored by the
    mean of its TF-IDF row, and only documents whose score is strictly above
    the ``bottom_percentile``-th percentile of all scores are kept.

    Args:
        reviews: Iterable of text documents. It is materialised once, so a
            generator is accepted.
        bottom_percentile: Percentile (0-100) of the score distribution used
            as the cut-off.

    Returns:
        A list of the retained documents in their original order. An empty
        input yields an empty list.
    """
    # Materialise first: the documents are iterated twice (fit + final zip).
    reviews = list(reviews)
    if not reviews:
        # Nothing to score; fitting a vectorizer on no documents would raise.
        return []

    vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
    tfidf_matrix = vectorizer.fit_transform(reviews)
    # np.asarray(...).ravel() flattens the per-row means whether the sparse
    # mean comes back as an np.matrix or as a plain ndarray.
    tfidf_scores = np.asarray(tfidf_matrix.mean(axis=1)).ravel()
    threshold = np.percentile(tfidf_scores, bottom_percentile)

    # Strict comparison: documents tied with the threshold are dropped too.
    return [rev for rev, score in zip(reviews, tfidf_scores) if score > threshold]

In [13]:
# Keep only cleaned utterances whose mean TF-IDF score is strictly above the
# default 30th-percentile cut-off of filter_distinctive_reviews.
filtered_texts = filter_distinctive_reviews(clean_texts)

In [14]:
# Size of the raw `texts` list (not `clean_texts` or `filtered_texts`).
len(texts)

89115

In [20]:
import json

# Read the whole JSON document as UTF-8 text and parse it in one step; the
# parsed value is what extract_texts_and_ids() iterates over.
json_path = Path("COMBINED-FINAL.json")
records = json.loads(json_path.read_text(encoding="utf-8"))

def extract_texts_and_ids(data):
    """Build index-aligned text and id lists from product records.

    Each record is a mapping read through the keys ``'ID'``, ``'name'`` and
    ``'description'``. A record is skipped when its ID is missing or empty,
    when its description is missing or not a string, or when the description
    is empty after ``clean_text``.

    Args:
        data: Iterable of dict-like product records.

    Returns:
        ``(texts, ids)``, where ``texts[i]`` is ``"<name>: <clean description>"``
        and ``ids[i]`` is the ``'ID'`` of the same record.
    """
    p_texts, p_ids = [], []
    for item in data:
        product_id = item.get('ID')
        # Reject only genuinely absent IDs; a plain truthiness test would
        # also discard a legitimate ID of 0.
        if product_id is None or product_id == '':
            continue

        # .get() returns None for an explicit JSON null even with a default,
        # so validate the type before handing the value to clean_text.
        raw_desc = item.get('description')
        if not isinstance(raw_desc, str) or not raw_desc:
            continue
        clean_desc = clean_text(raw_desc)
        if not clean_desc:
            continue

        # A null name must not break the string concatenation below.
        name = item.get('name')
        name = '' if name is None else str(name)

        p_texts.append(name + ': ' + clean_desc)
        p_ids.append(product_id)

    return p_texts, p_ids


# NOTE(review): this rebinds the notebook-level `texts` / `ids` (previously
# the utterance lists) to the product texts and IDs; every later cell that
# reads `texts` / `ids` sees the product data from here on.
texts, ids  = extract_texts_and_ids(records)
print('Size of text list: ' + str(len(texts)))

Size of text list: 1511


In [21]:
# Encode every entry of `texts` into one embedding row per text.
embs  = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# L2-normalise each row; the clip keeps an all-zero vector from dividing by 0.
norms = np.linalg.norm(embs, axis=1, keepdims=True)
embs  = embs / np.clip(norms, 1e-8, None)

# Flat inner-product index; on unit-length rows the inner product equals
# cosine similarity. Row i of the index corresponds to texts[i] / ids[i].
dim   = embs.shape[1]
index = faiss.IndexFlatIP(dim)  
index.add(embs.astype("float32"))

Batches: 100%|██████████| 48/48 [00:26<00:00,  1.78it/s]


In [22]:
# Persist the current (normalised) `embs` array to disk.
np.save("product_embeddings.npy", embs)

In [17]:
def retrieve_similar_reviews(query: str, top_k: int = 5):
    """Return the entries of the global ``index`` most similar to ``query``.

    The query is encoded with the global ``model``, L2-normalised, and
    searched against the global FAISS ``index``. Hits are mapped back through
    the global ``ids`` and ``texts`` lists, which must be aligned with the
    rows of ``index``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of dicts with keys ``"utterance_id"``, ``"text"`` and
        ``"score"`` (inner-product similarity as a Python float), best match
        first. Fewer than ``top_k`` entries are returned when the index holds
        fewer vectors.
    """
    q_emb = model.encode([query], convert_to_numpy=True)
    # Clip the norm so a degenerate all-zero embedding does not produce NaNs
    # (same guard as used when the index vectors were normalised).
    q_norm = np.linalg.norm(q_emb, axis=1, keepdims=True)
    q_emb = q_emb / np.clip(q_norm, 1e-8, None)

    scores, idxs = index.search(q_emb.astype("float32"), top_k)
    results = []
    for sc, idx in zip(scores[0], idxs[0]):
        # FAISS pads missing neighbours with index -1; used as a Python list
        # index that would silently return the last element.
        if idx < 0:
            continue
        results.append({
            "utterance_id": ids[idx],
            "text": texts[idx],
            "score": float(sc)
        })
    return results

# NOTE(review): this rebinds `embs` from a file on disk, but `index` is not
# rebuilt from it. retrieve_similar_reviews() reads only `model`, `index`,
# `ids` and `texts`, so the loaded array does not affect the query below —
# confirm that `embs` and `index` are meant to stay in sync.
embs = np.load("utterance_embeddings.npy")
# Print the similarity score and text of each hit for a sample query.
for res in retrieve_similar_reviews("vintage chelsea boots", top_k=20):
    print(f"{res['score']:.3f} — {res['text']}")

0.755 — • Chompy bird hat  
  
• Eagle cape  
• Desert amulet 1  
  
• Beginner wand  
• Gold satchel  
  
• Samurai greaves  
  
• Moonclan gloves  
• Slave boots
0.744 — Sorry for not replying sooner. I use r245, g190, b130. It achieves a pretty nice blonde but your results might vary on your screen.
0.725 — I totally agréé. I want to bé héalthy, not scary
0.716 — I'm using Mornes leggings, colour doesn't match well but the shape does imo.  
0.676 — No worries :) thanks I’ll have a look at that fashion! 
0.635 — Haha, we still have our original xbox so I may have to hook it up and play through fable
0.629 — thank you for your tips! very useful.
0.623 — I dunno.. its a suit you're gonna wear for a day.  Chances are its going to be too formal to wear to anything like work or a night out.  Seems like a waste to buy it unless you attend formal events on the regular.  The list of reasons seem pretty superficial too.
0.620 — For the Frayed Blade, I'm currently using the Full Ringed Knight 

In [18]:
import json

# Read both product dumps. The encoding is given explicitly because open()
# otherwise falls back to a platform-dependent default, which can fail on
# non-ASCII text.
with open("mercari-set1.json", "r", encoding="utf-8") as f:
    products_set1 = json.load(f)

with open("mercari-set2.json", "r", encoding="utf-8") as f:
    products_set2 = json.load(f)

# Single combined list; set 1 entries come first.
products = products_set1 + products_set2

# Peek at the available fields of the first few records.
for item in products[:5]:
    print(item.keys())

# Index-aligned lists: prod_texts[i], prod_ids[i] and prod_meta[i] all
# describe the same product.
prod_texts = []
prod_ids   = []
prod_meta  = []

for item in products:
    pid = item.get("train_id")
    if pid is None:
        continue

    # A JSON null (or any non-string value) comes back from .get() as-is and
    # would make clean_text raise, so skip such records up front.
    raw_desc = item.get("item_description")
    if not isinstance(raw_desc, str):
        continue
    desc = clean_text(raw_desc)
    if not desc:
        continue

    prod_texts.append(desc)
    prod_ids.append(pid)
    prod_meta.append({
        "train_id": pid,
        "name":     item.get("name", ""),
        "price":    item.get("price", ""),
        "category": item.get("category_name", "")
    })

print(f"Indexing {len(prod_texts)} products…")

# One embedding row per product description.
prod_embs = model.encode(prod_texts, convert_to_numpy=True, show_progress_bar=True)
# L2-normalise with a clipped norm so an all-zero embedding cannot turn into
# NaNs (same guard as the `embs` normalisation earlier in the notebook).
prod_embs = prod_embs / np.clip(
    np.linalg.norm(prod_embs, axis=1, keepdims=True), 1e-8, None
)

# Flat inner-product index; row i corresponds to prod_meta[i].
idx_prod = faiss.IndexFlatIP(prod_embs.shape[1])
idx_prod.add(prod_embs.astype("float32"))




FileNotFoundError: [Errno 2] No such file or directory: 'mercari-set1.json'

In [None]:
def search_products(query: str,
                    k_corpus: int = 10,
                    k_prod:   int = 20,
                    alpha:    float = 1.0,
                    beta:     float = 0.75):
    """Search products with a query expanded by its nearest corpus vectors.

    The query embedding is combined with the mean embedding of its
    ``k_corpus`` nearest neighbours in the global ``index``, and the combined
    vector is then searched against the global product index ``idx_prod``.

    Relies on the notebook globals ``model``, ``index``, ``embs``,
    ``idx_prod`` and ``prod_meta``.
    NOTE(review): ``embs`` is indexed with row numbers returned by ``index``,
    so it must be the same array the index was built from — confirm.

    Args:
        query: Free-text search string.
        k_corpus: Number of neighbours fetched from ``index`` for expansion.
        k_prod: Maximum number of products to return.
        alpha: Weight of the original query embedding.
        beta: Weight of the mean neighbour embedding.

    Returns:
        A list of dicts with keys ``"train_id"``, ``"name"``, ``"price"``,
        ``"category"`` and ``"score"``, best match first. Fewer than
        ``k_prod`` entries are returned when the product index is smaller.
    """
    q_emb = model.encode([query], convert_to_numpy=True)
    # Clipped norm: an all-zero embedding must not become NaN.
    q_emb = q_emb / np.clip(np.linalg.norm(q_emb, axis=1, keepdims=True), 1e-8, None)

    _, idxs_u = index.search(q_emb.astype("float32"), k_corpus)
    # FAISS pads missing neighbours with -1; as a NumPy index that would pull
    # in the last row of `embs`, so keep only real hits.
    neighbour_rows = idxs_u[0][idxs_u[0] >= 0]

    expanded = alpha * q_emb
    if neighbour_rows.size:
        expanded = expanded + beta * embs[neighbour_rows].mean(axis=0, keepdims=True)
    expanded = expanded / np.clip(
        np.linalg.norm(expanded, axis=1, keepdims=True), 1e-8, None
    )

    sim_p, idxs_p = idx_prod.search(expanded.astype("float32"), k_prod)

    results = []
    for score, pid_idx in zip(sim_p[0], idxs_p[0]):
        # Skip the -1 padding for the same reason as above.
        if pid_idx < 0:
            continue
        meta = prod_meta[pid_idx]
        results.append({
            "train_id": meta["train_id"],
            "name":     meta["name"],
            "price":    meta["price"],
            "category": meta["category"],
            "score":    float(score)
        })
    return results

# Sample query: print score, name, price and category of each returned product.
for r in search_products("sporty", k_corpus=10, k_prod=10):
    print(f"{r['score']:.3f}\t{r['name']} (${r['price']}) — {r['category']}")