In [1]:
from IPython.display import HTML, display

def set_css(*args, **kwargs):
    """Display a <style> block that makes <pre> elements wrap long lines.

    Any positional or keyword arguments are accepted and ignored, so the
    function can be used as an event callback regardless of what the caller
    passes in.
    """
    wrap_style = HTML('''
    <style>
        pre {
            white-space: pre-wrap;
        }
    </style>
    ''')
    display(wrap_style)
    
# Register set_css as a callback for IPython's 'pre_run_cell' event.
# NOTE(review): re-running this cell redefines set_css and presumably registers
# an additional callback each time — confirm, and unregister first if so.
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
from convokit import Corpus
from dataclasses import dataclass
from typing import List
from functools import reduce
from pathlib import Path


# Load every ConvoKit corpus found directly under `fashion-corpora/`; only
# sub-directories whose name ends in ".corpus" are treated as corpora.
# The directory listing is sorted because Path.iterdir() yields entries in
# arbitrary order, and the order here determines the positions in `texts`/`ids`
# (and therefore the row numbers of anything built from them).
base = Path("fashion-corpora")
corpora = [Corpus(str(p))
           for p in sorted(base.iterdir())
           if p.is_dir() and p.suffix == ".corpus"]


# Parallel lists: texts[i] is the raw text of the utterance whose id is ids[i].
texts, ids = [], []
for c in corpora:
    for utt in c.iter_utterances():
        # Skip utterances whose text is missing, empty, or whitespace-only.
        if utt.text and utt.text.strip():
            texts.append(utt.text)
            ids.append(utt.id)

import re

def clean_text(s: str) -> str:
    """Normalise one utterance string.

    Steps, in order:
      1. Remove the placeholders "[deleted]" and "[removed]" (case-insensitive).
      2. Decode the HTML entity "&amp;" (with or without the trailing ";")
         to a literal "&".
      3. Collapse every run of whitespace, including newlines, to one space.
      4. Strip leading and trailing whitespace.

    Args:
        s: Raw utterance text.

    Returns:
        The cleaned text; an empty string if nothing is left after cleaning.
    """
    s = re.sub(r'\[deleted\]|\[removed\]', '', s, flags=re.IGNORECASE)
    # Decode the entity instead of deleting it: dropping it fuses the
    # neighbouring characters together (e.g. "H&amp;M" would become "HM").
    s = re.sub(r'&amp;?', '&', s)
    # \s already matches "\n", so no separate newline replacement is needed.
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

# Cleaned copy of `texts`, keeping only entries that are non-empty after cleaning.
# NOTE(review): because empty results are dropped, positions in this list do
# not line up with `ids`; the embedding cell encodes `texts`, not this list.
clean_texts = [cleaned for cleaned in map(clean_text, texts) if cleaned]


In [4]:
%pip install -U sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Using cached sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-4.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
%pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp312-cp312-macosx_11_0_arm64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss


# Embed every raw utterance in `texts` with the all-MiniLM-L6-v2 model.
model = SentenceTransformer("all-MiniLM-L6-v2")
embs = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

# Scale each row to unit L2 length so the inner product used by the index
# below equals cosine similarity; the 1e-8 floor avoids division by zero.
norms = np.linalg.norm(embs, axis=1, keepdims=True)
embs = embs / np.maximum(norms, 1e-8)

# Flat inner-product index; row i corresponds to texts[i].
dim = embs.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embs.astype("float32"))



Batches: 100%|██████████| 4378/4378 [03:17<00:00, 22.19it/s] 


0.895 — Chelsea boots. 
0.845 — Chelsea boots. Got it.


0.845 — glamorous unique design of chelsea boots


In [10]:
def retrieve_similar_reviews(query: str, top_k: int = 5):
    """Return the indexed utterances most similar to `query`.

    The query is embedded with the module-level `model`, scaled to unit L2
    length, and searched against the module-level inner-product `index`.

    Args:
        query: Free-text search string.
        top_k: Maximum number of hits to return. Values <= 0 yield an empty list.

    Returns:
        A list of dicts in the order returned by the index search, each with
        keys "utterance_id" (from `ids`), "text" (from `texts`) and "score"
        (the inner-product score as a Python float). The list may be shorter
        than `top_k` when the index holds fewer vectors.
    """
    if top_k <= 0:
        return []

    q_emb = model.encode([query], convert_to_numpy=True)
    # Same zero-norm guard as used when normalising the corpus embeddings.
    q_norm = np.linalg.norm(q_emb, axis=1, keepdims=True)
    q_emb = q_emb / np.clip(q_norm, 1e-8, None)

    scores, idxs = index.search(q_emb.astype("float32"), top_k)
    results = []
    for sc, idx in zip(scores[0], idxs[0]):
        # FAISS pads missing hits with label -1 when fewer than top_k vectors
        # are available; indexing a list with -1 would silently return the
        # last utterance, so those slots are skipped.
        if idx < 0:
            continue
        pos = int(idx)
        results.append({
            "utterance_id": ids[pos],
            "text": texts[pos],
            "score": float(sc)
        })
    return results

# Example query: print the 20 best matches as "<score> — <text>".
hits = retrieve_similar_reviews("vintage chelsea boots", top_k=20)
for hit in hits:
    print(f"{hit['score']:.3f} — {hit['text']}")

0.895 — Chelsea boots. 
0.845 — Chelsea boots. Got it.


0.845 — glamorous unique design of chelsea boots
0.764 — I love to wear Chelsea boots and this was more accurate than I expected.
0.754 — Chelsea boots
Description
Also known as dealer boots, the Chelsea boots are tight fitting ankle length boots with low heels. They consist of two parts each made from a single piece of leather: the vamp and the quarters which meet near the ankle where they are joined by a strip of vulcanized rubber or elastic.  The elastic strip extends to just below the ankle but does not go all the way down to the sole. Instead of being sewn on top of each other, the vamp and quarters are sewn together in one plain below the ankle. The Chelsea boots have rounded toes just like the Jodhpur boots but the sewing techniques on them totally distinguish them.
History
The boot has Victorian roots to it. Its design and invention is attributed to J.Sparkes Hall the boot maker to Queen Victoria. Thanks to the inventor o