In [2]:
from dotenv import load_dotenv
load_dotenv(override=True)
import json, re, collections, pathlib, os, time
from rapidfuzz import process, fuzz
import nltk
from nltk.corpus import stopwords

In [3]:
# Fetch the NLTK "stopwords" resource; the stop-word set built from
# nltk.corpus.stopwords in this notebook needs it to be present.
nltk.download("stopwords", quiet=True)

True

In [4]:
# English stop-words from NLTK, stored as a set; autocomplete() tests
# membership against it for every token.
STOPWORDS = set(stopwords.words('english'))

# NOTE: despite its name, JSON_DIR must be the path of the JSON *file* holding
# the documents -- the loading cell opens it directly with open().
# Fail fast with a readable message: pathlib.Path(None) raises an opaque
# TypeError, and an empty value would silently become Path(".").
if not os.getenv("JSON_DIR"):
    raise RuntimeError(
        "Environment variable JSON_DIR is not set; point it at the JSON file "
        "to load (for example in the .env file read by load_dotenv)."
    )
JSON_DIR = pathlib.Path(os.environ["JSON_DIR"])

In [5]:
# Parse the whole JSON file at JSON_DIR into memory as `docs`.
with JSON_DIR.open(mode="r", encoding="utf-8") as f:
    docs = json.loads(f.read())

In [6]:
# Document keys whose text is indexed for fuzzy matching, in lookup order.
FIELDS = [
    "shoppingCategory", "shoppingSubcategory",
    "itemCategory", "itemSubcategory",
    "name",
]

In [7]:
# Flatten the documents into (document index, field name, text) triples,
# one per usable value of each key listed in FIELDS.
entries = [
    (idx, fld, text)
    for idx, doc in enumerate(docs)
    for fld in FIELDS
    for raw in [doc.get(fld)]        # one-item loop: binds `raw` inside the comprehension
    if raw is not None               # missing key or JSON null; str(None) would index the text "None"
    for text in [str(raw).strip()]   # non-string values are stringified; strip only once
    if text                          # skip empty / whitespace-only values
]

In [8]:
# Text component of every entry, in the same order as `entries`;
# this list is what autocomplete() fuzzy-matches against.
entry_strings = [text for (_idx, _fld, text) in entries]

In [9]:
# Word tokenizer: findall() yields each run of word characters
# (letters, digits, underscore) delimited by word boundaries.
tokenizer = re.compile(
    r"\b\w+\b",
    flags=re.UNICODE,
)

In [10]:
def autocomplete(query: str, matches_to_consider: int = 30, top_words: int = 1):
    """
    Return the most common word(s) inside the corpus strings that fuzzily match `query`.
    • query – text to match; surrounding whitespace is ignored
    • matches_to_consider – how many top fuzzy hits to examine
    • top_words – 1 ⇒ single best suggestion, >1 ⇒ a list

    Return value
    • [] when the query is blank, when top_words < 1, or when the hits contain
      no token that survives stop-word removal
    • a single str when top_words == 1
    • a list of at most `top_words` str (most frequent first) when top_words > 1
    """
    query = query.strip() if query else ""
    # Bail out before the (comparatively slow) fuzzy search when there is nothing
    # to do.  top_words < 1 used to fall through to common[0][0] on an empty
    # list and raise IndexError.
    if not query or top_words < 1:
        return []

    # Step 1: fuzzy match the query against the corpus
    best = process.extract(
        query,
        entry_strings,
        scorer=fuzz.WRatio,        # RapidFuzz's weighted combination of its ratio scorers
        limit=matches_to_consider
    )

    # Steps 2-3: tokenize every matched string, lowercase each token once,
    # drop stop-words and tally what is left.  hit[0] is the matched string.
    counts = collections.Counter()
    for hit in best:
        for tok in tokenizer.findall(hit[0]):
            word = tok.lower()
            if word not in STOPWORDS:
                counts[word] += 1

    if not counts:
        return []

    # Step 4: return the most common word(s); ties keep first-seen order
    common = counts.most_common(top_words)
    return [w for w, _ in common] if top_words > 1 else common[0][0]

In [11]:
# Interactive demo: prompt for queries until a blank line is entered and
# print the suggestions together with the time each lookup took.
while True:
    q = input("\nSearch (blank to quit): ").strip()
    if not q:
        break
    # perf_counter() is the high-resolution clock intended for measuring short
    # durations; unlike time.time() it is not affected by system clock changes.
    start = time.perf_counter()
    suggestion = autocomplete(q, matches_to_consider=40, top_words=3)
    end = time.perf_counter()
    print(f"Suggestion(s): {suggestion}, time: {end-start}")

Suggestion(s): ['beauty', 'bronzer'], time: 1.506448745727539
Suggestion(s): ['eyebrow', 'sweet', 'straw'], time: 1.5272464752197266
