In [1]:
import json

# Load the word list from the JSON file. The encoding is given explicitly so
# decoding does not depend on the platform's locale default (JSON is UTF-8).
with open("words.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Only the keys (the words themselves) are needed; iterating a dict yields its keys.
words = list(data)

print(f"Total words loaded: {len(words)}")
print("Sample:", words[:10])


Total words loaded: 370101
Sample: ['a', 'aa', 'aaa', 'aah', 'aahed', 'aahing', 'aahs', 'aal', 'aalii', 'aaliis']


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the autocomplete model: TF-IDF over character n-grams of length 2 to 4,
# using the "char_wb" analyzer.
vectorizer = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(2, 4),
)

# Fit on the full word list and keep the resulting matrix (one row per word).
word_vectors = vectorizer.fit_transform(words)

print("TF-IDF model trained.")


TF-IDF model trained.


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def suggest(prefix: str, top_k: int = 5) -> list:
    """Return up to ``top_k`` words that start with ``prefix``, best match first.

    Candidates are the entries of the notebook-level ``words`` list that
    literally start with ``prefix``. They are ranked by the cosine similarity
    between the TF-IDF vector of ``prefix`` and each candidate's row in
    ``word_vectors``; candidates with equal scores keep their order in ``words``.

    Args:
        prefix: Text typed so far. Matching uses ``str.startswith`` and is
            therefore case-sensitive.
        top_k: Maximum number of suggestions. Values <= 0 yield an empty list.

    Returns:
        A list of at most ``top_k`` words, highest similarity first. Empty when
        no word starts with ``prefix``.
    """
    # Without this guard a negative top_k would slice as candidates[:-n] and
    # return almost every match instead of none.
    if top_k <= 0:
        return []

    # Filter on the prefix first so similarity is only computed for words that
    # can actually be suggested, not for the entire vocabulary.
    candidate_ids = [i for i, word in enumerate(words) if word.startswith(prefix)]
    if not candidate_ids:
        return []

    prefix_vector = vectorizer.transform([prefix])
    scores = cosine_similarity(prefix_vector, word_vectors[candidate_ids]).ravel()

    # A stable sort on the negated scores gives descending order while leaving
    # tied candidates in their original `words` order.
    ranking = np.argsort(-scores, kind="stable")[:top_k]
    return [words[candidate_ids[j]] for j in ranking]



In [6]:
# Example: request up to 10 completions for the prefix "aut".
suggest("aut", 10)


['aute',
 'auth',
 'autojuggernaut',
 'auto',
 'autor',
 'autoput',
 'autist',
 'autos',
 'autere',
 'autism']