# Contextual Synonym Analysis (NLTK Version 1)

This notebook uses NLTK WordNet synsets to find vocabulary tokens that share the same set of meanings, then collapses each group of interchangeable tokens onto a single representative so the tokenizer vocabulary can be reduced.

In [None]:
import json
from nltk.corpus import wordnet as wn
from collections import defaultdict
import editdistance

# --- Configuration ---
# Minimum overlap score a word pair must reach to be reported by
# find_contextual_synonyms. The score is (shared synsets) / (combined synsets)
# of the two words, so it is at most 1, and 1 means both words list exactly
# the same synsets.
THRESHOLD = 1 

def extract_vocab(json_path):
    """Load candidate whole-word tokens from a tokenizer JSON file.

    Reads the mapping stored under ``model -> vocab`` and keeps only the keys
    that are longer than three characters and purely alphabetic.

    Args:
        json_path: Path to the tokenizer JSON file.

    Returns:
        set[str]: The filtered tokens. An empty set is returned when the file
        does not exist or contains no ``model -> vocab`` entries, so callers
        always receive the same container type.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # Return an empty set (not a list) so the result type never varies.
        return set()

    vocab_entries = data.get('model', {}).get('vocab', {})

    # str.isalpha() is False for any token containing '#', so "##" word-piece
    # continuations are rejected without a separate prefix check.
    return {w for w in vocab_entries if len(w) > 3 and w.isalpha()}

def find_contextual_synonyms(vocab_set):
    """Report word pairs whose WordNet synset sets overlap enough.

    Each word is looked up with ``wn.synsets`` and represented by the set of
    its synset names. Two words are compared only if they share at least one
    synset; their score is the number of shared synsets divided by the number
    of synsets in the union of both sets.

    Args:
        vocab_set: Iterable collection of words to analyse. Words for which
            WordNet returns no synsets are ignored.

    Returns:
        list[dict]: One entry per unordered pair that reaches ``THRESHOLD``
        and has a non-zero edit distance, with the keys ``word_a``,
        ``word_b``, ``score`` (rounded to two decimals), ``shared_contexts``
        and ``total_contexts``.
    """
    print(f"Building indices for {len(vocab_set)} words...")

    # word -> names of every synset it belongs to
    word_to_synsets = {}
    # synset name -> words that list it (the inverse index)
    synset_to_words = defaultdict(list)

    for token in vocab_set:
        # Synset names are unique strings, which makes them usable as set members
        senses = {synset.name() for synset in wn.synsets(token)}
        if not senses:
            continue
        word_to_synsets[token] = senses
        for sense in senses:
            synset_to_words[sense].append(token)

    print("Calculating overlaps...")

    results = []
    seen_pairs = set()

    for word_a, contexts_a in word_to_synsets.items():
        # Only words sharing at least one synset with word_a can score above 0
        neighbours = set()
        for sense in contexts_a:
            neighbours.update(synset_to_words[sense])
        neighbours.discard(word_a)

        for word_b in neighbours:
            # Order-independent key so (A, B) and (B, A) are scored once
            pair_key = tuple(sorted((word_a, word_b)))
            if pair_key in seen_pairs:
                continue
            seen_pairs.add(pair_key)

            contexts_b = word_to_synsets[word_b]
            shared = len(contexts_a & contexts_b)
            combined = len(contexts_a | contexts_b)
            score = shared / combined

            longest = max(len(word_a), len(word_b))
            edit_d_ratio = editdistance.eval(word_a, word_b) / longest

            if score < THRESHOLD or edit_d_ratio <= 0:
                continue

            results.append({
                "word_a": word_a,
                "word_b": word_b,
                "score": round(score, 2),
                "shared_contexts": shared,
                "total_contexts": combined,
            })

    return results

# --- Execution ---

# Take candidate words from the tokenizer file; when that yields nothing,
# fall back to a small hand-picked set (dummy data for testing).
vocab = extract_vocab("./bert_tokenizer_uncased/tokenizer.json") or {
    "start", "begin", "commence", "car", "automobile", "speech", "talk", "run", "sprint"
}

# Score the candidate pairs and order them best-first
interchangeable_pairs = sorted(
    find_contextual_synonyms(vocab),
    key=lambda x: x['score'],
    reverse=True,
)

print(f"\nFound {len(interchangeable_pairs)} highly interchangeable pairs.\n")
print(f"{'Word A':<15} | {'Word B':<15} | {'Score':<5} | {'Shared Meanings'}")
print("-" * 60)

# Preview only the first 20 rows
preview = interchangeable_pairs[:20]
for p in preview:
    print(f"{p['word_a']:<15} | {p['word_b']:<15} | {p['score']:<5} | {p['shared_contexts']}/{p['total_contexts']}")

In [19]:
# For every word that occurs in a reported pair, record how many pairs it
# takes part in (unique_words) and which words it was paired with
# (unique_words_related). Both sides of a pair are registered, so the
# relation is stored symmetrically.
unique_words = {}
unique_words_related = {}
for pair in interchangeable_pairs:
    left, right = pair['word_a'], pair['word_b']
    for own, partner in ((left, right), (right, left)):
        unique_words[own] = unique_words.get(own, 0) + 1
        unique_words_related.setdefault(own, set()).add(partner)
print(f"Number of unique words in filtered pairs: {len(unique_words)}")


# Most-connected words first; ties keep their first-seen order
sorted_unique_words = sorted(unique_words.items(), key=lambda x: x[1], reverse=True)

Number of unique words in filtered pairs: 4177


In [None]:
# List every word with its pair count and its related words,
# in the order given by sorted_unique_words (highest count first)

header = f"{'Word':<20} | {'Count':<10} | {'Related Words'}"
print(header)
print("-" * 60)
for word, count in sorted_unique_words:
    partners = sorted(unique_words_related[word])
    related_words = ", ".join(partners)
    print(f"{word:<20} | {count:<10} | {related_words}")

In [None]:
# Greedily collapse related words onto a representative.
#
# On each pass the word with the most remaining related words is kept, and
# every word related to it is marked as removed and mapped to the kept word
# (mapping: removed word -> kept word). Counts of the surviving words are
# lowered as their related words disappear. The loop stops once no remaining
# word has any related word left.

mapping = dict()

removed_words = set()
while True:
    # Rank from the live counts at the start of every pass. Reading a ranking
    # left over from an earlier pass or cell could name a word that is no
    # longer in unique_words, and skipping it without advancing would never
    # terminate.
    sorted_unique_words = sorted(unique_words.items(), key=lambda x: x[1], reverse=True)
    if not sorted_unique_words:
        break

    word, count = sorted_unique_words[0]
    if count == 0:
        # The highest count is zero, so nothing is left to collapse
        break

    # The chosen word stays in the vocabulary; take it out of the candidates
    unique_words.pop(word, None)
    related_words = set(unique_words_related[word])

    for related_word in related_words:
        mapping[related_word] = word
        removed_words.add(related_word)
        # Drop the bookkeeping entries of the removed word
        unique_words.pop(related_word, None)
        neighbours = unique_words_related.pop(related_word, ())
        # Relations are recorded in both directions, so only the removed
        # word's own neighbours can still reference it; there is no need to
        # scan every remaining word.
        for word2 in neighbours:
            if word2 in unique_words and related_word in unique_words_related[word2]:
                unique_words_related[word2].remove(related_word)
                unique_words[word2] -= 1


print(f"Number of removed words: {len(removed_words)}")
print(f"Removed words: {', '.join(sorted(removed_words))}")

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer's token -> id table and delete every removed word from it
tokenizer = AutoTokenizer.from_pretrained("./bert_tokenizer_uncased")
model_state = tokenizer.get_vocab()
for dropped in removed_words:
    if dropped in model_state:
        del model_state[dropped]

print(model_state)
# Surviving tokens ordered by their original id
vocab_list = sorted(model_state, key=model_state.get)
print(vocab_list)
print(len(vocab_list), "words in new vocab")

import json
# Write the reduced vocabulary and the removed -> kept mapping as UTF-8 JSON
for out_path, payload in (
    ("filtered_tokenizer_vocab_wordnet.json", vocab_list),
    ("removed_words_mapping_wordnet.json", mapping),
):
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)