In [1]:
from transformers import BertModel, BertTokenizer
import torch

# Load a pre-trained BERT checkpoint and its matching tokenizer.
# Both objects are created by name through `from_pretrained`; the cell output
# below shows the checkpoint files being downloaded (model.safetensors is
# reported there as 1.34G), so this cell can take a while on first run.
bert_model = 'bert-large-uncased'  # checkpoint name (a string), not a model object
tokenizer = BertTokenizer.from_pretrained(bert_model)
model = BertModel.from_pretrained(bert_model)

# function to vectorize a word and find its nearest vocabulary neighbours
def find_synonyms(word, top_k=10):
    """Return the vocabulary tokens whose embeddings are closest to ``word``.

    The word is split into word-piece tokens with the module-level
    ``tokenizer``; the vectors of those pieces are looked up in the input
    embedding table of the module-level ``model`` and averaged into a single
    vector. Every row of that same table is then ranked by cosine similarity
    to the averaged vector, so the ranked indices are genuine token ids that
    can be converted back to tokens.

    Args:
        word: The word (or short phrase) to find neighbours for.
        top_k: Maximum number of neighbours to return. Defaults to 10.

    Returns:
        A list of at most ``top_k`` token strings, most similar first. The
        word's own word-piece tokens and vocabulary entries that are not
        purely alphabetic (special tokens, ``##`` sub-word pieces,
        punctuation, digits) are skipped.

    Raises:
        ValueError: If ``word`` produces no tokens (e.g. an empty string).
    """
    # tokenize the input word and map the pieces to vocabulary ids
    pieces = tokenizer.tokenize(word)
    if not pieces:
        raise ValueError(f'cannot tokenize {word!r}: no tokens produced')
    piece_ids = tokenizer.convert_tokens_to_ids(pieces)
    own_ids = set(piece_ids)

    # no gradients are needed for a pure lookup / similarity computation
    with torch.no_grad():
        # one embedding row per vocabulary entry
        vocab_vectors = model.get_input_embeddings().weight
        # average the word's piece vectors into a single query vector
        word_vector = vocab_vectors[piece_ids].mean(dim=0)
        # cosine similarity of the query against every vocabulary row;
        # the (1, hidden) query broadcasts against (vocab, hidden)
        similarities = torch.nn.functional.cosine_similarity(
            word_vector.unsqueeze(0), vocab_vectors, dim=1
        )
        # token ids ordered from most to least similar
        ranked_ids = torch.argsort(similarities, descending=True).tolist()

    # walk the ranking, skipping the input word itself and non-word entries,
    # until enough neighbours have been collected
    synonyms = []
    for token_id in ranked_ids:
        if len(synonyms) >= top_k:
            break
        if token_id in own_ids:
            continue
        candidate = tokenizer.convert_ids_to_tokens([token_id])[0]
        if candidate.isalpha():
            synonyms.append(candidate)

    return synonyms

# Word to find the synonyms for; it is passed as-is to find_synonyms.
word = 'Happy'
# Run the lookup defined above and keep the returned list of tokens.
synonyms = find_synonyms(word)
# Report the result next to the original query word.
print(f'Synonyms for {word}: {synonyms}')


  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)solve/main/vocab.txt: 100%|██████████████████████████████████████████| 232k/232k [00:00<00:00, 1.47MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)okenizer_config.json: 100%|██████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 28.1kB/s]
Downloading (…)lve/main/config.json: 100%|█████████████████████████████████████████████| 571/571 [00:00<00:00, 115kB/s]
Downloading model.safetensors:  12%|█████▍                                         | 157M/1.34G [00:13<01:44, 11.4MB/s]

KeyboardInterrupt: 

In [10]:
from transformers import pipeline

# Word to find similar words for, and the masked prompt built around it.
synonym_word = "soldier"
textstr = "Similar words to " + synonym_word + " are: [MASK]."

# Create a pipeline for masked language modeling using BERT.
# NOTE: this rebinds the module-level name `model`, which the first cell uses
# for the BertModel that find_synonyms() reads; re-run that cell before
# calling find_synonyms() again.
model = pipeline('fill-mask', model='bert-base-uncased')

# Get the 50 highest-scoring fillers for the [MASK] position.
synonym_prediction = model(textstr, top_k=50)

# Trivial variants of the query word that should not be reported as "similar":
# the word itself, its plural with a trailing 's', and the capitalized forms.
withS = synonym_word + 's'
withCapital = synonym_word.capitalize()
withBoth = withS.capitalize()
excluded_variants = {synonym_word, withS, withCapital, withBoth}

# Keep alphabetic tokens only and drop the trivial variants in one pass.
# A new list is built instead of calling .remove() while iterating, because
# removing from a list during iteration skips the element after each removal.
filtered_words_list = [
    prediction['token_str']
    for prediction in synonym_prediction
    if prediction['token_str'].isalpha()
    and prediction['token_str'] not in excluded_variants
]

# The list of words.
print(filtered_words_list)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['man', 'sergeant', 'officer', 'army', 'commander', 'sgt', 'warrior', 'war', 'infantry', 'captain', 'military', 'no', 'guard', 'general', 'unknown', 'enemy', 'gunner', 'a', 'corporal', 'horse', 'field', 'civilian', 'regiment', 'colonel', 'i', 'private', 'leader', 'coward', 'lieutenant', 'cavalry', 'one', 'gun', 'brig', 'citizen', 'sniper', 'hero', 'hunter', 'defender', 'farmer', 'the', 'agent', 'sir', 'unit']
