In [4]:
from transformers import BertModel, BertTokenizer
import torch

# load a pre-trained BERT model and tokenizer
bert_model = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model)
model = BertModel.from_pretrained(bert_model)

# function to vectorize words and find synonyms
def find_synonyms(word):
    # tokenize the input word
    token = tokenizer.tokenize(word)
    
    # convert tokenized input to token ids
    input_id = tokenizer.convert_tokens_to_ids(token)
    # convert token ids to pytorch tensor and wrap in a list
    input_id = torch.tensor([input_id])
    print('input id:', input_id)
    
    # generate word embeddings
    # wrap the code block in a torch.no_grad() context to disable gradient calculation for faster computation
    with torch.no_grad():
        # pass the input_ids tensor to the BERT model and get the output
        output = model(input_id)
        print('outputs:', output)
        # compute the mean of the last hidden state for each token and squeeze the tensor to remove the extra dimension
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)
        print('embeddings:', embedding)
    
    # find synonyms using cosine similarity
    # calculate the cosine similarity between embeddings of the input word and all other words
    similarities = torch.nn.functional.cosine_similarity(embedding.unsqueeze(0), embedding.unsqueeze(0))
    print(similarities)
    # sort the similarities in descending order and get the indices of the sorted similarities
    sorted_indices = torch.argsort(similarities, descending=True)
    print(sorted_indices)
    # top 10 indices of the sorted similarities excluding the input word itself
    top_indices = sorted_indices[1:11]
    print(top_indices)
    # convert the indices to tokens and get the synonyms
    synonyms = [tokenizer.convert_ids_to_tokens([index.item()])[0] for index in top_indices]
    print(synonyms)
    
    return synonyms

# word to find the synonyms for
word = 'Happy'
synonyms = find_synonyms(word)
print(f'Synonyms for {word}: {synonyms}')


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


input id: tensor([[3407]])
outputs: BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.8099, -0.0580,  0.2824,  ...,  0.1177, -0.1399,  0.5079]]]), pooler_output=tensor([[ 0.1006,  0.1972, -0.3871,  ..., -0.0026,  0.1813, -0.2090]]), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)
embeddings: tensor([-1.5501e-01,  9.7101e-02, -2.0039e-02, -2.7376e-01, -2.3086e-01,
        -2.1555e-01,  2.6285e-01,  1.0263e-01, -1.4888e-01, -2.3009e-01,
        -2.3984e-01, -1.4952e-01,  1.1697e-01,  6.5679e-02, -3.2519e-01,
        -3.1836e-01, -1.0093e-01, -2.4247e-02,  1.9320e-02,  3.0708e-01,
        -7.2634e-02, -6.8835e-02,  9.6553e-02,  1.6382e-01,  1.1619e-01,
        -4.0722e-02, -1.9064e-01,  1.0297e-01, -1.9433e-01, -2.1519e-01,
        -2.0154e-01, -5.6326e-02,  8.4450e-02,  1.2890e-01, -2.6629e-01,
        -7.3395e-02,  6.7195e-02, -5.9494e-02, -4.9576e-01,  2.6328e-01,
        -1.9147e-01, -2.6232e-01, -7.8828e-03,  6.6626e-02, -9.

In [34]:
from transformers import pipeline

#define the word you want similar words for and the text string
synonym_word = "building"
textstr = "Similar words to " + synonym_word + " are: [MASK]."

#create a pipeline for masked language modeling using BERT
model = pipeline('fill-mask', model='bert-base-uncased')

#get synonym predictions for the masked text
synonym_prediction = model(textstr, top_k=10)

#initialize a list to store the filtered words
filtered_words_list = []

#iterate through the synonym predictions
for x in synonym_prediction:
    token_str = x['token_str']

#filter out non-alphabetic tokens
    if token_str.isalpha():
        filtered_words_list.append(token_str)

#print the filtered words
print(filtered_words_list)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['building', 'construction', 'tower', 'architecture', 'structure', 'house', 'office', 'housing', 'hall']
