In [11]:
from transformers import BertModel, BertTokenizer
import torch

# load a pre-trained BERT model and tokenizer
bert_model = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model)
model = BertModel.from_pretrained(bert_model)

# function to vectorize words and find synonyms
def find_synonyms(word):
    # tokenize the input word
    token = tokenizer.tokenize(word)
    
    # convert tokenized input to token ids
    input_id = tokenizer.convert_tokens_to_ids(token)
    # convert token ids to pytorch tensor and wrap in a list
    input_id = torch.tensor([input_id])
    print('input id:', input_id)
    
    # generate word embeddings
    # wrap the code block in a torch.no_grad() context to disable gradient calculation for faster computation
    with torch.no_grad():
        # pass the input_ids tensor to the BERT model and get the output
        output = model(input_id)
        print('outputs:', output)
        # compute the mean of the last hidden state for each token and squeeze the tensor to remove the extra dimension
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)
        print('embeddings:', embedding)
    
    # find synonyms using cosine similarity
    # calculate the cosine similarity between embeddings of the input word and all other words
    similarities = torch.nn.functional.cosine_similarity(embedding.unsqueeze(0), embedding.unsqueeze(0))
    print(similarities)
    # sort the similarities in descending order and get the indices of the sorted similarities
    sorted_indices = torch.argsort(similarities, descending=True)
    print(sorted_indices)
    # top 10 indices of the sorted similarities excluding the input word itself
    top_indices = sorted_indices[1:11]
    print(top_indices)
    # convert the indices to tokens and get the synonyms
    synonyms = [tokenizer.convert_ids_to_tokens([index.item()])[0] for index in top_indices]
    print(synonyms)
    
    return synonyms

# word to find the synonyms for
word = 'money'
synonyms = find_synonyms(word)
print(f'Synonyms for {word}: {synonyms}')


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


input id: tensor([[2769]])
outputs: BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-1.2313, -0.3789,  0.0102,  ..., -0.9358,  0.0685,  0.3635]]]), pooler_output=tensor([[ 0.9827,  0.2063,  0.6847,  ..., -0.9192,  0.0569, -0.8858]]), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)
embeddings: tensor([ 4.7020e-01,  2.6461e-01, -2.1432e-01, -5.2927e-02,  3.0778e-01,
        -2.3462e-02,  2.2461e-01, -1.9313e-01,  3.3127e-02, -3.2388e-01,
        -1.1657e-02, -6.6924e-02, -5.0658e-02,  3.2662e-01, -3.0206e-01,
        -1.0265e-01, -9.0821e-02,  3.2740e-01,  9.3796e-02,  1.9723e-01,
         1.1347e-02, -1.7763e-01, -8.4163e-03, -6.0926e-02, -7.6311e-03,
         3.2154e-01, -5.1560e-02,  6.0643e-02, -1.9824e-01, -2.5021e-02,
        -1.6107e-02, -4.2907e-01,  4.1345e-02,  3.6247e-01,  2.1230e-01,
        -1.1116e-01,  2.4410e-01, -2.7897e-01, -1.9281e-01, -1.2328e-01,
         1.3469e-01,  8.1929e-02,  2.0420e-01, -6.4993e-02,  1.

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Synonyms for 'money': []


In [7]:
from transformers import pipeline

#define the word you want similar words for and the text string
synonym_word = "building"
textstr = "Similar words to " + synonym_word + " are: [MASK]."

#create a pipeline for masked language modeling using BERT
model = pipeline('fill-mask', model='bert-base-uncased')

#get synonym predictions for the masked text
synonym_prediction = model(textstr, top_k=10)

#initialize a list to store the filtered words
filtered_words_list = []




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[]
