In [1]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Function to calculate cosine similarity
def get_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two 1-D embedding vectors.

    scipy's ``cosine`` yields the cosine *distance* (1 - similarity), so the
    distance is converted back into a similarity before it is returned.
    """
    distance = cosine(embedding1, embedding2)
    return 1 - distance

# Load the pretrained sentence-embedding model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Two closely related sentences to compare
sentences = [
    "This is an example sentence",
    "This is also an example sentence.",
]

# Encode both sentences in one call and unpack one vector per sentence
embedding1, embedding2 = model.encode(sentences)

In [2]:
# Score the two example sentences against each other and report the result
similarity = get_cosine_similarity(embedding1=embedding1, embedding2=embedding2)
print("Cosine Similarity:", similarity)


Cosine Similarity: 0.8140871524810791


In [3]:
# Same wording apart from one verb, but opposite sentiment
sentences = [
    "I like this",
    "I hate this",
]

# Encode both sentences, one vector each
embedding1, embedding2 = model.encode(sentences)

# Compare the pair and report the score
similarity = get_cosine_similarity(embedding1=embedding1, embedding2=embedding2)
print("Cosine Similarity:", similarity)


Cosine Similarity: 0.42858538031578064


In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
import torch

def get_word_embedding(sentence, word, model, tokenizer, model_type='bert'):
    """Return the contextual embedding of ``word`` as it occurs in ``sentence``.

    The sentence is tokenized and passed through ``model``; the result is the
    row of the model's last hidden state at the position where the word's
    token sequence first appears in the tokenized sentence. When the word is
    split into several subword tokens, only the first subword's vector is
    returned.

    Args:
        sentence: Text that contains the word.
        word: Word to look up. It is tokenized on its own and matched against
            the sentence tokens, so it must tokenize the same way in both
            places (the notebook passes " bank" with a leading space for GPT-2).
        model: Callable model whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.
        model_type: Either 'bert' or 'gpt'. Both are read the same way; the
            value is only validated.

    Returns:
        The 1-D hidden-state tensor for the matched token position.

    Raises:
        ValueError: If ``model_type`` is not 'bert' or 'gpt', if ``word``
            produces no tokens, or if its tokens are not found in the sentence.
    """
    # Validate up front so a bad argument fails before the forward pass.
    if model_type not in ('bert', 'gpt'):
        raise ValueError("Invalid model type specified. Choose 'bert' or 'gpt'.")

    # Tokenize and encode the sentence
    encoded_input = tokenizer(sentence, return_tensors='pt')
    tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])

    # An empty token list would trivially "match" at position 0 and silently
    # return the first token's embedding, so reject it explicitly.
    word_tokens = tokenizer.tokenize(word)
    if not word_tokens:
        raise ValueError(f"Word '{word}' produced no tokens.")

    # Locate the first position where the word's (sub)tokens appear in order
    span = len(word_tokens)
    word_index = next(
        (i for i in range(len(tokens) - span + 1) if tokens[i:i + span] == word_tokens),
        None,
    )
    if word_index is None:
        raise ValueError(f"Word '{word}' not found in the tokenized sentence.")

    # Inference only: no gradients needed
    with torch.no_grad():
        output = model(**encoded_input)

    # Batch item 0, matched token position, all hidden dimensions
    return output.last_hidden_state[0, word_index, :]


In [5]:
# Load the pre-trained tokenizer/model pairs used in the rest of the notebook

# BERT (uncased base checkpoint)
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased")

# GPT-2 (base checkpoint)
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt_model = AutoModel.from_pretrained("gpt2")

In [6]:
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
import torch

def get_initial_word_embedding(word, tokenizer, model, model_type='bert'):
    """Return the embedding-layer (pre-transformer) vector for a single-token word.

    Args:
        word: Word to embed; it must map to exactly one token.
        tokenizer: Tokenizer matching ``model``.
        model: Model exposing ``embeddings`` (used for 'bert') or ``wte``
            (used for 'gpt').
        model_type: Either 'bert' or 'gpt'.

    Returns:
        The 1-D embedding tensor at the word's token position.

    Raises:
        ValueError: If ``model_type`` is not 'bert' or 'gpt', if the word is
            split into several subword tokens, or if it produces no tokens.
    """
    # Per-model layout of the encoded word:
    #   bert: [CLS] word [SEP] -> word at index 1, 3 tokens in total
    #   gpt:  word             -> word at index 0, 1 token in total
    # NOTE(review): assumes the GPT-2 tokenizer adds no special tokens, as the
    # original index-0 lookup already did — confirm for custom tokenizers.
    if model_type == 'bert':
        token_index, expected_len = 1, 3
    elif model_type == 'gpt':
        token_index, expected_len = 0, 1
    else:
        raise ValueError("Invalid model type specified. Choose 'bert' or 'gpt'.")

    # Tokenize and encode the word
    encoded_input = tokenizer(word, return_tensors='pt')
    input_ids = encoded_input['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    # More tokens than expected means the word was split into subwords. A
    # single fixed threshold of 3 would let 2- or 3-piece GPT words through.
    if len(tokens) > expected_len:
        raise ValueError("The word was split into subwords. Please provide a single token.")
    # Fewer tokens than expected means there is no word token to read.
    if len(tokens) < expected_len:
        raise ValueError(f"Word '{word}' produced no tokens.")

    # Inference only: no gradients needed
    with torch.no_grad():
        if model_type == 'bert':
            # NOTE(review): this is the output of the model's whole
            # ``embeddings`` module, which may include more than the bare
            # token lookup (e.g. positional terms) — confirm this is intended.
            embeddings = model.embeddings(input_ids)[0, token_index, :]
        else:
            # For GPT-2, apply the token embedding table directly
            embeddings = model.wte(input_ids)[0, token_index, :]

    return embeddings
    
# Word whose context-free (embedding-layer) vectors are inspected
word = "river"

# BERT: embedding-layer vector, first five components
bert_initial_embedding = get_initial_word_embedding(
    word, bert_tokenizer, bert_model, model_type="bert"
)
print("BERT Initial Embedding for '{}':".format(word), bert_initial_embedding[:5])

# GPT-2: embedding-layer vector, first five components
gpt_initial_embedding = get_initial_word_embedding(
    word, gpt_tokenizer, gpt_model, model_type="gpt"
)
print("GPT-2 Initial Embedding for '{}':".format(word), gpt_initial_embedding[:5])

BERT Initial Embedding for 'river': tensor([ 0.1578,  0.4036, -0.3913,  0.1713, -0.1797])
GPT-2 Initial Embedding for 'river': tensor([0.0582, 0.0548, 0.2876, 0.0670, 0.1042])


# Starting with BERT

In [7]:
# "bank" in the riverside sense
sentence = "I went to the river bank for a nice walk."
word = "bank"

# Contextual BERT embedding of "bank" in this sentence (first five components shown)
embedding = get_word_embedding(
    sentence=sentence, word=word, model=bert_model, tokenizer=bert_tokenizer
)
print("Embedding for '{}':".format(word), embedding[:5])

Embedding for 'bank': tensor([ 0.2764, -0.4860,  0.2104, -0.3106, -0.0630])


In [8]:
# Reference vectors: BERT embedding-layer vectors for "water" and "money",
# used below as comparison points for the contextual "bank" embedding
water_embedding = get_initial_word_embedding(
    "water", bert_tokenizer, bert_model, model_type="bert"
)
money_embedding = get_initial_word_embedding(
    "money", bert_tokenizer, bert_model, model_type="bert"
)

In [9]:
# Compare the contextual "bank" vector with each reference vector
print(
    "Cosine Similarity to water:",
    get_cosine_similarity(embedding1=embedding, embedding2=water_embedding),
)
print(
    "Cosine Similarity to money:",
    get_cosine_similarity(embedding1=embedding, embedding2=money_embedding),
)


Cosine Similarity to water: 0.06027546897530556
Cosine Similarity to money: 0.004379441495984793


In [10]:
# "bank" in the financial sense
sentence = "I went to the bank to get some cash out of savings."
word = "bank"

# Same word, different context: the contextual embedding changes
embedding = get_word_embedding(
    sentence=sentence, word=word, model=bert_model, tokenizer=bert_tokenizer
)
print("Embedding for '{}':".format(word), embedding[:5])

Embedding for 'bank': tensor([ 0.7613, -0.3984, -0.1457, -0.1107,  1.2720])


In [11]:
# Compare the financial-sense "bank" vector with each reference vector
print(
    "Cosine Similarity to water:",
    get_cosine_similarity(embedding1=embedding, embedding2=water_embedding),
)
print(
    "Cosine Similarity to money:",
    get_cosine_similarity(embedding1=embedding, embedding2=money_embedding),
)

# Compared with the river-bank sentence, similarity to "water" went down
# and similarity to "money" went up

Cosine Similarity to water: -0.0015438803238794208
Cosine Similarity to money: 0.061874888837337494


# Now with GPT

In [None]:
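# For GPT-2, a token's embedding depends only on the tokens that come before it:
# attention is causal (left-to-right), so the words after a token do not affect it.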

In [12]:
sentence = "I went to the bank to get some cash"
word = " bank"  # GPT-2 tokens carry their leading space, so include it for the lookup

# Get embedding; model_type='gpt' is passed explicitly so the declared model
# type matches the GPT-2 model instead of falling back to the 'bert' default
embedding = get_word_embedding(sentence, word, gpt_model, gpt_tokenizer, model_type='gpt')
print("Embedding for '{}':".format(word), embedding[:5])

Embedding for ' bank': tensor([-0.1299, -0.3162, -1.0468,  0.1864,  0.2709])


In [13]:
sentence = "I went to the bank of the river"
word = " bank"  # GPT-2 tokens carry their leading space, so include it for the lookup

# model_type='gpt' is passed explicitly so the declared model type matches
# the GPT-2 model instead of falling back to the 'bert' default
embedding = get_word_embedding(sentence, word, gpt_model, gpt_tokenizer, model_type='gpt')
print("Embedding for '{}':".format(word), embedding[:5])

Embedding for ' bank': tensor([-0.1299, -0.3162, -1.0468,  0.1864,  0.2709])


In [None]:
# The embedding for " bank" is identical in both sentences: the text before " bank"
# is the same, and GPT-2 does not look at the words that follow it.
