In [14]:
import torch
from transformers import BertTokenizer, BertModel
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

text1 = "Electric vehicles are becoming increasingly popular. They help reduce greenhouse gas emissions and air pollution. Many governments offer incentives to promote the adoption of electric cars. Charging infrastructure is rapidly expanding in urban areas. The future of transportation seems to be electric."
text2 = "Renewable energy sources are gaining traction worldwide. Solar and wind power are becoming more cost-effective and efficient. Governments are implementing policies to encourage the use of clean energy. Innovations in energy storage, such as advanced batteries, facilitate the adoption of renewables. The shift towards sustainable energy is gaining momentum."

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def insert_sep_token(text):
    sentences = sent_tokenize(text)
    text_with_sep = ' [SEP] '.join(sentences)
    return text_with_sep

def bert_embed_text(text):
    marked_text = "[CLS] " + insert_sep_token(text)
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Create segment ids
    segments_ids = []
    current_segment_id = 0
    for value in tokenized_text:
        segments_ids.append(current_segment_id)
        if value == "[SEP]":
            current_segment_id = 1 - current_segment_id

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
    model.eval()

    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding


[nltk_data] Downloading package punkt to /Users/florian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([-7.0695e-01,  2.2960e-02,  1.6947e-01, -6.0236e-01, -2.8968e-02,
         3.3720e-01,  1.9686e-01,  8.1760e-01,  1.6311e-01, -3.6833e-01,
         5.5707e-03,  7.7598e-02,  2.2679e-01,  1.0565e-01, -2.3633e-02,
         4.5737e-01,  2.5490e-01,  8.4916e-02, -2.3445e-01, -3.2580e-01,
         9.2524e-01,  1.7235e-01, -5.1883e-01,  5.2436e-02,  4.1851e-01,
        -1.3476e-02,  1.5213e-01, -3.7569e-01, -5.8883e-01,  1.3974e-01,
         3.7349e-01,  2.4171e-01, -5.7730e-02,  9.3303e-02,  5.5190e-02,
        -4.8736e-01,  8.0719e-01, -5.8010e-01, -3.8817e-01,  4.6544e-01,
        -6.6057e-01,  5.8922e-02,  3.5083e-01, -6.4317e-01, -2.7222e-01,
        -3.6182e-01,  2.7176e-01, -6.2038e-01, -1.3663e-01, -2.8860e-01,
        -1.2310e+00, -1.8025e-02, -1.0962e-01, -7.4949e-02, -2.9828e-01,
         2.0283e-01, -4.9564e-02, -3.4626e-01, -4.4542e-01, -2.6465e-02,
         3.6474e-02, -6.7238e-01,  3.2589e-02, -3.2047e-03,  2.7904e-01,
        -4.3119e-01,  7.2844e-01,  6.0596e-01, -1.4

In [None]:

model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():
    outputs = model(tokens_tensor, segments_tensors)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

# `token_vecs` is a tensor with shape [22 x 768]
token_vecs = hidden_states[-2][0]

# Calculate the average of all 22 token vectors.
sentence_embedding2 = torch.mean(token_vecs, dim=0)
sentence_embedding2

In [4]:
example_text1 = "Electric vehicles are becoming increasingly popular. They help reduce greenhouse gas emissions and air pollution. Many governments offer incentives to promote the adoption of electric cars. Charging infrastructure is rapidly expanding in urban areas. The future of transportation seems to be electric."
example_text2 = "Renewable energy sources are gaining traction worldwide. Solar and wind power are becoming more cost-effective and efficient. Governments are implementing policies to encourage the use of clean energy. Innovations in energy storage, such as advanced batteries, facilitate the adoption of renewables. The shift towards sustainable energy is gaining momentum."
example_texts = [example_text1, example_text2]

In [5]:
import torch
from torch.nn.functional import cosine_similarity
from transformers import BertTokenizer, BertModel
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

def insert_sep_token(text):
    sentences = sent_tokenize(text)
    text_with_sep = ' [SEP] '.join(sentences)
    return text_with_sep

processed_texts = []
# Preprocessing
for text in example_texts:
    processed_text = insert_sep_token(text)
    processed_text = "[CLS] " + processed_text
    processed_texts.append(processed_text)

inputs = tokenizer(processed_texts, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

hidden_states = outputs.hidden_states
last_hidden_state = hidden_states[-1]  # shape: (batch_size, sequence_length, hidden_size)

# Calculate the mean of the last hidden layer for each sentence
embeddings = last_hidden_state.mean(dim=1)

# Extract the embeddings for Text 1 and Text 2
text1_embedding = embeddings[:len(text1.split('. ')), :]
text2_embedding = embeddings[len(text1.split('. ')):, :]

# Calculate the mean embeddings for Text 1 and Text 2
text1_mean_embedding = text1_embedding.mean(dim=0)
text2_mean_embedding = text2_embedding.mean(dim=0)

# Calculate the cosine similarity between the mean embeddings of Text 1 and Text 2
cosine_sim = cosine_similarity(text1_mean_embedding.unsqueeze(0), text2_mean_embedding.unsqueeze(0))

print(f"Cosine similarity between Text 1 and Text 2: {cosine_sim.item():.4f}")

[nltk_data] Downloading package punkt to /Users/florian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'text1' is not defined