In [74]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#### The below needs to be made more solid by using spacy (or sth comparable) to define sentences

In [81]:
text1 = "Electric vehicles are becoming increasingly popular. They help reduce greenhouse gas emissions and air pollution. Many governments offer incentives to promote the adoption of electric cars. Charging infrastructure is rapidly expanding in urban areas. The future of transportation seems to be electric."
text2 = "Renewable energy sources are gaining traction worldwide. Solar and wind power are becoming more cost-effective and efficient. Governments are implementing policies to encourage the use of clean energy. Innovations in energy storage, such as advanced batteries, facilitate the adoption of renewables. The shift towards sustainable energy is gaining momentum."
marked_text = text2.replace('.', '. [SEP]')
marked_text = "[CLS] " + marked_text

# Tokenize our sentence with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(marked_text)

# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

sublist_lengths = []
current_length = 0
for value in tokenized_text:
    current_length += 1
    if "[SEP]" in value:
        sublist_lengths.append(current_length)
        current_length = 0

segments_ids = []
for i in range(len(sublist_lengths)):
    segments_ids += [i] * sublist_lengths[i]
    
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():

    outputs = model(tokens_tensor, segments_tensors)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

# `hidden_states` has shape [13 x 1 x 22 x 768]

# `token_vecs` is a tensor with shape [22 x 768]
token_vecs = hidden_states[-2][0]

# Calculate the average of all 22 token vectors.
sentence_embedding2 = torch.mean(token_vecs, dim=0)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [87]:
segments_tensors

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]])

In [64]:
from scipy.spatial.distance import cosine

1 - cosine(sentence_embedding1, sentence_embedding2)

nan

In [88]:
import torch
from torch.nn.functional import cosine_similarity
from transformers import BertTokenizer, BertModel
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

text1 = "Electric vehicles are becoming increasingly popular. They help reduce greenhouse gas emissions and air pollution. Many governments offer incentives to promote the adoption of electric cars. Charging infrastructure is rapidly expanding in urban areas. The future of transportation seems to be electric."
text2 = "Renewable energy sources are gaining traction worldwide. Solar and wind power are becoming more cost-effective and efficient. Governments are implementing policies to encourage the use of clean energy. Innovations in energy storage, such as advanced batteries, facilitate the adoption of renewables. The shift towards sustainable energy is gaining momentum."
texts = [text1, text2]

processed_texts = []
# Preprocessing
for text in texts:
    processed_texts
    processed_texts.append(processed_text)

input_sentences = [sentence.strip() for sentence in input_sentences if sentence.strip()]

inputs = tokenizer(input_sentences, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

hidden_states = outputs.hidden_states
last_hidden_state = hidden_states[-1]  # shape: (batch_size, sequence_length, hidden_size)

# Calculate the mean of the last hidden layer for each sentence
embeddings = last_hidden_state.mean(dim=1)

# Extract the embeddings for Text 1 and Text 2
text1_embedding = embeddings[:len(text1.split('. ')), :]
text2_embedding = embeddings[len(text1.split('. ')):, :]

# Calculate the mean embeddings for Text 1 and Text 2
text1_mean_embedding = text1_embedding.mean(dim=0)
text2_mean_embedding = text2_embedding.mean(dim=0)

# Calculate the cosine similarity between the mean embeddings of Text 1 and Text 2
cosine_sim = cosine_similarity(text1_mean_embedding.unsqueeze(0), text2_mean_embedding.unsqueeze(0))

print(f"Cosine similarity between Text 1 and Text 2: {cosine_sim.item():.4f}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Cosine similarity between Text 1 and Text 2: 0.9161
