In [130]:
from pytorch_transformers import BertModel
from pytorch_transformers import BertTokenizer
from sentence_transformers import SentenceTransformer

import numpy as np
import torch

model_siamese = SentenceTransformer('bert-base-nli-mean-tokens')
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [170]:
def obtain_sentence_embedding(model, tokenizer, input_sentence):
    input_sentence = torch.tensor(
        tokenizer.encode(f"[CLS] {input_sentence}")
    ).unsqueeze(0)
    last_hidden_state, pooler_output = model(input_sentence)
    last_hidden_state = last_hidden_state.squeeze()
    cls_embedding = last_hidden_state[0].view(1, -1)

    return cls_embedding


def obtain_sentence_embeddings(model, tokenizer, input_sentences):
    cls_embeddings = torch.cat([
        obtain_sentence_embedding(model, tokenizer, s) for s in input_sentences
    ])

    return cls_embeddings


def obtain_sentence_embeddings_siamese(model, input_sentences):
    encoded_sentences = np.stack(model.encode(sentences))
    encoded_sentences = torch.tensor(encoded_sentences)

    return encoded_sentences


def euclidean_distance(vector_1, vector_2):
    dist = torch.sqrt(torch.sum(vector_1 - vector_2)**2)
    return dist


In [169]:
input_sentences = [
    'hello world my name is bob', 
    'hello world my name is doug',
    'i am jiu-jitsu god amongst men',
    'i am a jiu-jitsu god amongst men'
]

encoded_sentences = obtain_sentence_embeddings(model, tokenizer, input_sentences)

print(euclidean_distance(encoded_sentences[0], encoded_sentences[1]))
print(euclidean_distance(encoded_sentences[0], encoded_sentences[2]))
print(euclidean_distance(encoded_sentences[0], encoded_sentences[3]))
print(euclidean_distance(encoded_sentences[2], encoded_sentences[3]))

tensor(0.0022, grad_fn=<SqrtBackward>)
tensor(0.8775, grad_fn=<SqrtBackward>)
tensor(0.8285, grad_fn=<SqrtBackward>)
tensor(0.0490, grad_fn=<SqrtBackward>)
