In [1]:
from simcse import SimCSE

import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer

## Using with SimCSE

In [2]:
# Instantiate the model
# Builds the SimCSE wrapper from the
# "princeton-nlp/sup-simcse-bert-base-uncased" checkpoint identifier and binds
# it to `model`; the next cells call `model.encode(...)` and
# `model.similarity(...)` on this object.
# NOTE(review): presumably downloads and caches the weights on first use —
# confirm before running offline.
# NOTE: the Hugging Face section further down assigns to `model` again, so
# re-run this cell before re-running the SimCSE cells out of order.
model = SimCSE("princeton-nlp/sup-simcse-bert-base-uncased")

In [None]:
# Get embeddings
# Encodes one sentence with the SimCSE wrapper and stores the result in
# `embeddings`.
# NOTE(review): presumably a single embedding vector for the single input
# string — confirm the return type/shape against the simcse docs.
# NOTE: `embeddings` is reassigned in the Hugging Face section below.
embeddings = model.encode('Attending PyData Global is awesome!')

In [None]:
# Compute similarities
# Compare one reference sentence against two candidates in turn and print the
# score SimCSE reports for each pair. After the loop, `sentences_b` and
# `similarities` hold the values from the last comparison.
sentences_a = ['Attending PyData Global is awesome!']
for sentences_b in (
    ['Jenny is hungry :('],
    ['Python conferences are great!'],
):
    similarities = model.similarity(sentences_a, sentences_b)
    print(similarities)

## Using with Hugging Face

In [None]:
# Load the tokenizer and the encoder weights for the SimCSE checkpoint
# directly through Hugging Face Transformers.
# NOTE: this rebinds `model`, replacing the SimCSE wrapper created earlier;
# re-run the instantiation cell before re-running the SimCSE cells.
HF_CHECKPOINT = "princeton-nlp/sup-simcse-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(HF_CHECKPOINT)
model = AutoModel.from_pretrained(HF_CHECKPOINT)

# Tokenize the inputs
texts = [
    'I ate an apple',
    'Jane ate an apple',
    'Python conferences are great!'
]
# padding/truncation let sentences of different lengths share one batch;
# return_tensors="pt" asks for PyTorch tensors.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get the embeddings
# Inference only, so gradient tracking is disabled. Only `pooler_output` is
# used, so the per-layer hidden states are not requested.
with torch.no_grad():
    embeddings = model(**inputs, return_dict=True).pooler_output

In [None]:
# Cosine similarity = 1 - cosine distance; a higher value means the two
# sentences are more similar. The first text is the anchor that the other two
# are compared against.
anchor_embedding = embeddings[0]
cosine_sim_0_1 = 1 - cosine(anchor_embedding, embeddings[1])
cosine_sim_0_2 = 1 - cosine(anchor_embedding, embeddings[2])

# One report line per (other text index, score) pair.
report_template = "Cosine similarity between \"%s\" and \"%s\" is: %.3f"
for other_idx, score in ((1, cosine_sim_0_1), (2, cosine_sim_0_2)):
    print(report_template % (texts[0], texts[other_idx], score))