In [3]:
from simcse import SimCSE

import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer

# BERT's Achilles' heel? 
#### Applying contrastive learning to fight anisotropy in language models.

PyData Global 2022, 2022-12-02


#### Abstract
Transformer models have become the state of the art in natural language processing. Word representations learned by these models offer great flexibility for many types of downstream tasks from classification to summarization. Nonetheless, these representations suffer from certain conditions that impair their effectiveness. Researchers have demonstrated that BERT and GPT embeddings tend to cluster in a narrow cone of the embedding space which leads to unwanted consequences (e.g. spurious similarities between unrelated words). During the talk we’ll introduce SimCSE – a contrastive learning method that helps to regularize the embeddings and reduce the problem of anisotropy. We will demonstrate how SimCSE can be implemented in Python.

#### Installation

To run the notebook, create and activate a **Conda** environment using the `simcse.yml` file.


To install SimCSE only:

`pip install simcse`

## Using with SimCSE

In [2]:
# Instantiate the model: build a SimCSE wrapper from the
# `princeton-nlp/sup-simcse-bert-base-uncased` checkpoint identifier.
# The resulting `model` is used by the following cells for `encode` and `similarity`.
model = SimCSE("princeton-nlp/sup-simcse-bert-base-uncased")

In [4]:
# Get embeddings: encode a single sentence with the SimCSE model and keep the
# result in `embeddings`.
embeddings = model.encode('Attending PyData Global is awesome!')

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.12s/it]


In [5]:
# Compute similarities: score one anchor sentence against two candidates,
# one call to `model.similarity` per pair, printing each result as it is computed.
sentences_a = ['Attending PyData Global is awesome!']
for candidate in ('Jenny is hungry :(', 'Python conferences are great!'):
    sentences_b = [candidate]
    similarities = model.similarity(sentences_a, sentences_b)
    print(similarities)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.00it/s]


[[0.14462978]]


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.42it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.00it/s]

[[0.5483091]]





## Using with Huggingface

In [6]:
# Load the tokenizer and the encoder from the same SimCSE checkpoint so the
# vocabulary and the weights cannot drift apart.
# NOTE: this rebinds `model`, which earlier cells bound to a `SimCSE` instance;
# re-run the SimCSE instantiation cell before calling `model.encode` or
# `model.similarity` again.
checkpoint = "princeton-nlp/sup-simcse-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Tokenize the inputs as a single padded batch of PyTorch tensors
texts = [
    'I ate an apple',
    'Jane ate an apple',
    'Python conferences are great!'
]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get the embeddings. Gradients are disabled because this is inference only.
# Only `pooler_output` is read, so per-layer hidden states are not requested.
with torch.no_grad():
    embeddings = model(**inputs, return_dict=True).pooler_output

In [7]:
# Cosine similarity is computed as 1 - cosine distance, so higher values mean
# more similar sentences. The first text is compared against each of the others.
cosine_sim_0_1, cosine_sim_0_2 = (
    1 - cosine(embeddings[0], embeddings[other]) for other in (1, 2)
)

report = "Cosine similarity between \"%s\" and \"%s\" is: %.3f"
print(report % (texts[0], texts[1], cosine_sim_0_1))
print(report % (texts[0], texts[2], cosine_sim_0_2))

Cosine similarity between "I ate an apple" and "Jane ate an apple" is: 0.855
Cosine similarity between "I ate an apple" and "Python conferences are great!" is: 0.083


## Training your own SimCSE

Check the instructions here: https://github.com/princeton-nlp/SimCSE#training