In [1]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 4.8 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 15.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 50.8 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 66.6 MB/s 
Collecting tokenizers!=0.11.3,<

## Imports

In [2]:
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, evaluation, losses
import logging
import os
import gzip
from datetime import datetime
import torch

In [3]:
#### Logging setup: timestamped INFO-level messages, emitted through LoggingHandler
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
#### /logging setup

In [4]:
# Training parameters. We use a batch size of 16; for every positive example we
# include 8-1=7 negative examples. Sentences are truncated to 75 word pieces.
model_name = 'distilbert-base-uncased'
batch_size = 16
pos_neg_ratio = 8   # batch_size must be divisible by pos_neg_ratio
max_seq_length = 75
num_epochs = 1

# Fail fast on an inconsistent configuration, before any data or model is downloaded.
assert batch_size % pos_neg_ratio == 0, "batch_size must be divisible by pos_neg_ratio"

In [5]:
################# Download AskUbuntu and extract training corpus  #################
# Local folder for the dataset files, and a per-run output directory whose name
# carries the model name, the batch size and the start timestamp.
askubuntu_folder = 'askubuntu'
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_path = 'output/train_askubuntu_ct-{}-{}-{}'.format(model_name, batch_size, run_timestamp)

In [6]:
## Fetch the AskUbuntu dataset files from https://github.com/taolei87/askubuntu
## (files that already exist locally are not downloaded again)
askubuntu_files = ['text_tokenized.txt.gz', 'dev.txt', 'test.txt', 'train_random.txt']
for filename in askubuntu_files:
    filepath = os.path.join(askubuntu_folder, filename)
    if os.path.exists(filepath):
        continue  # local copy present
    util.http_get('https://github.com/taolei87/askubuntu/raw/master/'+filename, filepath)

  0%|          | 0.00/46.4M [00:00<?, ?B/s]

  0%|          | 0.00/34.7k [00:00<?, ?B/s]

  0%|          | 0.00/34.4k [00:00<?, ?B/s]

  0%|          | 0.00/4.12M [00:00<?, ?B/s]

In [7]:
# Read the corpus: the first tab-separated column of every line is used as the key,
# the second column (the title) as the value. Any further columns are ignored.
corpus = {}

with gzip.open(os.path.join(askubuntu_folder, 'text_tokenized.txt.gz'), 'rt', encoding='utf8') as fIn:
    for line in fIn:
        fields = line.strip().split("\t")
        # Named question_id (not `id`) so the builtin id() is not shadowed in the kernel.
        question_id = fields[0]
        title = fields[1]
        corpus[question_id] = title

In [11]:
# corpus

# Ids of every query and candidate that occurs in a kept dev/test example.
dev_test_ids = set()


def read_eval_dataset(filepath, id_to_text=None, seen_ids=None):
    """Read an AskUbuntu dev/test file into a list of re-ranking examples.

    Every non-blank line must hold four tab-separated fields: the query id, the
    space-separated relevant ids, the space-separated candidate ids and the BM25
    scores (the scores are parsed but not used). Lines without relevant ids are
    skipped.

    Args:
        filepath: Path of the evaluation file to read.
        id_to_text: Mapping from id to text used to resolve ids. Defaults to the
            notebook-level ``corpus`` dictionary.
        seen_ids: Set that is updated in place with the query id and all
            candidate ids of every kept example. Defaults to the notebook-level
            ``dev_test_ids`` set.

    Returns:
        A list of dicts with the keys ``'query'`` (text), ``'positive'`` (list of
        texts of the relevant ids) and ``'negative'`` (list of texts of the
        candidates that are not relevant, de-duplicated, in file order).

    Raises:
        ValueError: If a non-blank line does not split into exactly four fields.
        KeyError: If an id is missing from ``id_to_text``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if id_to_text is None:
        id_to_text = corpus
    if seen_ids is None:
        seen_ids = dev_test_ids

    dataset = []
    with open(filepath, encoding='utf8') as fIn:
        for line in fIn:
            stripped = line.strip()
            if not stripped:  # tolerate blank lines (e.g. a trailing newline)
                continue
            query_id, relevant_id, candidate_ids, bm25_scores = stripped.split('\t')
            if len(relevant_id) == 0:  # Skip examples without relevant entries
                continue
            relevant_ids = relevant_id.split(" ")
            candidate_ids = candidate_ids.split(" ")
            relevant_lookup = set(relevant_ids)
            # dict.fromkeys de-duplicates while keeping file order, so the list of
            # negatives is identical on every run (plain set iteration is not).
            negative_ids = [cid for cid in dict.fromkeys(candidate_ids) if cid not in relevant_lookup]
            dataset.append({
                'query': id_to_text[query_id],
                'positive': [id_to_text[pid] for pid in relevant_ids],
                'negative': [id_to_text[pid] for pid in negative_ids]
            })
            seen_ids.add(query_id)
            seen_ids.update(candidate_ids)
    return dataset

# Parse both evaluation splits: dev first, then test.
dev_dataset, test_dataset = [
    read_eval_dataset(os.path.join(askubuntu_folder, split_file))
    for split_file in ('dev.txt', 'test.txt')
]

In [12]:
## Now we need a list of train sentences.
## In this example we simply use all sentences that don't appear in the dev/test set
train_sentences = [
    sentence
    for sentence_id, sentence in corpus.items()   # not `id`: keeps the builtin id() usable
    if sentence_id not in dev_test_ids
]
logging.info("{} train sentences".format(len(train_sentences)))

2022-06-04 13:51:51 - 160436 train sentences


In [13]:
################# Initialize an SBERT model #################
# Transformer encoder, followed by a pooling module sized to the encoder's
# word embedding dimension; both are chained into one SentenceTransformer.
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
embedding_dimension = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dimension)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

2022-06-04 13:53:14 - Use pytorch device: cuda


In [14]:
################# Train the model #################

# ContrastiveTension requires its own data loader, which builds the batches from
# the raw sentences using the configured batch size and positive/negative ratio
train_dataloader = losses.ContrastiveTensionDataLoader(
    train_sentences,
    batch_size=batch_size,
    pos_neg_ratio=pos_neg_ratio,
)

# As loss, we use losses.ContrastiveTensionLoss
train_loss = losses.ContrastiveTensionLoss(model)

# Re-ranking evaluators for the dev and the test split
dev_evaluator = evaluation.RerankingEvaluator(
    dev_dataset,
    name='AskUbuntu dev',
)
test_evaluator = evaluation.RerankingEvaluator(
    test_dataset,
    name='AskUbuntu test',
)

In [15]:
logging.info("Start training")

# Train with the Contrastive Tension objective. The number of epochs comes from
# the parameter cell (num_epochs) instead of being hard-coded here, so changing
# the configuration actually changes the run.
# Optimizer: RMSprop with lr 1e-5, no weight decay and no warm-up steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    weight_decay=0,
    warmup_steps=0,
    optimizer_class=torch.optim.RMSprop,
    optimizer_params={'lr': 1e-5},
    use_amp=False    #Set to True, if your GPU has optimized FP16 cores
)

2022-06-04 13:55:43 - Start training


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

In [16]:
# Save the model in its current (post-training) state next to the run's output
# path, using a "-latest" suffix.
latest_output_path = output_path + "-latest"
model.save(latest_output_path)

### Run test evaluation on the latest model. This is equivalent to not having a dev dataset
# The model is reloaded from disk so the evaluation runs on exactly what was saved.
# The evaluator call is the last expression of the cell, so its return value is displayed.
model = SentenceTransformer(latest_output_path)
test_evaluator(model)

2022-06-04 14:05:13 - Save model to output/train_askubuntu_ct-distilbert-base-uncased-16-2022-06-04_13-44-42-latest
2022-06-04 14:05:14 - Load pretrained SentenceTransformer: output/train_askubuntu_ct-distilbert-base-uncased-16-2022-06-04_13-44-42-latest
2022-06-04 14:05:14 - Use pytorch device: cuda
2022-06-04 14:05:14 - RerankingEvaluator: Evaluating the model on AskUbuntu test dataset:


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2022-06-04 14:05:17 - Queries: 179 	 Positives: Min 1.0, Mean 5.2, Max 19.0 	 Negatives: Min 1.0, Mean 14.8, Max 19.0
2022-06-04 14:05:17 - MAP: 57.57
2022-06-04 14:05:17 - MRR@10: 71.06


0.5756506783671937

# stsb_ct

https://github.com/UKPLab/sentence-transformers/blob/master/examples/unsupervised_learning/CT/train_stsb_ct.py

In [1]:
import torch
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformer, LoggingHandler, models, util, InputExample
from sentence_transformers import losses
import os
import gzip
import csv
from datetime import datetime
import logging

#### Logging setup: INFO level, "<timestamp> - <message>" lines via LoggingHandler
logging.basicConfig(
    handlers=[LoggingHandler()],
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(message)s',
)
#### /logging setup

In [2]:
## Training parameters
model_name = 'distilbert-base-uncased'
batch_size = 16
pos_neg_ratio = 8   # batch_size must be divisible by pos_neg_ratio
epochs = 1
max_seq_length = 75

# Fail fast on an inconsistent configuration, before any data or model is downloaded.
assert batch_size % pos_neg_ratio == 0, "batch_size must be divisible by pos_neg_ratio"

In [3]:
# Directory the trained model is written to; the name carries the model name
# and the timestamp at which this cell was run
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = 'output/train_stsb_ct-{}-{}'.format(model_name, run_timestamp)

In [4]:
################# Train sentences #################
# The model is trained on 1 million sentences from Wikipedia.
# The file is downloaded only when no local copy exists yet.
wikipedia_dataset_path = 'data/wiki1m_for_simcse.txt'
wikipedia_dataset_missing = not os.path.exists(wikipedia_dataset_path)
if wikipedia_dataset_missing:
    util.http_get(
        'https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt',
        wikipedia_dataset_path,
    )

  0%|          | 0.00/120M [00:00<?, ?B/s]

In [6]:
!head -n 20 data/wiki1m_for_simcse.txt

YMCA in South Australia
South Australia (SA)  has a unique position in Australia's history as, unlike the other states which were founded as colonies, South Australia began as a self governing province Many were attracted to this and Adelaide and SA developed as an independent and free thinking state.
The compound of philosophical radicalism, evangelical religion and self reliant ability typical of its founders had given an equalitarian flavour to South Australian thinking from the beginning.
It was into this social setting that in February 1850 a meeting was called primarily for the formation of an Association (apparently meaning a Y.M.C.A.)
for apprentices and others, after their day's work, to enjoy books, lectures, discussions, readings, friendly relief and recreation for a leisure hour.
In September 1850 records show that this became “The Young Men's Christian Association of South Australia" as evidenced by a member's letter in London Y.M.C.A.
Report 1851.
There was no census in 1

In [7]:
# train_sentences is simply a list of sentences: every line of the Wikipedia file,
# stripped of surrounding whitespace, that is at least 10 characters long
with open(wikipedia_dataset_path, 'r', encoding='utf8') as fIn:
    train_sentences = [text for text in map(str.strip, fIn) if len(text) >= 10]

In [8]:
################# Download and load STSb #################
data_folder = 'data/stsbenchmark'
sts_dataset_path = f'{data_folder}/stsbenchmark.tsv.gz'

# Fetch the benchmark archive only when it is not available locally
sts_dataset_missing = not os.path.exists(sts_dataset_path)
if sts_dataset_missing:
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

  0%|          | 0.00/392k [00:00<?, ?B/s]

In [9]:
# Collect the dev and test rows of the STSb file as sentence-pair examples.
# Rows that belong to any other split are parsed but not kept.
dev_samples = []
test_samples = []
samples_by_split = {'dev': dev_samples, 'test': test_samples}
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    for row in csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE):
        normalized_score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
        sentence_pair = InputExample(texts=[row['sentence1'], row['sentence2']], label=normalized_score)

        split_samples = samples_by_split.get(row['split'])
        if split_samples is not None:
            split_samples.append(sentence_pair)

In [10]:
# Similarity evaluators built from the dev and the test samples
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    name='sts-dev',
)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    name='sts-test',
)

In [11]:
################# Initialize an SBERT model #################
# Two modules applied in sequence: the transformer encoder, then a pooling module
# created with the encoder's word embedding dimension
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
sbert_modules = [word_embedding_model, pooling_model]
model = SentenceTransformer(modules=sbert_modules)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2022-06-04 14:11:00 - Use pytorch device: cuda


In [12]:
# ContrastiveTension requires its own data loader, which builds the batches from
# the raw sentences using the configured batch size and positive/negative ratio
train_dataloader = losses.ContrastiveTensionDataLoader(
    train_sentences,
    batch_size=batch_size,
    pos_neg_ratio=pos_neg_ratio,
)

In [13]:
# As loss, we use losses.ContrastiveTensionLoss, constructed around the model being trained
train_loss = losses.ContrastiveTensionLoss(model)

In [None]:
# Train with the Contrastive Tension objective. The number of epochs comes from
# the parameter cell (epochs) instead of being hard-coded here, so changing the
# configuration actually changes the run.
# The dev evaluator is run every 1000 training steps; output_path tells fit
# where to save the model.
# Optimizer: RMSprop with lr 1e-5, no weight decay and no warm-up steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=epochs,
    evaluation_steps=1000,
    weight_decay=0,
    warmup_steps=0,
    optimizer_class=torch.optim.RMSprop,
    optimizer_params={'lr': 1e-5},
    output_path=model_save_path,
    use_amp=False    #Set to True, if your GPU has optimized FP16 cores
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/30803 [00:00<?, ?it/s]

2022-06-04 14:14:57 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 1000 steps:
2022-06-04 14:14:59 - Cosine-Similarity :	Pearson: 0.7811	Spearman: 0.7850
2022-06-04 14:14:59 - Manhattan-Distance:	Pearson: 0.7849	Spearman: 0.7842
2022-06-04 14:14:59 - Euclidean-Distance:	Pearson: 0.7857	Spearman: 0.7850
2022-06-04 14:14:59 - Dot-Product-Similarity:	Pearson: 0.6751	Spearman: 0.6709
2022-06-04 14:14:59 - Save model to output/train_stsb_ct-distilbert-base-uncased-2022-06-04_14-08-19
2022-06-04 14:18:07 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 2000 steps:
2022-06-04 14:18:09 - Cosine-Similarity :	Pearson: 0.7860	Spearman: 0.7883
2022-06-04 14:18:09 - Manhattan-Distance:	Pearson: 0.7878	Spearman: 0.7876
2022-06-04 14:18:09 - Euclidean-Distance:	Pearson: 0.7881	Spearman: 0.7877
2022-06-04 14:18:09 - Dot-Product-Similarity:	Pearson: 0.7029	Spearman: 0.6979
2022-06-04 14:18:09 - Save model to output/train_

In [None]:
########### Load the model and evaluate on test set
# The model is loaded from model_save_path, i.e. the path that was passed to
# model.fit as output_path, and then scored with the STSb test evaluator.
# The evaluator call is the last expression of the cell, so its return value is displayed.

model = SentenceTransformer(model_save_path)
test_evaluator(model)