In [1]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 2.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 11.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 47.2 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYA

In [2]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv

In [None]:
#### Logging setup: INFO-level records, timestamp-prefixed, handled by
#### sentence-transformers' LoggingHandler (debug information printed to stdout)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
#### /logging setup

In [None]:
# Hyper-parameters for the Wikipedia / STSbenchmark run
model_name = 'distilbert-base-uncased'  # checkpoint name handed to models.Transformer
max_seq_length = 32                     # handed to models.Transformer as max_seq_length
train_batch_size = 128                  # batch size for the train DataLoader and the evaluators
num_epochs = 1

In [None]:
# Output folder for this run; its name records the base model, the batch size
# and the time at which this cell was executed
save_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = f'output/training_stsb_simcse-{model_name}-{train_batch_size}-{save_timestamp}'

In [None]:
# Fetch the STSbenchmark archive unless a local copy already exists
sts_dataset_path = 'data/stsbenchmark.tsv.gz'
sts_dataset_url = 'https://sbert.net/datasets/stsbenchmark.tsv.gz'

if not os.path.exists(sts_dataset_path):
    util.http_get(sts_dataset_url, sts_dataset_path)

  0%|          | 0.00/392k [00:00<?, ?B/s]

In [None]:
# Assemble the sentence encoder: a transformer backbone followed by a pooling
# layer sized to the backbone's word-embedding dimension
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

2022-06-05 01:42:02 - Use pytorch device: cuda


In [None]:
# Training corpus: the 1 million Wikipedia sentences file published for SimCSE.
# It is only downloaded when no local copy exists.
wikipedia_dataset_path = 'data/wiki1m_for_simcse.txt'
wikipedia_dataset_url = 'https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt'
if not os.path.exists(wikipedia_dataset_path):
    util.http_get(wikipedia_dataset_url, wikipedia_dataset_path)

  0%|          | 0.00/120M [00:00<?, ?B/s]

In [None]:
# Build the training examples: every kept sentence is passed twice, i.e. texts=[sent, sent].
# Lines shorter than 10 characters (after stripping surrounding whitespace) are dropped.
with open(wikipedia_dataset_path, 'r', encoding='utf8') as fIn:
    train_samples = [
        InputExample(texts=[sentence, sentence])
        for sentence in map(str.strip, fIn)
        if len(sentence) >= 10
    ]

In [None]:
# Load the 'dev' and 'test' splits of STSbenchmark; rows of any other split are ignored
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
test_samples = []
samples_by_split = {'dev': dev_samples, 'test': test_samples}
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    for row in csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE):
        # Gold scores are divided by 5 so that labels lie in the range 0 ... 1
        score = float(row['score']) / 5.0

        target_samples = samples_by_split.get(row['split'])
        if target_samples is not None:
            target_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))

2022-06-05 01:42:09 - Read STSbenchmark dev dataset


In [None]:
# Embedding-similarity evaluators over the STSbenchmark dev and test pairs
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    name='sts-dev',
    batch_size=train_batch_size,
)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    name='sts-test',
    batch_size=train_batch_size,
)

In [None]:
# Training objective: MultipleNegativesRankingLoss over shuffled batches.
# drop_last=True discards the final incomplete batch.
train_dataloader = DataLoader(
    train_samples,
    batch_size=train_batch_size,
    shuffle=True,
    drop_last=True,
)
train_loss = losses.MultipleNegativesRankingLoss(model)

In [None]:
# Schedule: warm up over the first 10% of all training steps and run the dev
# evaluator after every 10% of an epoch
steps_per_epoch = len(train_dataloader)
warmup_steps = math.ceil(steps_per_epoch * num_epochs * 0.1)
evaluation_steps = int(steps_per_epoch * 0.1)

logging.info(f"Training sentences: {len(train_samples)}")
logging.info(f"Warmup-steps: {warmup_steps}")
logging.info("Performance before training")
dev_evaluator(model)  # baseline dev score before fine-tuning; displayed as the cell output

2022-06-05 01:42:49 - Training sentences: 985723
2022-06-05 01:42:49 - Warmup-steps: 770
2022-06-05 01:42:49 - Performance before training
2022-06-05 01:42:49 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset:
2022-06-05 01:43:06 - Cosine-Similarity :	Pearson: 0.6558	Spearman: 0.6728
2022-06-05 01:43:06 - Manhattan-Distance:	Pearson: 0.6838	Spearman: 0.6905
2022-06-05 01:43:06 - Euclidean-Distance:	Pearson: 0.6833	Spearman: 0.6903
2022-06-05 01:43:06 - Dot-Product-Similarity:	Pearson: 0.3578	Spearman: 0.3508


0.6904746491017895

In [None]:
# Fine-tune the model; the dev evaluator runs every `evaluation_steps` steps
# and output is written under `model_save_path`
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    optimizer_params={'lr': 5e-5},
    evaluator=dev_evaluator,
    evaluation_steps=evaluation_steps,
    output_path=model_save_path,
    use_amp=True,  # Set to True, if your GPU supports FP16 cores
)





Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7700 [00:00<?, ?it/s]

2022-06-05 01:46:23 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 770 steps:
2022-06-05 01:46:25 - Cosine-Similarity :	Pearson: 0.7556	Spearman: 0.7583
2022-06-05 01:46:25 - Manhattan-Distance:	Pearson: 0.7680	Spearman: 0.7679
2022-06-05 01:46:25 - Euclidean-Distance:	Pearson: 0.7688	Spearman: 0.7688
2022-06-05 01:46:25 - Dot-Product-Similarity:	Pearson: 0.6773	Spearman: 0.6730
2022-06-05 01:46:25 - Save model to output/training_stsb_simcse-distilbert-base-uncased-128-2022-06-05_01-41-27
2022-06-05 01:49:24 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 1540 steps:
2022-06-05 01:49:26 - Cosine-Similarity :	Pearson: 0.7923	Spearman: 0.7941
2022-06-05 01:49:26 - Manhattan-Distance:	Pearson: 0.7964	Spearman: 0.8006
2022-06-05 01:49:26 - Euclidean-Distance:	Pearson: 0.7969	Spearman: 0.8011
2022-06-05 01:49:26 - Dot-Product-Similarity:	Pearson: 0.7391	Spearman: 0.7405
2022-06-05 01:49:26 - Save model to out

KeyboardInterrupt: ignored

In [None]:
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

# The model is re-read from `model_save_path`, the folder used as output_path during training
model = SentenceTransformer(model_save_path)
# The evaluator is also given `model_save_path` as its output_path; its return value is the cell output
test_evaluator(model, output_path=model_save_path)

# AskUbuntu: SimCSE training and reranking evaluation

In [14]:
from sentence_transformers import models, util, datasets, evaluation, losses

In [3]:
# Settings for the AskUbuntu run: roberta-base as the model, a batch_size of 128,
# a max sentence length (max_seq_length) of 32 word pieces and a single epoch
model_name = 'roberta-base'
max_seq_length = 32
batch_size = 128
num_epochs = 1

In [4]:
################# AskUbuntu data folder and timestamped output folder #################
askubuntu_folder = 'data/askubuntu'
askubuntu_run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_path = f'output/askubuntu-simcse-{model_name}-{batch_size}-{askubuntu_run_timestamp}'

In [5]:
## Fetch the AskUbuntu files (from https://github.com/taolei87/askubuntu) that are not on disk yet
askubuntu_base_url = 'https://github.com/taolei87/askubuntu/raw/master/'
for filename in ('text_tokenized.txt.gz', 'dev.txt', 'test.txt', 'train_random.txt'):
    filepath = os.path.join(askubuntu_folder, filename)
    if os.path.exists(filepath):
        continue  # already downloaded
    util.http_get(askubuntu_base_url + filename, filepath)

  0%|          | 0.00/46.4M [00:00<?, ?B/s]

  0%|          | 0.00/34.7k [00:00<?, ?B/s]

  0%|          | 0.00/34.4k [00:00<?, ?B/s]

  0%|          | 0.00/4.12M [00:00<?, ?B/s]

In [6]:
# Read the corpus: map the first tab-separated field of each line (the id)
# to the second field (the title)
corpus = {}
# Starts empty; read_eval_dataset adds the ids used by the evaluation files
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, 'text_tokenized.txt.gz'), 'rt', encoding='utf8') as fIn:
    for line in fIn:
        splits = line.strip().split("\t")
        # Named question_id rather than `id` so the builtin id() is not
        # shadowed for every later cell in the kernel
        question_id = splits[0]
        title = splits[1]
        corpus[question_id] = title

In [7]:
# Read dev & test dataset
def read_eval_dataset(filepath):
    """Parse an AskUbuntu evaluation file into reranking examples.

    Every non-blank line must hold four tab-separated fields: the query id,
    the space-separated relevant ids, the space-separated candidate ids and
    the BM25 scores (not used here). Lines whose relevant-id field is empty
    are skipped.

    Titles are looked up in the module-level ``corpus`` dict. As a side
    effect, the query id and all candidate ids of every kept line are added
    to the module-level ``dev_test_ids`` set.

    Args:
        filepath: Path of the evaluation file; it is read as UTF-8, the same
            encoding used when reading the corpus.

    Returns:
        A list with one dict per kept line, with the keys 'query' (title of
        the query), 'positive' (titles of the relevant ids) and 'negative'
        (titles of the candidates that are not relevant, de-duplicated and
        kept in the order in which they appear in the file).

    Raises:
        ValueError: If a non-blank line does not have exactly four fields.
        KeyError: If a referenced id is missing from ``corpus``.
    """
    dataset = []
    with open(filepath, encoding='utf8') as fIn:
        for line in fIn:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line carries no example
                continue

            query_id, relevant_id, candidate_ids, _bm25_scores = line.split("\t")
            if len(relevant_id) == 0:   #Skip examples without relevant entries
                continue

            relevant_id = relevant_id.split(" ")
            candidate_ids = candidate_ids.split(" ")

            # dict.fromkeys de-duplicates while keeping file order, so the
            # negatives come out in the same order on every run (iterating a
            # set of strings does not guarantee that)
            relevant_lookup = set(relevant_id)
            negative_ids = [cid for cid in dict.fromkeys(candidate_ids) if cid not in relevant_lookup]

            dataset.append({
                'query': corpus[query_id],
                'positive': [corpus[pid] for pid in relevant_id],
                'negative': [corpus[pid] for pid in negative_ids]
            })
            dev_test_ids.add(query_id)
            dev_test_ids.update(candidate_ids)
    return dataset


In [8]:
# Parse the dev file first, then the test file, from the AskUbuntu folder
dev_dataset, test_dataset = (
    read_eval_dataset(os.path.join(askubuntu_folder, split_filename))
    for split_filename in ('dev.txt', 'test.txt')
)

In [9]:
# Display the first ten parsed dev examples (dicts with 'query', 'positive' and 'negative')
dev_dataset[:10]

[{'negative': [],
  'positive': ['problem with source list',
   'e : malformed line 59 in source list /etc/apt/sources.list ( dist parse ) e : the list of sources could not be read',
   'i was trying to update and i received this error',
   'e : malformed line 57 in source list /etc/apt/sources.list ( dist parse )',
   'update error dist parse',
   'error in all app installations',
   'e : malformed line 54 in source list /etc/apt/sources.list ( dist parse )',
   'synaptic package manager and update manager are not working',
   'ubuntu software source file error',
   'sudo apt-get update command',
   'an error occurred-malformed-/etc/apt/sources.list.d',
   'can not run command sudo apt-get update and getting error',
   'software center is not opening or closing by itself',
   'e : malformed line 1 in source list /etc/apt/sources.list.d/bitdefender.list ( dist ) e : the list of sources could not be read',
   'update manager getting errors',
   'how do i fix malformed line in sources li

In [10]:
## Now we need a list of train sentences.
## In this example we simply use all sentences whose id is not in dev_test_ids,
## i.e. that were not collected while reading the dev/test files.
## Each sentence is passed twice: texts=[sentence, sentence].
train_sentences = [
    InputExample(texts=[sentence, sentence])
    for question_id, sentence in corpus.items()  # not named `id`: keeps the builtin id() usable
    if question_id not in dev_test_ids
]

logging.info("{} train sentences".format(len(train_sentences)))

In [11]:
################# Initialize an SBERT model #################

# Transformer backbone
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)

# Pooling layer: mean over tokens switched on, CLS-token and max pooling switched off
pooling_options = dict(
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), **pooling_options)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [15]:
################# Training data loader and loss #################

# Shuffled batches with the final incomplete batch dropped; the loss is MultipleNegativesRankingLoss
train_dataloader = DataLoader(
    train_sentences,
    shuffle=True,
    drop_last=True,
    batch_size=batch_size,
)
train_loss = losses.MultipleNegativesRankingLoss(model)

In [16]:
# Reranking evaluators built from the parsed dev and test examples
dev_evaluator = evaluation.RerankingEvaluator(
    dev_dataset,
    name='AskUbuntu dev',
)
test_evaluator = evaluation.RerankingEvaluator(
    test_dataset,
    name='AskUbuntu test',
)

In [None]:
# Baseline: run the dev evaluator once on the untrained model
logging.info("Dev performance before training")
dev_evaluator(model)

# Warm up over the first 10% of all training steps
total_training_steps = num_epochs * len(train_dataloader)
warmup_steps = int(total_training_steps * 0.1)

logging.info("Start training")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=dev_evaluator,
    evaluation_steps=100,
    output_path=output_path,
    show_progress_bar=True,
    use_amp=True,  # If your GPU does not have FP16 cores, set use_amp=False
)

# Save the model as it is after training to a separate "-latest" folder
latest_output_path = f"{output_path}-latest"
model.save(latest_output_path)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1253 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
### Run test evaluation on the latest model. This is equivalent to not having a dev dataset
# The model is re-read from `latest_output_path`, the folder written by model.save after training
model = SentenceTransformer(latest_output_path)
# The evaluator's return value is the cell output
test_evaluator(model)