## Downloading dataset

In [2]:
import urllib.request
import zipfile
import os
folder_path = os.path.dirname(os.path.realpath("__file__"))
print('Beginning download of datasets')

#datasets = ['AllNLI.zip', 'stsbenchmark.zip', 'wikipedia-sections-triplets.zip', 'STS2017.en-de.txt.gz', 'TED2013-en-de.txt.gz', 'xnli-en-de.txt.gz']
datasets = ['AllNLI.zip', 'stsbenchmark.zip', 'STS2017.en-de.txt.gz']
server = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/"

for dataset in datasets:
    print("Download", dataset)
    url = server+dataset
    dataset_path = os.path.join(folder_path, dataset)
    urllib.request.urlretrieve(url, dataset_path)

    if dataset.endswith('.zip'):
        print("Extract", dataset)
        with zipfile.ZipFile(dataset_path, "r") as zip_ref:
            zip_ref.extractall(folder_path)
        os.remove(dataset_path)


print("All datasets downloaded and extracted")

Beginning download of datasets
Download AllNLI.zip
Extract AllNLI.zip
Download stsbenchmark.zip
Extract stsbenchmark.zip
Download STS2017.en-de.txt.gz
All datasets downloaded and extracted


In [3]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import *
import logging
from datetime import datetime
import sys

In [4]:
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout
model_name = 'allenai/scibert_scivocab_cased'

In [5]:
# Read the dataset
batch_size = 16
nli_reader = NLIDataReader('./datasets/AllNLI')
sts_reader = STSBenchmarkDataReader('./datasets/stsbenchmark')
train_num_labels = nli_reader.get_num_labels()
model_save_path = 'models/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
print('model save path: ', model_save_path)


# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)



logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
num_epochs = 1

warmup_steps = math.ceil(len(train_dataloader) * num_epochs / batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))



# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )



##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

model.evaluate(evaluator)

model save path:  models/training_nli_allenai-scibert_scivocab_cased-2020-04-20_18-03-25
2020-04-20 18:03:30 - Lock 140340517905296 acquired on /home/ubuntu/.cache/torch/transformers/560df3639836cbc0b55a7264963b1b5a7abc7ab307932944f88d56a79daf9f77.5f40512b66512e48222f7267da169e756934fb080cd4a0f6e9ba46da19ff8696.lock
2020-04-20 18:03:30 - https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_cased/config.json not found in cache or force_download set to True, downloading to /home/ubuntu/.cache/torch/transformers/tmpe3ovtqwi


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=313.0, style=ProgressStyle(description_…


2020-04-20 18:03:31 - storing https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_cased/config.json in cache at /home/ubuntu/.cache/torch/transformers/560df3639836cbc0b55a7264963b1b5a7abc7ab307932944f88d56a79daf9f77.5f40512b66512e48222f7267da169e756934fb080cd4a0f6e9ba46da19ff8696
2020-04-20 18:03:31 - creating metadata file for /home/ubuntu/.cache/torch/transformers/560df3639836cbc0b55a7264963b1b5a7abc7ab307932944f88d56a79daf9f77.5f40512b66512e48222f7267da169e756934fb080cd4a0f6e9ba46da19ff8696
2020-04-20 18:03:31 - Lock 140340517905296 released on /home/ubuntu/.cache/torch/transformers/560df3639836cbc0b55a7264963b1b5a7abc7ab307932944f88d56a79daf9f77.5f40512b66512e48222f7267da169e756934fb080cd4a0f6e9ba46da19ff8696.lock
2020-04-20 18:03:31 - loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_cased/config.json from cache at /home/ubuntu/.cache/torch/transformers/560df3639836cbc0b55a7264963b1b5a7abc7ab307932944f

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442301670.0, style=ProgressStyle(descri…


2020-04-20 18:04:26 - storing https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_cased/pytorch_model.bin in cache at /home/ubuntu/.cache/torch/transformers/b79b81a602400229c8f721cd9c9147e170d4ae1695f4125badbc9d122e509839.dab672d3d9c86398f8504e0e3f46391eca25af4f7b7a88bae481e9d6974e731f
2020-04-20 18:04:26 - creating metadata file for /home/ubuntu/.cache/torch/transformers/b79b81a602400229c8f721cd9c9147e170d4ae1695f4125badbc9d122e509839.dab672d3d9c86398f8504e0e3f46391eca25af4f7b7a88bae481e9d6974e731f
2020-04-20 18:04:26 - Lock 140340502619280 released on /home/ubuntu/.cache/torch/transformers/b79b81a602400229c8f721cd9c9147e170d4ae1695f4125badbc9d122e509839.dab672d3d9c86398f8504e0e3f46391eca25af4f7b7a88bae481e9d6974e731f.lock
2020-04-20 18:04:26 - loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_cased/pytorch_model.bin from cache at /home/ubuntu/.cache/torch/transformers/b79b81a602400229c8f721cd9c9147e170d4ae1695f

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=222296.0, style=ProgressStyle(descripti…


2020-04-20 18:04:32 - storing https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_cased/vocab.txt in cache at /home/ubuntu/.cache/torch/transformers/ae3febdd51990b429457018b5852c92e1be6bb95248faf4ac377d66fc4a5d8d0.a8e6b7905f755590e7ebfff6b58d35a8589e27e0ad0033165cb1bfd32dc9bbed
2020-04-20 18:04:32 - creating metadata file for /home/ubuntu/.cache/torch/transformers/ae3febdd51990b429457018b5852c92e1be6bb95248faf4ac377d66fc4a5d8d0.a8e6b7905f755590e7ebfff6b58d35a8589e27e0ad0033165cb1bfd32dc9bbed
2020-04-20 18:04:32 - Lock 140343828292880 released on /home/ubuntu/.cache/torch/transformers/ae3febdd51990b429457018b5852c92e1be6bb95248faf4ac377d66fc4a5d8d0.a8e6b7905f755590e7ebfff6b58d35a8589e27e0ad0033165cb1bfd32dc9bbed.lock
2020-04-20 18:04:34 - loading file https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_cased/vocab.txt from cache at /home/ubuntu/.cache/torch/transformers/ae3febdd51990b429457018b5852c92e1be6bb95248faf4ac377d66fc4a5d8d0.a8

Convert dataset: 100%|██████████| 942069/942069 [12:13<00:00, 1284.35it/s]
Convert dataset:  15%|█▌        | 228/1500 [00:00<00:00, 2270.49it/s]

2020-04-20 18:16:57 - Num sentences: 942069
2020-04-20 18:16:57 - Sentences 0 longer than max_seqence_length: 987
2020-04-20 18:16:57 - Sentences 1 longer than max_seqence_length: 0
2020-04-20 18:16:57 - Softmax loss: #Vectors concatenated: 3
2020-04-20 18:16:57 - Read STSbenchmark dev dataset


Convert dataset: 100%|██████████| 1500/1500 [00:01<00:00, 1275.44it/s]
Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/58880 [00:00<?, ?it/s][A

2020-04-20 18:16:58 - Num sentences: 1500
2020-04-20 18:16:58 - Sentences 0 longer than max_seqence_length: 0
2020-04-20 18:16:58 - Sentences 1 longer than max_seqence_length: 0
2020-04-20 18:16:58 - Warmup-steps: 368



Iteration:   0%|          | 1/58880 [00:30<497:15:12, 30.40s/it][A
Iteration:   0%|          | 2/58880 [01:32<522:57:57, 31.98s/it][A
Iteration:   0%|          | 3/58880 [02:09<527:20:50, 32.24s/it][A
Iteration:   0%|          | 4/58880 [02:49<533:36:38, 32.63s/it][A
Iteration:   0%|          | 5/58880 [03:29<539:55:07, 33.01s/it][A
Iteration:   0%|          | 6/58880 [04:06<542:34:25, 33.18s/it][A
Iteration:   0%|          | 7/58880 [04:42<545:28:13, 33.35s/it][A
Iteration:   0%|          | 8/58880 [05:04<535:38:30, 32.75s/it][A
Iteration:   0%|          | 9/58880 [05:23<524:23:19, 32.07s/it][A
Iteration:   0%|          | 10/58880 [05:46<516:51:54, 31.61s/it][A
Iteration:   0%|          | 11/58880 [06:04<506:09:05, 30.95s/it][A
Iteration:   0%|          | 12/58880 [06:30<502:23:07, 30.72s/it][A
Iteration:   0%|          | 13/58880 [06:50<492:57:28, 30.15s/it][A
Iteration:   0%|          | 14/58880 [07:14<488:00:23, 29.84s/it][A
Iteration:   0%|          | 15/58880 [07:3

KeyboardInterrupt: 

## Finetuning on sts dataset

In [None]:
model_name = 'bert-base-nli-mean-tokens'
train_batch_size = 16
num_epochs = 4
model_save_path = 'models/fineTunedSciBERT
sts_reader = STSBenchmarkDataReader('./datasets/stsbenchmark', normalize_scores=True)

# Load a pre-trained sentence transformer model<- loaded in last cell for evaluation, else reload
#model = SentenceTransformer(model_name)

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)


# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
model.evaluate(evaluator)