In [1]:
from transformers import set_seed
# Fix the random seed (via transformers.set_seed) so that weight initialisation
# and data shuffling in the cells below are repeatable from a fresh kernel.
set_seed(916)

In [2]:
from sbert_reduced import SBertReduce
from sentence_transformers import models, SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

def load_sbert_reduced(reduced_size=48, inter_sizes=(512,256,128,64)):
    """Assemble a SentenceTransformer with a reduction stage before pooling.

    The pipeline is: a ``bert-base-uncased`` transformer (max sequence length
    512), an ``SBertReduce`` module, and a ``Pooling`` layer.

    Args:
        reduced_size: Dimension handed to ``SBertReduce`` as its target size
            and to ``Pooling`` as the token-embedding dimension.
        inter_sizes: Passed through to ``SBertReduce`` unchanged; presumably
            the widths of its intermediate layers (defined in
            ``sbert_reduced``, not shown here -- confirm).

    Returns:
        A ``SentenceTransformer`` built from the three modules, in order.
    """
    encoder = models.Transformer("bert-base-uncased", max_seq_length=512)
    # The reducer's input width follows whatever the encoder emits.
    encoder_dim = encoder.get_word_embedding_dimension()
    reducer = SBertReduce(encoder_dim, reduced_size, inter_sizes)
    pooler = models.Pooling(reduced_size)

    pipeline = [encoder, reducer, pooler]
    return SentenceTransformer(modules=pipeline)

# Build the reduced-embedding model with the default sizes.
model = load_sbert_reduced()
 
# Two hand-written sentence pairs with similarity labels: a smoke test of the
# training loop, not real training data.
train_examples = [InputExample(texts=["This is a sentence", "This is also a sentence"], label=0.8),
                  InputExample(texts=["One sentence", "Another sentence"], label=0.3)]

# batch_size=16 exceeds the two examples, so each epoch is a single batch.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
# Loss over the cosine similarity of each pair's embeddings against its label
# (see the sentence-transformers CosineSimilarityLoss documentation).
train_loss = losses.CosineSimilarityLoss(model)

# One epoch, no learning-rate warmup.
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=0)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Write the trained pipeline to the "stsb-sbert" directory.
model.save("stsb-sbert")

In [4]:
# Attempt to reload the reduction module from the saved model directory.
# NOTE(review): the recorded run fails with FileNotFoundError for
# 'stsb-sbert/1_SBertReduce/config.json'. Presumably SBertReduce did not write
# a config.json when the model was saved -- confirm against its implementation.
SBertReduce.load("stsb-sbert")

stsb-sbert/1_SBertReduce


FileNotFoundError: [Errno 2] No such file or directory: 'stsb-sbert/1_SBertReduce/config.json'

In [2]:
from bert_reduced.bert_reduced import BertReducedForPreTraining, SBertReduce
from sentence_transformers import SentenceTransformer, models

def load_sbert(model_name, revision="main"):
    """Build a SentenceTransformer around a pretrained reduction head.

    Loads a ``BertReducedForPreTraining`` checkpoint and reuses its ``reduce``
    attribute and its config values. The transformer stage itself is created
    from the stock ``"bert-base-uncased"`` weights, not from ``model_name``.

    Args:
        model_name: Checkpoint identifier passed to
            ``BertReducedForPreTraining.from_pretrained``.
        revision: Checkpoint revision to load; defaults to ``"main"``.

    Returns:
        A ``SentenceTransformer`` made of the transformer, the ``SBertReduce``
        wrapper around the pretrained reduction head, and a ``Pooling`` layer
        sized by ``config.reduced_size``.
    """
    checkpoint = BertReducedForPreTraining.from_pretrained(model_name, revision=revision)
    cfg = checkpoint.config

    # Sequence length and pooled width both come from the checkpoint's config.
    encoder = models.Transformer("bert-base-uncased", max_seq_length=cfg.max_position_embeddings)
    reducer = SBertReduce(checkpoint.reduce)
    pooler = models.Pooling(cfg.reduced_size)

    stages = [encoder, reducer, pooler]
    return SentenceTransformer(modules=stages)

# Build the sentence model from the "pretrain" revision of the reduced-BERT
# checkpoint; `sbert_model` is the model trained on STS-B further down.
sbert_model = load_sbert("cayjobla/bert-base-uncased-reduced", revision="pretrain")

In [10]:
from bert_reduced.bert_reduced import BertReducedForPreTraining, SBertReduce
from sentence_transformers import SentenceTransformer, models

# Plain SBERT pipeline with no reduction stage: the transformer feeds straight
# into a pooling layer sized to the transformer's own embedding dimension.
word_embedding_model = models.Transformer("bert-base-uncased", max_seq_length=512)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# NOTE(review): this rebinds the notebook-global `model` and saves to the same
# 'stsb-sbert' path that other cells use for saving and as output_path.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
model.save('stsb-sbert')

### STS Benchmark dataset

In [3]:
from datasets import load_dataset

# Load the English ("en") configuration of the stsb_multi_mt dataset.
raw_datasets = load_dataset("stsb_multi_mt", "en")
# Display the available splits, their columns and row counts.
raw_datasets

Found cached dataset stsb_multi_mt (/home/cayjobla/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})

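## What I still need to do:
* The data labels need to be rescaled to lie between 0 and 1.
* In our data, a score of 5 indicates very similar sentences. Does `CosineSimilarityLoss` associate larger or smaller labels with similarity? Do the labels need to be adjusted? (Cosine similarity is larger for more similar embeddings, so larger labels should mean "more similar"; only the scale needs to change.)
* Adjust the model so that it can be saved and loaded again later.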

In [4]:
from sentence_transformers import InputExample

def wrap_data(examples, max_score=5.0):
    """Convert column-oriented sentence pairs into ``InputExample`` objects.

    Each column is read once and the rows are walked in parallel, so the
    function works for any mapping-like object exposing the three columns
    (a dataset split or a plain dict of lists) and never re-reads a column
    per row.

    Args:
        examples: Object indexable by ``"sentence1"``, ``"sentence2"`` and
            ``"similarity_score"``, each yielding an equally long sequence.
        max_score: Upper end of the raw similarity scale. Scores are divided
            by it so that labels fall in ``[0, 1]``; a cosine similarity can
            never reach a raw label such as 5. Pass ``None`` to keep the raw
            scores unchanged.

    Returns:
        A list with one ``InputExample`` per row, in row order.

    Raises:
        ValueError: If ``max_score`` is given but is not positive.
    """
    if max_score is not None and max_score <= 0:
        raise ValueError("max_score must be positive or None")

    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    scores = examples["similarity_score"]

    if max_score is not None:
        # Larger label still means more similar; only the scale changes.
        scores = [score / max_score for score in scores]

    return [InputExample(texts=[first, second], label=score)
            for first, second, score in zip(first_sentences, second_sentences, scores)]

# Wrap the "train" split as a list of InputExample pairs for the DataLoader.
training_data = wrap_data(raw_datasets["train"])

In [7]:
from sentence_transformers import losses
from torch.utils.data import DataLoader

# Shuffle the wrapped STS-B training pairs into batches of 16; the recorded
# progress bars show 360 iterations per epoch.
train_dataloader = DataLoader(training_data, shuffle=True, batch_size=16)
# Cosine-similarity loss bound to the pretrained reduced model (`sbert_model`).
train_loss = losses.CosineSimilarityLoss(sbert_model)

# Train for 10 epochs and write the result to "stsb-sbert".
# NOTE(review): no evaluator is passed, so evaluation_steps=360 presumably has
# no effect -- confirm against the sentence-transformers fit() documentation.
# NOTE(review): the recorded run ends with "AttributeError: 'SBertReduce'
# object has no attribute 'save'" after training; presumably the save to
# output_path needs SBertReduce to provide a save method -- confirm.
sbert_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10, warmup_steps=0, evaluation_steps=360, output_path="stsb-sbert")

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

AttributeError: 'SBertReduce' object has no attribute 'save'