# Training an embedding model

In [None]:
from datasets import load_dataset

# First 50,000 MNLI training pairs from GLUE, with the 'idx' column dropped.
# Label ids: 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset = (
    load_dataset('glue', 'mnli', split='train')
    .select(range(50_000))
    .remove_columns(['idx'])
)


In [None]:
from sentence_transformers import SentenceTransformer

# Start from the plain BERT base checkpoint
embedding_model = SentenceTransformer(model_name_or_path='bert-base-uncased')



In [None]:
from sentence_transformers import losses

# Softmax loss configured for three output labels, using the model's own
# sentence embedding dimension.
train_loss = losses.SoftmaxLoss(
    num_labels=3,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    model=embedding_model,
)

In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# STSB validation pairs for the embedding similarity evaluator.
# Gold labels are divided by 5 before being handed to the evaluator.
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    val_sts['sentence1'],
    val_sts['sentence2'],
    [label / 5 for label in val_sts['label']],
    main_similarity='cosine',
)


In [None]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Training configuration for the softmax-loss run
args = SentenceTransformerTrainingArguments(
    output_dir='base_embedding_model',
    # Schedule
    num_train_epochs=1,
    warmup_steps=100,
    # Batch sizes
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Mixed precision
    fp16=True,
    # Step intervals
    logging_steps=100,
    eval_steps=100,
)

In [None]:
from sentence_transformers.trainer import SentenceTransformerTrainer

# Hand the model, arguments, data, loss and evaluator to the trainer and run it
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    loss=train_loss,
    train_dataset=train_dataset,
    evaluator=evaluator,
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,1.0646
200,0.9323
300,0.8763
400,0.8311
500,0.8202
600,0.8199
700,0.806
800,0.7852
900,0.7743
1000,0.7702


TrainOutput(global_step=1563, training_loss=0.8076587611105072, metrics={'train_runtime': 500.3632, 'train_samples_per_second': 99.927, 'train_steps_per_second': 3.124, 'total_flos': 0.0, 'train_loss': 0.8076587611105072, 'epoch': 1.0})

In [None]:
# Run the STSB evaluator on the model trained above
evaluator(model=embedding_model)

{'pearson_cosine': 0.5677151949583746, 'spearman_cosine': 0.6403511856854297}

In [None]:
from mteb import MTEB

# Choose evaluation tasks.
# NOTE: this cell needs the `mteb` package in the kernel environment
# (e.g. run `%pip install mteb` in its own cell first).
evaluation = MTEB(tasks=['Banking77Classification'])

# Calculate results for the SentenceTransformer trained above.
# The notebook defines no variable named `model`; `embedding_model` is the
# trained model.
results = evaluation.run(embedding_model)

ModuleNotFoundError: No module named 'mteb'

### Exploring Cosine Similarity Loss

In [None]:
# Imported here because this cell is the first one that uses `Dataset`.
from datasets import Dataset

# Collapse the three NLI labels into a binary target:
# (neutral / contradiction) = 0 and (entailment) = 1
mapping = {2: 0, 1: 0, 0: 1}

# Rebuild the training set as (sentence1, sentence2, label) with one float
# label per example. float() is applied to each mapped label individually.
# NOTE: this cell overwrites `train_dataset`, so it reads the
# premise/hypothesis/label version and cannot be re-run on its own output.
train_dataset = Dataset.from_dict({
    'sentence1': train_dataset['premise'],
    'sentence2': train_dataset['hypothesis'],
    'label': [float(mapping[label]) for label in train_dataset['label']],
})

In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Build the STSB validation evaluator (labels divided by 5, cosine as main similarity)
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    main_similarity='cosine',
    scores=[gold / 5 for gold in val_sts['label']],
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
)

In [None]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Fresh BERT base model for the cosine-similarity-loss run
embedding_model = SentenceTransformer(model_name_or_path='bert-base-uncased')

# Cosine similarity loss bound to that model
train_loss = losses.CosineSimilarityLoss(embedding_model)

# Training configuration; checkpoints go to their own output directory
args = SentenceTransformerTrainingArguments(
    output_dir='cosineloss_embedding_model',
    fp16=True,
    num_train_epochs=1,
    warmup_steps=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=100,
    eval_steps=100,
)

# Build the trainer and run training
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    loss=train_loss,
    train_dataset=train_dataset,
    evaluator=evaluator,
)
trainer.train()

In [None]:
# Score the cosine-similarity-loss model with the STSB evaluator
evaluator(model=embedding_model)

### Exploring MNR Loss

In [None]:
import random
from tqdm import tqdm
from datasets import Dataset, load_dataset

# Load MNLI from GLUE and keep only the entailment pairs (label 0)
mnli = load_dataset('glue', 'mnli', split='train').select(range(50_000))
mnli = mnli.remove_columns('idx')
mnli = mnli.filter(lambda x: x['label'] == 0)

# Soft negatives: the same hypotheses, shuffled so each anchor is paired with a
# hypothesis drawn from a random row.
# - list(...) materialises the column so it can be shuffled in place.
# - A dedicated seeded Random keeps the pairing reproducible across runs
#   without touching the global `random` state.
# NOTE(review): a shuffled hypothesis can land back on its own row; confirm
# whether that matters for this experiment.
soft_negatives = list(mnli['hypothesis'])
random.Random(42).shuffle(soft_negatives)

# Assemble (anchor, positive, negative) triplets column-wise
train_dataset = Dataset.from_dict({
    'anchor': list(mnli['premise']),
    'positive': list(mnli['hypothesis']),
    'negative': soft_negatives,
})


In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Evaluator over the STSB validation split; gold labels are divided by 5
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=list(map(lambda score: score / 5, val_sts['label'])),
    main_similarity='cosine',
)

In [None]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Fresh BERT base model for the MNR-loss run
embedding_model = SentenceTransformer(model_name_or_path='bert-base-uncased')

# Multiple negatives ranking loss bound to that model
train_loss = losses.MultipleNegativesRankingLoss(embedding_model)

# Training configuration; checkpoints go to their own output directory
args = SentenceTransformerTrainingArguments(
    output_dir='mnrloss_embedding_model',
    fp16=True,
    num_train_epochs=1,
    warmup_steps=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=100,
    eval_steps=100,
)

# Build the trainer and run training
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    loss=train_loss,
    train_dataset=train_dataset,
    evaluator=evaluator,
)
trainer.train()

In [None]:
# Score the MNR-loss model with the STSB evaluator
evaluator(model=embedding_model)

# Fine-Tuning an Embedding Model: Supervised Learning

In [None]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# First 50,000 MNLI training pairs from GLUE, without the 'idx' column.
# Label ids: 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset = (
    load_dataset("glue", "mnli", split="train")
    .select(range(50_000))
    .remove_columns(['idx'])
)

# STSB validation evaluator (labels divided by 5, cosine as main similarity)
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    main_similarity='cosine',
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=[gold / 5 for gold in val_sts['label']],
)

In [None]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define model: start from an already-trained sentence embedding model.
# The Hugging Face Hub organisation is spelled with a hyphen
# ('sentence-transformers'); with an underscore the id does not resolve.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Loss function
# NOTE(review): `train_dataset` in this section still has the
# premise / hypothesis / label columns for all three NLI classes; confirm this
# is the intended input for MultipleNegativesRankingLoss.
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir='finetuned_embedding_model',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,
    warmup_steps=100,
    eval_steps=100,
    logging_steps=100
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
    )
trainer.train()

In [None]:
# Score the fine-tuned model with the STSB evaluator
evaluator(model=embedding_model)

### Augmented SBERT: Fine-tuning an embedding model with scarce labelled data

In [None]:
import pandas as pd
from tqdm import tqdm
from datasets import Dataset, load_dataset
from sentence_transformers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

# Gold set: the first 10,000 MNLI training pairs, used for the cross-encoder
dataset = load_dataset('glue', 'mnli', split='train').select(range(10_000))

# Binary relabelling: entailment (0) -> 1, neutral (1) and contradiction (2) -> 0
mapping = {0: 1, 1: 0, 2: 0}

# One InputExample per gold pair, batched by a loader of size 32
gold_examples = [
    InputExample(
        texts=[row['premise'], row['hypothesis']],
        label=mapping[row['label']],
    )
    for row in tqdm(dataset)
]
gold_dataloader = NoDuplicatesDataLoader(gold_examples, batch_size=32)

# The same gold pairs as a pandas DataFrame for easier handling
gold = pd.DataFrame(
    data={
        "sentence1": dataset['premise'],
        "sentence2": dataset['hypothesis'],
        "label": [mapping[nli_label] for nli_label in dataset['label']],
    }
)

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder

# Train a cross-encoder with two output labels on the gold dataset
cross_encoder = CrossEncoder('bert-base-uncased', num_labels=2)
cross_encoder.fit(
    train_dataloader=gold_dataloader,
    epochs=1,
    # Keyword is spelled with underscores; a hyphen here is a SyntaxError.
    show_progress_bar=True,
    warmup_steps=100,
    use_amp=False
)

In [None]:
# Silver set: MNLI training rows 10,000-49,999, to be labelled by the cross-encoder
silver = load_dataset('glue', 'mnli', split='train').select(range(10_000, 50_000))

# (premise, hypothesis) tuples in dataset order
pairs = [
    (premise, hypothesis)
    for premise, hypothesis in zip(silver['premise'], silver['hypothesis'])
]

In [None]:
import numpy as np

# Predict on every silver pair with the trained cross-encoder
output = cross_encoder.predict(pairs, apply_softmax=True, show_progress_bar=True)

# Replace `silver` with a DataFrame whose label is the argmax over axis 1 of the predictions
silver = pd.DataFrame(
    data={
        "sentence1": silver['premise'],
        "sentence2": silver['hypothesis'],
        "label": np.argmax(output, axis=1),
    }
)

In [None]:
# Stack gold on top of silver, then drop repeated sentence pairs (first occurrence wins)
data = (
    pd.concat([gold, silver], axis=0, ignore_index=True)
    .drop_duplicates(subset=['sentence1', 'sentence2'], keep='first')
)
train_dataset = Dataset.from_pandas(data, preserve_index=False)

In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# STSB validation evaluator (labels divided by 5, cosine as main similarity)
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    val_sts['sentence1'],
    val_sts['sentence2'],
    [gold_score / 5 for gold_score in val_sts['label']],
    main_similarity='cosine',
)

In [None]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define model
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss Function
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir='augmented_embedding_model',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # The argument is `warmup_steps`, as in the other training cells;
    # `warmup` is not an accepted keyword.
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

# Train model on the combined gold + silver dataset
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
    )
trainer.train()

In [None]:
# Score the model trained on gold + silver data with the STSB evaluator
evaluator(model=embedding_model)

# Unsupervised Learning : Transformer-Based Sequential Denoising Auto-Encoder

In [None]:
# Download additional tokenizer data.
# NOTE(review): newer NLTK releases look up 'punkt_tab' rather than 'punkt';
# both are requested so the cell works with either. Confirm against the
# installed NLTK version.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

from tqdm import tqdm
from datasets import load_dataset, Dataset
from sentence_transformers.datasets import DenoisingAutoEncoderDataset

# Create a flat list of sentences from the first 25,000 MNLI training rows.
# list(...) materialises each column so the two can be concatenated.
mnli = load_dataset('glue', 'mnli', split='train').select(range(25_000))
flat_sentences = list(mnli['premise']) + list(mnli['hypothesis'])

# Add noise to our input data.
# dict.fromkeys removes duplicates while keeping first-seen order, so the
# sentence order is the same on every run (a set gives no such guarantee).
damaged_data = DenoisingAutoEncoderDataset(list(dict.fromkeys(flat_sentences)))

# Collect (damaged, original) sentence pairs: texts[0] goes to
# 'damaged_sentence' and texts[1] to 'original_sentence'.
# The loop variable is not called `data`, so the gold + silver DataFrame of
# that name from the Augmented SBERT section is not overwritten.
sentence_pairs = {'damaged_sentence': [], 'original_sentence': []}
for example in tqdm(damaged_data):
    sentence_pairs['damaged_sentence'].append(example.texts[0])
    sentence_pairs['original_sentence'].append(example.texts[1])

train_dataset = Dataset.from_dict(sentence_pairs)

# Show the first pair
train_dataset[0]

In [None]:
# Package name is `sentence_transformers`
from sentence_transformers import models, SentenceTransformer

# Create your embedding model: a BERT transformer module followed by a pooling
# module constructed with the 'cls' pooling mode.
word_embedding_model = models.Transformer('bert-base-uncased')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
embedding_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
from sentence_transformers import losses

# Use the denoising autoencoder loss
train_loss = losses.DenoisingAutoEncoderLoss(embedding_model, tie_encoder_decoder=True)

# Move the decoder to whatever device the embedding model is on, rather than
# hard-coding 'cuda', so the cell does not fail on a machine without a GPU.
train_loss.decoder = train_loss.decoder.to(embedding_model.device)

In [None]:
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Training configuration for the TSDAE run (batch size 16)
args = SentenceTransformerTrainingArguments(
    output_dir='tsdae_embedding_model',
    fp16=True,
    num_train_epochs=1,
    warmup_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=100,
    eval_steps=100,
)

# Build the trainer and run training.
# `evaluator` is not redefined in this section; it is the STSB evaluator
# created in an earlier cell.
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    loss=train_loss,
    train_dataset=train_dataset,
    evaluator=evaluator,
)
trainer.train()

In [None]:
# Score the TSDAE-trained model with the STSB evaluator
evaluator(model=embedding_model)