<a href="https://colab.research.google.com/github/Anubh-debug/embedding_train/blob/main/basic_bert_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import the required modules and load the base model. The cell below must be executed before any other cell.

In [None]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, Dataset
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.trainer import SentenceTransformerTrainer
from tqdm import tqdm
import random

# load the base checkpoint that the training cells below fine-tune
base_checkpoint = 'bert-base-uncased'
embedding_model = SentenceTransformer(base_checkpoint)

The code below fine-tunes the model with softmax loss.

In [None]:
# first 50k MNLI training pairs from GLUE, with the "idx" column dropped
mnli_train = load_dataset("glue", "mnli", split="train")
train_dataset = mnli_train.select(range(50_000)).remove_columns("idx")

# softmax (classification) loss over the three MNLI labels
embedding_dim = embedding_model.get_sentence_embedding_dimension()
train_loss = losses.SoftmaxLoss(
    model=embedding_model,
    sentence_embedding_dimension=embedding_dim,
    num_labels=3,
)

# embedding-similarity evaluator built from the STS-B validation split
val_sts = load_dataset("glue", "stsb", split="validation")
gold_scores = [label / 5 for label in val_sts["label"]]  # rescale scores to lie between 0 and 1
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=gold_scores,
    main_similarity='cosine',
)

# training configuration
# NOTE(review): no evaluation strategy is set here, so `eval_steps` likely has
# no effect during training -- confirm against the installed library version.
training_kwargs = dict(
    output_dir="/content/embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,  # computations are performed in 16-bit floating point
    eval_steps=100,
    logging_steps=100,
)
args = SentenceTransformerTrainingArguments(**training_kwargs)

# fine-tune the model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)
trainer.train()

In [None]:
# score the current embedding model with the STS-B evaluator and display the result
sts_scores = evaluator(embedding_model)
sts_scores

We are going to try two more loss functions: cosine similarity loss and multiple negatives ranking (MNR) loss.

Cosine similarity loss

In [None]:
# Start from a fresh base checkpoint so this run measures cosine-similarity
# loss on its own. Reusing the `embedding_model` left over from the softmax
# cell would stack this fine-tuning on top of the earlier one and make the
# cell non-idempotent when re-run.
embedding_model = SentenceTransformer('bert-base-uncased')

# Load the first 50k MNLI training pairs from GLUE.
# MNLI labels: 0 = entailment, 1 = neutral, 2 = contradiction
mnli_train = load_dataset("glue", "mnli", split="train").select(range(50_000))

# Cosine-similarity loss is trained against a float target per sentence pair,
# so collapse the three classes into two: entailment -> 1.0, everything else -> 0.0.
mapping = {0: 1, 1: 0, 2: 0}
train_dataset = Dataset.from_dict(
    {
        "sentence1": mnli_train["premise"],
        "sentence2": mnli_train["hypothesis"],
        "label": [float(mapping[label]) for label in mnli_train["label"]],
    }
)

# cosine-similarity loss bound to the fresh model
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# embedding-similarity evaluator built from the STS-B validation split
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score / 5 for score in val_sts["label"]],  # rescale scores to lie between 0 and 1
    main_similarity='cosine',
)

# training arguments
# NOTE(review): no evaluation strategy is set here, so `eval_steps` likely has
# no effect during training -- confirm against the installed library version.
args = SentenceTransformerTrainingArguments(
    output_dir="/content/cosine_loss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,  # computations are performed in 16-bit floating point
    eval_steps=100,
    logging_steps=100,
)

# fine-tune the model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)
trainer.train()

In [None]:
# score the current embedding model with the STS-B evaluator and display the result
sts_scores = evaluator(embedding_model)
sts_scores

# output: {'pearson_cosine': 0.7265418343568838, 'spearman_cosine': 0.7280083600575848}

Using multiple negatives ranking (MNR) loss

In [None]:
# Start from a fresh base checkpoint so this run measures MNR loss on its own.
# Reusing the `embedding_model` left over from the previous training cells
# would stack this fine-tuning on top of them and make the cell
# non-idempotent when re-run.
embedding_model = SentenceTransformer('bert-base-uncased')

# Load the first 50k MNLI training pairs and keep only entailment pairs (label 0).
mnli = load_dataset("glue", "mnli", split="train").select(range(50_000))
mnli = mnli.remove_columns("idx")
mnli = mnli.filter(lambda x: x["label"] == 0)

# Build (anchor, positive, negative) triplets. The "soft" negative for each row
# is a hypothesis drawn from a shuffled copy of all hypotheses.
premises = list(mnli["premise"])
hypotheses = list(mnli["hypothesis"])
soft_negatives = list(hypotheses)
# A seeded local RNG keeps the triplets identical across runs without touching
# the global `random` state. A shuffled negative can occasionally land on its
# own row's positive; that is left as-is, matching the original sampling.
random.Random(42).shuffle(soft_negatives)

train_dataset = Dataset.from_dict(
    {
        "anchor": premises,
        "positive": hypotheses,
        "negative": soft_negatives,
    }
)

# embedding-similarity evaluator built from the STS-B validation split
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score / 5 for score in val_sts["label"]],  # rescale scores to lie between 0 and 1
    main_similarity="cosine",
)

# multiple negatives ranking loss bound to the fresh model
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# training arguments
# NOTE(review): no evaluation strategy is set here, so `eval_steps` likely has
# no effect during training -- confirm against the installed library version.
args = SentenceTransformerTrainingArguments(
    output_dir="mnrloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,  # computations are performed in 16-bit floating point
    eval_steps=100,
    logging_steps=100,
)

# fine-tune the model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)
trainer.train()

In [None]:
# Score the current embedding model with the STS-B evaluator. The call is kept
# as the cell's last statement so the notebook displays the returned scores;
# the recorded result below is a comment, not code.
evaluator(embedding_model)

# output: {'pearson_cosine': 0.8091421864243222, 'spearman_cosine': 0.8134963799078724}