<a href="https://colab.research.google.com/github/Anubh-debug/embedding_train/blob/main/basic_bert_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

important modules to import. below cell must be executed.

In [None]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.trainer import SentenceTransformerTrainer

# use a base model
embedding_model = SentenceTransformer('bert-base-uncased')

below code is for softmax loss

In [None]:
# load mnli dataset from glue
train_dataset=load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset=train_dataset.remove_columns("idx")

# using softmax loss
train_loss = losses.SoftmaxLoss(model=embedding_model, sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(), num_labels=3)


# create an embedding similarity evaluator for stsb
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator=EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]], #making scores between 0 and 1
    main_similarity='cosine'
)

# Define training arguments
args=SentenceTransformerTrainingArguments(
    output_dir = "/content/embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True, #computations will be formed on 16-bit floating point numbers.
    eval_steps=100,
    logging_steps=100
)

# training our model
trainer=SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

In [None]:
# evaluating our embedding model
evaluator(embedding_model)

we are going to try two more loss functions: cosine loss and MNR loss

cosine loss

In [2]:
from datasets import load_dataset, Dataset
# for cosine similarity, we are going to use only two labels.
# Load MNLI dataset from GLUE
# 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset = load_dataset(
    "glue", "mnli", split="train"
).select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")
mapping={0:1, 1:0, 2:0}
train_dataset=Dataset.from_dict(
    {
        "sentence1": train_dataset["premise"],
        "sentence2": train_dataset["hypothesis"],
        "label": [float(mapping[label]) for label in train_dataset["label"]]
    }
)

# cosine loss
train_loss=losses.CosineSimilarityLoss(model=embedding_model)

# again load evaluator
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator=EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity='cosine'
)

# training_args
# Define training arguments
args=SentenceTransformerTrainingArguments(
    output_dir = "/content/cosine_loss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True, #computations will be formed on 16-bit floating point numbers.
    eval_steps=100,
    logging_steps=100
)

trainer=SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
100,0.2296
200,0.1716
300,0.1722
400,0.1579
500,0.1527
600,0.1568
700,0.1492
800,0.1571
900,0.1502
1000,0.1467


TrainOutput(global_step=1563, training_loss=0.15709608606398295, metrics={'train_runtime': 680.2148, 'train_samples_per_second': 73.506, 'train_steps_per_second': 2.298, 'total_flos': 0.0, 'train_loss': 0.15709608606398295, 'epoch': 1.0})

In [3]:
evaluator(embedding_model)

{'pearson_cosine': 0.7265418343568838, 'spearman_cosine': 0.7280083600575848}

using multiple negative ranking loss

In [5]:
mnli = load_dataset("glue", "mnli", split="train").select(range(50_000))
mnli = mnli.remove_columns("idx")
mnli

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 50000
})

In [11]:
mnli.features['label']

ClassLabel(names=['entailment', 'neutral', 'contradiction'])

In [6]:
mnli=mnli.filter(lambda x: True if x["label"]==0 else False) #keeping only entailment
mnli

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 16875
})