<a href="https://colab.research.google.com/github/Anubh-debug/embedding_train/blob/main/basic_bert_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import the required modules. The cell below must be executed before any other cell in this notebook.

In [None]:
import random

import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from sentence_transformers import InputExample
from sentence_transformers import SentenceTransformer, losses, SentenceTransformerTrainingArguments
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.datasets import NoDuplicatesDataLoader # Corrected import path
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.trainer import SentenceTransformerTrainer
from tqdm import tqdm

In [None]:
# Base checkpoint that the first training run below fine-tunes.
embedding_model = SentenceTransformer("bert-base-uncased")

The code below trains the embedding model with softmax loss.

In [None]:
# Training data: the first 50,000 MNLI examples from GLUE, with the "idx"
# column dropped.
train_dataset = (
    load_dataset("glue", "mnli", split="train")
    .select(range(50_000))
    .remove_columns("idx")
)

# Softmax loss over three labels, sized to the model's sentence-embedding
# dimension.
train_loss = losses.SoftmaxLoss(
    model=embedding_model,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_labels=3,
)

# STS-B validation pairs drive the similarity evaluator; gold scores are
# divided by 5 so they land between 0 and 1.
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[gold_score / 5 for gold_score in val_sts["label"]],
    main_similarity="cosine",
)

# One epoch, batches of 32, mixed precision (fp16).
# NOTE(review): eval_steps presumably only takes effect when an evaluation
# strategy of "steps" is configured — confirm whether periodic evaluation
# during training was intended.
args = SentenceTransformerTrainingArguments(
    output_dir="/content/embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    logging_steps=100,
    eval_steps=100,
    fp16=True,
)

# Wire everything together and run the training loop.
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)
trainer.train()

In [None]:
# Score the model trained above with the STS-B evaluator built in the training
# cell; as the cell's last expression, the result is shown as the cell output.
evaluator(embedding_model)

Next, we try two more loss functions: cosine similarity loss and multiple negatives ranking (MNR) loss.

Cosine similarity loss

In [None]:
# Cosine-similarity loss needs a single similarity target per pair, so the
# three MNLI classes are collapsed to two values.
# MNLI labels: 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset = load_dataset(
    "glue", "mnli", split="train"
).select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")

# entailment -> 1.0 (similar); neutral / contradiction -> 0.0 (dissimilar)
mapping = {0: 1, 1: 0, 2: 0}
train_dataset = Dataset.from_dict(
    {
        "sentence1": train_dataset["premise"],
        "sentence2": train_dataset["hypothesis"],
        "label": [float(mapping[label]) for label in train_dataset["label"]],
    }
)

# Start from a fresh base checkpoint. Without this, the run would continue
# from the weights left behind by the previous (softmax) training cell and the
# score below would not measure cosine-similarity loss on its own.
embedding_model = SentenceTransformer("bert-base-uncased")

# cosine loss
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# STS-B validation evaluator; gold scores are divided by 5 to fall in [0, 1].
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score / 5 for score in val_sts["label"]],
    main_similarity="cosine",
)

# Define training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="/content/cosine_loss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,  # train with 16-bit floating point (mixed precision)
    eval_steps=100,
    logging_steps=100,
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)
trainer.train()

In [None]:
# Score the cosine-loss model on the STS-B validation pairs.
evaluator(embedding_model)

# Recorded output from an earlier run:
# {'pearson_cosine': 0.7265418343568838, 'spearman_cosine': 0.7280083600575848}

Using multiple negatives ranking (MNR) loss

In [None]:
# MNR loss is trained on (anchor, positive, negative) triplets. Positives come
# from MNLI entailment pairs (label 0); negatives are hypotheses drawn from
# other rows by shuffling.
mnli = load_dataset("glue", "mnli", split="train").select(range(50_000))
mnli = mnli.remove_columns("idx")
mnli = mnli.filter(lambda row: row["label"] == 0)  # keeping only entailment

# Build the three columns directly instead of appending row by row.
anchors = list(mnli["premise"])
positives = list(mnli["hypothesis"])

# Soft negatives: the same hypotheses in shuffled order. A dedicated, seeded
# generator keeps the pairing reproducible across runs without touching the
# global random state.
soft_negatives = list(positives)
random.Random(42).shuffle(soft_negatives)

train_dataset = Dataset.from_dict(
    {"anchor": anchors, "positive": positives, "negative": soft_negatives}
)

# STS-B validation evaluator; gold scores are divided by 5 to fall in [0, 1].
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score / 5 for score in val_sts["label"]],
    main_similarity="cosine",
)

# Start from a fresh base checkpoint so this run is not a continuation of the
# model trained by the earlier cells.
embedding_model = SentenceTransformer("bert-base-uncased")

# defining train loss
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# defining training args
args = SentenceTransformerTrainingArguments(
    output_dir="mnrloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# defining trainer
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)
trainer.train()

In [None]:
# Score the MNR-loss model on the STS-B validation pairs.
evaluator(embedding_model)
# Recorded output from an earlier run:
# {'pearson_cosine': 0.8091421864243222, 'spearman_cosine': 0.8134963799078724}

The most straightforward way to fine-tune an embedding model is to repeat the process of training our model as we did before but replace the 'bert-base-uncased' with a pretrained sentence-transformers model

In [None]:
# First 50,000 MNLI training examples from GLUE, minus the "idx" column.
# Label ids: 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset = (
    load_dataset("glue", "mnli", split="train")
    .select(range(50_000))
    .remove_columns("idx")
)

# STS-B validation evaluator; gold scores are divided by 5 before use.
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[gold_score / 5 for gold_score in val_sts["label"]],
    main_similarity="cosine",
)

In [None]:
# Fine-tune an already-trained sentence-transformers checkpoint instead of
# plain BERT.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# NOTE(review): train_dataset from the previous cell still carries all three
# MNLI labels; presumably this loss treats every (premise, hypothesis) row as a
# positive pair — confirm whether filtering to entailment was intended.
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# One epoch, batches of 32, mixed precision.
args = SentenceTransformerTrainingArguments(
    output_dir="finetuned_embed_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    logging_steps=100,
    eval_steps=100,
    fp16=True,
)

# Uses train_dataset and evaluator defined in the previous cell.
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)
trainer.train()

In [None]:
# Score the fine-tuned MiniLM model on the STS-B validation pairs.
evaluator(embedding_model)
# Recorded output from an earlier run:
# {'pearson_cosine': 0.8492843146977941, 'spearman_cosine': 0.8491189934593896}

Augmented SBERT. We first train a BERT cross-encoder on a small gold-labeled dataset. We then use that cross-encoder to label our unlabeled sentence pairs, creating a silver-labeled dataset. Finally, we fine-tune our BERT bi-encoder on the combined gold + silver dataset.

In [None]:
# Prepare a small gold-labeled dataset (first 10,000 MNLI pairs) to train the
# cross-encoder.
dataset = load_dataset("glue", "mnli", split="train").select(range(10_000))

# Collapse MNLI labels to binary: entailment (0) -> 1, neutral / contradiction -> 0.
mapping = {2: 0, 1: 0, 0: 1}

# One InputExample per row. The comprehension wraps the whole constructor
# call; the original placed `for row in dataset` inside the call's argument
# list, which is a syntax error, and misspelled the class name.
gold_examples = [
    InputExample(
        texts=[row["premise"], row["hypothesis"]],
        label=mapping[row["label"]],
    )
    for row in dataset
]
gold_dataloader = NoDuplicatesDataLoader(gold_examples, batch_size=32)

# pandas dataframe for easier data handling (later concatenated with the
# silver-labeled frame)
gold = pd.DataFrame(
    {
        "sentence1": dataset["premise"],
        "sentence2": dataset["hypothesis"],
        "label": [mapping[label] for label in dataset["label"]],
    }
)

Using this gold-labeled dataset, we can train our BERT cross-encoder.

In [None]:
# Two-label cross-encoder on top of plain BERT, trained for one epoch on the
# gold dataloader with automatic mixed precision turned off.
cross_encoder = CrossEncoder("bert-base-uncased", num_labels=2)
cross_encoder.fit(
    train_dataloader=gold_dataloader,
    epochs=1,
    warmup_steps=100,
    use_amp=False,
    show_progress_bar=True,
)

After training the cross-encoder, we can use it to label the remaining sentence pairs, producing our silver dataset.

In [None]:
# MNLI rows 10,000-49,999: the slice that follows the gold rows. Their pairs
# are labeled by the cross-encoder in the next cell.
silver_dataset = load_dataset("glue", "mnli", split="train").select(
    range(10_000, 50_000)
)

# (premise, hypothesis) tuples in dataset order.
pairs = [
    (premise, hypothesis)
    for premise, hypothesis in zip(
        silver_dataset["premise"], silver_dataset["hypothesis"]
    )
]

Label these sentence pairs with the trained cross-encoder.

In [None]:
# Softmax-normalised cross-encoder scores for every unlabeled pair.
output = cross_encoder.predict(pairs, apply_softmax=True)

# The silver label of each pair is the index of its highest score.
silver_labels = np.argmax(output, axis=1)
silver = pd.DataFrame(
    {
        "sentence1": silver_dataset["premise"],
        "sentence2": silver_dataset["hypothesis"],
        "label": silver_labels,
    }
)

In [None]:
# Stack gold on top of silver, then drop repeated sentence pairs. Gold rows
# come first, so keep="first" prefers the gold label when a pair appears in both.
data = pd.concat([gold, silver], ignore_index=True, axis=0).drop_duplicates(
    subset=["sentence1", "sentence2"], keep="first"
)
train_dataset = Dataset.from_pandas(data, preserve_index=False)

# STS-B validation evaluator; gold scores are divided by 5 before use.
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[gold_score / 5 for gold_score in val_sts["label"]],
    main_similarity="cosine",
)

In [None]:
# Bi-encoder: a fresh BERT base checkpoint trained on the gold + silver data.
embedding_model = SentenceTransformer("bert-base-uncased")

# Cosine-similarity loss against the 0/1 labels in train_dataset.
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# One epoch, batches of 32, mixed precision.
args = SentenceTransformerTrainingArguments(
    output_dir="augmented_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    logging_steps=100,
    eval_steps=100,
    fp16=True,
)

# Uses train_dataset and evaluator defined in the previous cell.
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)
trainer.train()

In [None]:
# Score the gold + silver trained model on the STS-B validation pairs.
evaluator(embedding_model)

Unsupervised Learning: TSDAE

The underlying idea of TSDAE is that we add noise to the input sentence by removing a certain percentage of words from it. This “damaged” sentence is put through an encoder, with a pooling layer on top of it, to map it to a sentence embedding. From this sentence embedding, a decoder tries to reconstruct the original sentence from the “damaged” sentence but without the artificial noise. The main concept here is that the more accurate the sentence embedding is, the more accurate the reconstructed sentence will be.

In [None]:
# Download the NLTK tokenizer data, presumably needed by the denoising dataset
# built in the next cell.
import nltk

# Recent NLTK releases load the Punkt tokenizer from the "punkt_tab" resource
# instead of the older pickled "punkt" package. Fetching both keeps this cell
# working regardless of the installed NLTK version.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

In [None]:
from sentence_transformers.datasets import DenoisingAutoEncoderDataset

# Create a flat list of sentences from the first 25,000 MNLI rows. list() makes
# the concatenation work whether the column access yields a plain list or a
# lazy column object.
mnli = load_dataset("glue", "mnli", split="train").select(range(25_000))
flat_sentences = list(mnli["premise"]) + list(mnli["hypothesis"])

# De-duplicate while keeping first-seen order; set() would give an order that
# varies between interpreter runs.
unique_sentences = list(dict.fromkeys(flat_sentences))

# Add noise to our input data: each item pairs a damaged sentence with its original.
damaged_data = DenoisingAutoEncoderDataset(unique_sentences)

# Collect (damaged, original) pairs into columns. Both keys are declared with
# ":" and the loop appends to the same "damaged_sentence" key that was declared.
tsdae_columns = {"damaged_sentence": [], "original_sentence": []}
for example in tqdm(damaged_data):
    tsdae_columns["damaged_sentence"].append(example.texts[0])
    tsdae_columns["original_sentence"].append(example.texts[1])

train_dataset = Dataset.from_dict(tsdae_columns)

In [None]:
# STS-B validation evaluator for the TSDAE run; gold scores are divided by 5
# before use.
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[gold_score / 5 for gold_score in val_sts["label"]],
    main_similarity="cosine",
)