## Fine-tune a bi-encoder (dual-encoder) model

In [None]:
# Disabled on purpose: uncomment both lines below to run huggingface_hub's notebook_login().
# NOTE(review): presumably only needed for private or gated Hub repositories — confirm.
# from huggingface_hub import notebook_login

# notebook_login()

In [2]:
from pathlib import Path

# Directory (under the Colab Google Drive mount point) holding the tab-separated training files.
DATA_DIR = Path('/content/drive/MyDrive/Colab Notebooks/data')

# Path.glob() yields entries in arbitrary, filesystem-dependent order; sorting makes the
# order in which the files are read identical on every run.
paths = sorted(str(path) for path in DATA_DIR.glob('*.tsv'))

In [3]:
!pip install transformers sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from sentence_transformers import InputExample
from tqdm.auto import tqdm

# Build one InputExample per well-formed "query<TAB>passage" line across all TSV files.
query_pairs = []
for path in tqdm(paths, desc='Reading TSV files'):
    with open(path, 'r', encoding='utf-8') as fp:
        for line in fp:
            fields = line.rstrip('\n').split('\t')
            # Keep only rows with exactly two columns. Rows without a tab are skipped
            # (as before); rows with extra tabs are now skipped as well instead of
            # raising ValueError when unpacking into (q, p).
            if len(fields) != 2:
                continue
            q, p = fields
            query_pairs.append(InputExample(texts=[q, p]))

In [5]:
# Sanity check: number of InputExample pairs collected from the TSV files.
len(query_pairs)

15129

In [6]:
from sentence_transformers import datasets

# Number of InputExample pairs placed in each training batch.
batch_size = 8

# Wrap the parsed pairs in sentence-transformers' NoDuplicatesDataLoader.
loader = datasets.NoDuplicatesDataLoader(query_pairs, batch_size=batch_size)

In [7]:
import torch

# Use the first CUDA GPU when PyTorch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"


### Load Model - RoBERTa

In [8]:
from sentence_transformers import models, SentenceTransformer

# Transformer module built from the pretrained 'roberta-base' checkpoint, max_seq_length set to 512.
roberta = models.Transformer('roberta-base', max_seq_length=512)

# Pooling module in mean-token mode, sized to the transformer's word-embedding dimension.
embedding_dim = roberta.get_word_embedding_dimension()
pooler = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

# Chain the transformer and the pooling layer into one SentenceTransformer.
model = SentenceTransformer(modules=[roberta, pooler])

# Move the model to the selected device; as the last expression of the cell,
# the returned model is also displayed.
model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

### Set Loss Function

In [9]:
from sentence_transformers import losses

# Ranking loss bound to the current `model`; it is paired with `loader` in the training call.
loss = losses.MultipleNegativesRankingLoss(model=model)

In [10]:
import os

# Set PYTORCH_CUDA_ALLOC_CONF so the CUDA allocator is configured with max_split_size_mb=45.
# NOTE(review): this runs after the model has already been moved to `device`. The variable
# is presumably only read when the CUDA caching allocator initializes, so it may have no
# effect at this point — confirm, and consider setting it before the first CUDA call.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:45"})


### Train

In [11]:
# Schedule: 3 epochs over `loader`; warmup_steps is 10% of the total number of training steps.
epochs = 3
total_steps = len(loader) * epochs
warmup_steps = int(total_steps * 0.1)

# Fine-tune on the (loader, loss) objective and write the model to the Drive output folder.
fit_options = dict(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='/content/drive/MyDrive/Colab Notebooks/models/finetuned_roberta',
    show_progress_bar=True,
)
model.fit(**fit_options)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1891 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1891 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1891 [00:00<?, ?it/s]

## Train MPNet

In [12]:
from sentence_transformers import models, SentenceTransformer

# Transformer module built from the pretrained 'microsoft/mpnet-base' checkpoint,
# max_seq_length set to 512.
mpnet = models.Transformer('microsoft/mpnet-base', max_seq_length=512)

# Pooling module in mean-token mode, sized to the transformer's word-embedding dimension.
embedding_dim = mpnet.get_word_embedding_dimension()
pooler = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

# Rebind `model` to a new SentenceTransformer made of the MPNet transformer and the pooling layer.
model = SentenceTransformer(modules=[mpnet, pooler])

# Move the model to the selected device; as the last expression of the cell,
# the returned model is also displayed.
model.to(device)

Some weights of the model checkpoint at microsoft/mpnet-base were not used when initializing MPNetModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing MPNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

### Set Loss Function

In [13]:
from sentence_transformers import losses

# Rebuild the ranking loss so it is bound to the MPNet-based `model` defined just above.
loss = losses.MultipleNegativesRankingLoss(model=model)

In [14]:
import os

# Set PYTORCH_CUDA_ALLOC_CONF so the CUDA allocator is configured with max_split_size_mb=45.
# NOTE(review): models have already been moved to `device` earlier in the notebook. The
# variable is presumably only read when the CUDA caching allocator initializes, so this
# assignment may have no effect — confirm, and consider setting it before any CUDA call.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:45"})


### Train

In [15]:
# Schedule: 3 epochs over `loader`; warmup_steps is 10% of the total number of training steps.
epochs = 3
total_steps = len(loader) * epochs
warmup_steps = int(total_steps * 0.1)

# Fine-tune on the (loader, loss) objective and write the model to the Drive output folder.
fit_options = dict(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='/content/drive/MyDrive/Colab Notebooks/models/finetuned_mpnet',
    show_progress_bar=True,
)
model.fit(**fit_options)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1891 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1891 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1891 [00:00<?, ?it/s]