# Bi-Encoder model

## Data loading

In [1]:
from datasets import load_from_disk
from src.utils.config_management import CONFIG

In [2]:
hf_dataset = load_from_disk(CONFIG['paths']['data']['dalip_hf_dataset'])

In [3]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 

## Data preprocessing

In [4]:
from src.utils.text_preprocessing import Preprocessor

In [5]:
TARGET_COL = 'answer_normalized_score'

In [6]:
preprocessor = Preprocessor(preserve_html_tags=['code'])

In [7]:
hf_dataset = hf_dataset.map(preprocessor, batched=True)

In [8]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted', 'question_text', 'answer_text'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_cr

## Fine-tuning

In [9]:
import torch

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

MODEL_PATH = 'mmukh/SOBertLarge'
MODEL_NAME = MODEL_PATH.split('/')[-1]
MAX_LENGTH = 1024
BATCH_SIZE = 1
EMBEDDINGS_POOLING = 'mean'
LOSS = 'MSE'

### Define model

In [11]:
from dataclasses import dataclass
import torch.nn as nn
from transformers import PreTrainedTokenizerBase, MegatronBertModel, PreTrainedTokenizerFast

2025-04-27 13:21:06.133076: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-27 13:21:06.180953: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-27 13:21:06.180982: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-27 13:21:06.182104: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-27 13:21:06.189652: I tensorflow/core/platform/cpu_feature_guar

In [12]:
encoder_model = MegatronBertModel.from_pretrained(MODEL_PATH)

Some weights of MegatronBertModel were not initialized from the model checkpoint at mmukh/SOBertLarge and are newly initialized: ['embeddings.token_type_embeddings.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_PATH)
if not tokenizer.pad_token:
    pad_token_id = encoder_model.embeddings.word_embeddings.padding_idx
    print(f'Setting pad token id to {pad_token_id}...')
    tokenizer.pad_token_id = pad_token_id
    print(f'Pad token set to {tokenizer.pad_token}')

Setting pad token id to 0...
Pad token set to <unk>


In [14]:
class BiEncoderRegressor(nn.Module):
    def __init__(self, encoder_model, embeddings_pooling):
        super().__init__()
        self.encoder = encoder_model
        self.embeddings_pooling = embeddings_pooling
        self.hidden_size = self.encoder.embeddings.word_embeddings.embedding_dim

        self.regressor = nn.Sequential(
            nn.Linear(2 * self.hidden_size, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )
        self.regressor = self.regressor

    def get_sentence_embeddings(self, tokenized_inputs):
        outputs = self.encoder(**tokenized_inputs)

        attention_mask = tokenized_inputs['attention_mask']
        last_hidden_state = outputs.last_hidden_state

        if self.embeddings_pooling == 'mean':
            attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size())
            sum_embeddings = (last_hidden_state * attention_mask_expanded).sum(dim=1)
            sum_mask = attention_mask_expanded.sum(dim=1)
            pooled = sum_embeddings / sum_mask

        return pooled

    def gradient_checkpointing_enable(self, **gradient_checkpointing_kwargs):
        if hasattr(self.encoder, 'gradient_checkpointing_enable'):
            self.encoder.gradient_checkpointing_enable()
        else:
            raise NotImplementedError('Encoder model does not support gradient checkpointing.')

    def gradient_checkpointing_disable(self, **gradient_checkpointing_kwargs):
        if hasattr(self.encoder, 'gradient_checkpointing_disable'):
            self.encoder.gradient_checkpointing_disable()
        else:
            raise NotImplementedError('Encoder model does not support gradient checkpointing.')

    def forward(self, questions_tokenized, answers_tokenized, labels=None):
        question_embeddings = self.get_sentence_embeddings(questions_tokenized)
        answer_embeddings = self.get_sentence_embeddings(answers_tokenized)

        combined = torch.cat([question_embeddings, answer_embeddings], dim=1)

        logits = self.regressor(combined).squeeze(-1)

        return {'logits': logits}

In [15]:
model = BiEncoderRegressor(encoder_model, embeddings_pooling=EMBEDDINGS_POOLING).to(device)

### Train model

In [16]:
from transformers import Trainer, TrainingArguments
import os
from src.evaluation import RankingEvaluator
import wandb
import pandas as pd

In [17]:
@dataclass
class BiEncoderDataCollator:
    tokenizer: PreTrainedTokenizerBase
    padding: bool = True

    def __call__(self, batch):
        question_texts = []
        answer_texts = []
        labels = []

        for sample in batch:
            question_texts.append(sample['question_text'])
            answer_texts.append(sample['answer_text'])
            labels.append(sample[TARGET_COL])

        questions_tokenized = self.tokenizer(question_texts, padding=self.padding, truncation=True, max_length=MAX_LENGTH,
                                             return_tensors='pt')
        answers_tokenized = self.tokenizer(answer_texts, padding=self.padding, truncation=True, max_length=MAX_LENGTH,
                                           return_tensors='pt')

        labels = torch.tensor([sample[TARGET_COL] for sample in batch]).float()

        collated_batch = {
            'questions_tokenized': questions_tokenized,
            'answers_tokenized': answers_tokenized,
            'labels': labels
        }

        return collated_batch

In [18]:
data_collator = BiEncoderDataCollator(tokenizer=tokenizer)

In [19]:
def trainer_loss_fn(outputs, labels, num_items_in_batch=None):
    logits = outputs['logits']

    if LOSS == 'MSE':
        loss_fn = nn.MSELoss()

    loss = loss_fn(logits, labels)

    return loss

In [20]:
test_question_ids = hf_dataset['test']['question_id']

evaluator = RankingEvaluator(ndcg_k=list(range(1, 11)),
                             ndcg_gain_func='exponential', ndcg_discount_func='logarithmic')

def compute_metrics(eval_pred):
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids

    metrics = evaluator(labels, predictions, test_question_ids)
    if TARGET_COL == 'answer_log_normalized_score':
        metrics.pop('mae')

    predictions_df = pd.DataFrame()
    predictions_df['answer_id'] = hf_dataset['test']['answer_id']
    predictions_df[TARGET_COL] = hf_dataset['test'][TARGET_COL]
    predictions_df['predicted_score'] = predictions

    wandb.log({'predictions_table': wandb.Table(dataframe=predictions_df)})

    return metrics

In [21]:
MODEL_OUTPUT_PATH = os.path.join(CONFIG['paths']['models']['dalip_bi-encoder'],
                                 f'bi-encoder_{MODEL_NAME}_{EMBEDDINGS_POOLING}')

training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_PATH,
    logging_steps=1,
    eval_steps=int(len(hf_dataset['train']) / BATCH_SIZE / 2),
    eval_strategy = "steps",
    save_strategy = "epoch",
    save_total_limit=1,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to='wandb',
    remove_unused_columns=False,
    # gradient_checkpointing=True,
    # fp16=True,
)

In [22]:
run = wandb.init(
    project='dalip-stackoverflow-answer-ranking',
    tags=['bi-encoder']
)

wandb.config.update({
    'preprocessing': preprocessor.__dict__,
    'model_name': MODEL_NAME,
    'vectorizer': {
        'vectorization_type': 'embeddings',
        'embeddings_pooling': EMBEDDINGS_POOLING,
        'max_length': MAX_LENGTH
    },
    'loss_fn': LOSS
})

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=hf_dataset['train'],
    eval_dataset=hf_dataset['test'],
    compute_loss_func=trainer_loss_fn,
    compute_metrics=compute_metrics
)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbunnynobugs[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [23]:
trainer.train()
run.finish()



OutOfMemoryError: CUDA out of memory. Tried to allocate 36.00 MiB. GPU 