# Cross-Encoder ranking model

## Data loading

In [1]:
from datasets import load_from_disk
from src.utils.config_management import CONFIG

In [2]:
hf_dataset = load_from_disk(CONFIG['paths']['data']['dalip_hf_dataset'])

In [3]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 

## Data preprocessing

In [4]:
from src.utils.text_preprocessing import Preprocessor

In [5]:
preprocessor = Preprocessor(preserve_html_tags=['code'])

In [6]:
hf_dataset = hf_dataset.map(preprocessor, batched=True)

In [7]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted', 'question_text', 'answer_text'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_cr

## Fine-tuning

### Create pairs dataset

In [8]:
import torch
import os
import math
import pandas as pd
from src.data_management import create_pairs_dataset_df
from datasets import Dataset

In [9]:
# Seed torch before any model is instantiated: the sequence-classification
# head is not part of the checkpoint and is initialised randomly, so without
# a fixed seed two runs of this notebook start from different weights.
SEED = 42
torch.manual_seed(SEED)

# Device the ranking model is moved to.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Dataset column used as the relevance target.
TARGET_COL = 'answer_normalized_score'
# Arguments forwarded to create_pairs_dataset_df for the training split.
PAIRS_SAMPLING_STRATEGY = 'mean'
N_SAMPLES = 'all'
# Hub id of the pretrained checkpoint; MODEL_NAME is its last path component.
MODEL_PATH = 'mmukh/SOBertLarge'
MODEL_NAME = MODEL_PATH.split('/')[-1]
# NOTE(review): MAX_LENGTH is only recorded in the W&B run config below; it is
# not passed to the tokenizer call in the data collator — confirm intent.
MAX_LENGTH = 1024
BATCH_SIZE = 1
# Accumulate gradients so that each optimizer step covers at least 4 samples
# per device regardless of BATCH_SIZE.
GRADIENT_ACCUMULATION_STEPS = math.ceil(4 / BATCH_SIZE)
# Key selecting the training loss; also used as the metric name at evaluation.
LOSS = 'margin_ranking_loss'
MODEL_OUTPUT_PATH = os.path.join(CONFIG['paths']['models']['dalip_cross-encoder_ranking'],
                                 f'cross-encoder_ranking_{MODEL_NAME}')

In [10]:
# Materialise both splits as pandas frames (train first, then test) for the
# pair-construction step.
train_dataset_df, test_dataset_df = (
    pd.DataFrame(hf_dataset[split_name]) for split_name in ('train', 'test')
)

In [11]:
# Training pairs follow the strategy / sample count chosen in the config cell.
train_pairs_dataset_df = create_pairs_dataset_df(
    train_dataset_df,
    pairs_sampling_strategy=PAIRS_SAMPLING_STRATEGY,
    n=N_SAMPLES,
    TARGET_COL=TARGET_COL,
)
# Test pairs are always built with the fixed 'mean' / 'all' setting,
# independently of the training configuration.
test_pairs_dataset_df = create_pairs_dataset_df(
    test_dataset_df,
    pairs_sampling_strategy='mean',
    n='all',
    TARGET_COL=TARGET_COL,
)

# The data collator reads the answer from 'answer_1_text', so the pointwise
# test split is renamed to match. The guard keeps this cell re-runnable:
# hf_dataset['test'] is overwritten in place, so on a second execution the
# 'answer_text' column no longer exists and an unconditional rename would fail.
if 'answer_text' in hf_dataset['test'].column_names:
    hf_dataset['test'] = hf_dataset['test'].rename_column('answer_text', 'answer_1_text')

100%|██████████| 7776/7776 [01:29<00:00, 86.89it/s] 
100%|██████████| 1945/1945 [00:22<00:00, 85.33it/s] 


In [12]:
train_pairs_dataset = Dataset.from_pandas(train_pairs_dataset_df)

### Define model and data collator

In [13]:
from dataclasses import dataclass
from typing import Optional

import torch.nn as nn
from transformers import PreTrainedTokenizerBase, MegatronBertForSequenceClassification, PreTrainedTokenizerFast

2025-05-17 20:06:08.356080: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-17 20:06:08.425033: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-17 20:06:08.425070: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-17 20:06:08.427867: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-17 20:06:08.440430: I tensorflow/core/platform/cpu_feature_guar

In [14]:
scorer_model = MegatronBertForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=1)

Some weights of MegatronBertForSequenceClassification were not initialized from the model checkpoint at mmukh/SOBertLarge and are newly initialized: ['bert.embeddings.token_type_embeddings.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_PATH)
if not tokenizer.pad_token:
    # The tokenizer ships without a pad token; reuse the id the model's word
    # embedding layer already treats as padding.
    pad_token_id = scorer_model.bert.embeddings.word_embeddings.padding_idx
    if pad_token_id is None:
        # Assigning None would leave the tokenizer without a pad token and the
        # failure would only surface later, when the collator pads a batch.
        raise ValueError('Tokenizer has no pad token and the model embeddings define no padding_idx.')
    print(f'Setting pad token id to {pad_token_id}...')
    tokenizer.pad_token_id = pad_token_id
    print(f'Pad token set to {tokenizer.pad_token}')

Setting pad token id to 0...
Pad token set to <unk>


In [16]:
class CrossEncoderRanker(nn.Module):
    """Wrap a sequence scorer so it can score one or two (question, answer) pairs.

    The wrapped ``scorer_model`` is called with the tokenized pair as keyword
    arguments and must return an object exposing ``.logits``; the trailing
    dimension of the logits is squeezed away to give one score per sample.
    """

    def __init__(self, scorer_model):
        super().__init__()
        self.scorer = scorer_model

    def gradient_checkpointing_enable(self, **gradient_checkpointing_kwargs):
        """Enable gradient checkpointing on the wrapped scorer.

        All keyword arguments are forwarded unchanged to the scorer's own
        ``gradient_checkpointing_enable`` so that options supplied by the
        caller are honoured instead of being silently discarded.

        Raises:
            NotImplementedError: if the scorer has no such method.
        """
        if not hasattr(self.scorer, 'gradient_checkpointing_enable'):
            raise NotImplementedError('Scorer model does not support gradient checkpointing.')
        self.scorer.gradient_checkpointing_enable(**gradient_checkpointing_kwargs)

    def gradient_checkpointing_disable(self, **gradient_checkpointing_kwargs):
        """Disable gradient checkpointing on the wrapped scorer.

        Keyword arguments are accepted for signature symmetry with
        ``gradient_checkpointing_enable`` but are not forwarded.

        Raises:
            NotImplementedError: if the scorer has no such method.
        """
        if not hasattr(self.scorer, 'gradient_checkpointing_disable'):
            raise NotImplementedError('Scorer model does not support gradient checkpointing.')
        self.scorer.gradient_checkpointing_disable()

    def _score(self, pairs_tokenized):
        """Return the scorer's logits for one tokenized batch, last dim squeezed."""
        return self.scorer(**pairs_tokenized).logits.squeeze(-1)

    def forward(self, pairs_1_tokenized, pairs_2_tokenized=None, labels=None):
        """Score the first pair batch and, when given, the second one.

        Args:
            pairs_1_tokenized: mapping of keyword arguments for the scorer.
            pairs_2_tokenized: optional second mapping scored the same way.
            labels: accepted for caller compatibility; not used here.

        Returns:
            dict with ``'pair_1_scores'`` and, only when ``pairs_2_tokenized``
            is provided, ``'pair_2_scores'``.
        """
        outputs = {'pair_1_scores': self._score(pairs_1_tokenized)}

        if pairs_2_tokenized is not None:
            outputs['pair_2_scores'] = self._score(pairs_2_tokenized)

        return outputs

In [17]:
model = CrossEncoderRanker(scorer_model).to(device)

In [18]:
@dataclass
class CrossEncoderPairwiseDataCollator:
    """Collate samples into tokenized (question, answer) pair batches.

    Each sample must provide ``'question_text'`` and ``'answer_1_text'``.
    Samples that also provide ``'answer_2_text'`` yield a second tokenized
    batch. The label is read from ``'label'`` when present, otherwise from the
    column named by the module-level ``TARGET_COL``.
    """
    tokenizer: PreTrainedTokenizerBase
    padding: bool = True
    # Optional cap on the tokenized pair length, forwarded to the tokenizer.
    # None (the default) leaves the limit to the tokenizer itself; pass e.g.
    # max_length=MAX_LENGTH to enforce the notebook's configured length.
    max_length: Optional[int] = None

    def _tokenize_pairs(self, question_texts, answer_texts):
        """Tokenize questions paired with answers into padded 'pt' tensors."""
        return self.tokenizer(question_texts, answer_texts, padding=self.padding, truncation='longest_first',
                              max_length=self.max_length, return_tensors='pt')

    def __call__(self, batch):
        """Build the model inputs for a list of samples.

        Returns:
            dict with ``'pairs_1_tokenized'`` and ``'labels'`` (float tensor),
            plus ``'pairs_2_tokenized'`` when any sample had ``'answer_2_text'``.
        """
        question_texts = []
        answer_1_texts = []
        answer_2_texts = []
        labels = []

        for sample in batch:
            question_texts.append(sample['question_text'])
            answer_1_texts.append(sample['answer_1_text'])
            if 'answer_2_text' in sample:
                answer_2_texts.append(sample['answer_2_text'])
            if 'label' in sample:  # if training
                labels.append(sample['label'])
            else:  # if evaluation
                labels.append(sample[TARGET_COL])

        collated_batch = {
            'pairs_1_tokenized': self._tokenize_pairs(question_texts, answer_1_texts),
            'labels': torch.tensor(labels).float()
        }

        if answer_2_texts:
            collated_batch['pairs_2_tokenized'] = self._tokenize_pairs(question_texts, answer_2_texts)

        return collated_batch

In [19]:
data_collator = CrossEncoderPairwiseDataCollator(tokenizer=tokenizer)

### Train model

In [20]:
from transformers import Trainer, TrainingArguments
from src.evaluation import RankingEvaluator
import wandb
import pandas as pd

In [21]:
# Resolve the configured loss name to a loss module. An unknown name fails
# here instead of leaving loss_fn undefined for later cells.
if LOSS == 'margin_ranking_loss':
    loss_fn = nn.MarginRankingLoss(margin=1.0)
else:
    raise ValueError(f'Unsupported LOSS value: {LOSS!r}')

def trainer_loss_fn(outputs, labels, num_items_in_batch=None):
    """Compute the pairwise loss from the ranker's output dict.

    Args:
        outputs: dict with ``'pair_1_scores'`` and optionally ``'pair_2_scores'``.
        labels: target tensor passed to the module-level ``loss_fn``.
        num_items_in_batch: accepted for caller compatibility; not used.

    Returns:
        ``loss_fn(pair_1_scores, pair_2_scores, labels)`` when both score sets
        are present (training); otherwise a scalar zero placeholder
        (evaluation, where only one answer per sample is scored).
    """
    pair_1_scores = outputs['pair_1_scores']

    if 'pair_2_scores' not in outputs:  # if evaluation
        # Create the placeholder on the same device as the scores so it never
        # mixes a CPU tensor with accelerator tensors downstream.
        return torch.zeros((), device=pair_1_scores.device)

    # if training
    return loss_fn(pair_1_scores, outputs['pair_2_scores'], labels)

In [22]:
# Question id of every test row, in dataset order; used to group predictions
# per question inside the evaluator.
test_question_ids = hf_dataset['test']['question_id']

evaluator = RankingEvaluator(ndcg_k=list(range(1, 11)),
                             ndcg_gain_func='exponential', ndcg_discount_func='logarithmic')

def compute_metrics(eval_pred):
    """Compute the pairwise loss and ranking metrics for one evaluation pass.

    Predictions are aligned positionally with the rows of ``hf_dataset['test']``,
    joined onto ``test_pairs_dataset_df`` through the answer ids, and the
    configured loss is evaluated on the resulting score pairs. The ranking
    metrics come from the module-level ``evaluator``. The per-answer
    predictions are also logged to W&B as a table.

    Args:
        eval_pred: object exposing ``predictions`` and ``label_ids``.

    Returns:
        dict mapping metric names to values; the loss is stored under ``LOSS``.
    """
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids

    predictions_df = pd.DataFrame()
    predictions_df['answer_id'] = hf_dataset['test']['answer_id']
    predictions_df[TARGET_COL] = hf_dataset['test'][TARGET_COL]
    # Copy the slice so the column assignment below writes to an independent
    # frame rather than to a view of the untruncated one.
    predictions_df = predictions_df.iloc[:len(predictions)].copy()
    predictions_df['predicted_score'] = predictions

    # Only the id and the score are needed for the joins; carrying the other
    # columns through two merges would just produce suffixed duplicates.
    scores_df = predictions_df[['answer_id', 'predicted_score']]
    answer_1_scores_df = scores_df.rename(columns={'answer_id': 'answer_1_id',
                                                   'predicted_score': 'answer_1_predicted_score'})
    answer_2_scores_df = scores_df.rename(columns={'answer_id': 'answer_2_id',
                                                   'predicted_score': 'answer_2_predicted_score'})
    pairs_predictions_df = (
        test_pairs_dataset_df
        .merge(answer_1_scores_df, on='answer_1_id')
        .merge(answer_2_scores_df, on='answer_2_id')
    )

    # Convert through NumPy with one explicit dtype so the three loss inputs
    # agree regardless of how pandas inferred each column.
    loss = loss_fn(
        torch.tensor(pairs_predictions_df['answer_1_predicted_score'].to_numpy(), dtype=torch.float32),
        torch.tensor(pairs_predictions_df['answer_2_predicted_score'].to_numpy(), dtype=torch.float32),
        torch.tensor(pairs_predictions_df['label'].to_numpy(), dtype=torch.float32),
    )

    # Report a plain float rather than a 0-dim tensor.
    metrics = {LOSS: loss.item()}
    metrics.update(evaluator(labels, predictions, test_question_ids))
    # 'mae' is dropped from the report; tolerate evaluators that do not emit it.
    metrics.pop('mae', None)

    wandb.log({'predictions_table': wandb.Table(dataframe=predictions_df)})

    return metrics

In [23]:
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_PATH,
    logging_steps=1,
    # Evaluate roughly ten times per pass over the training pairs. The division
    # truncates to 0 when there are fewer than
    # 10 * BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS pairs, which is not a valid
    # evaluation interval, hence the lower bound of 1.
    eval_steps=max(1, int(len(train_pairs_dataset_df) / BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS / 10)),
    eval_strategy = "steps",
    save_strategy = "epoch",
    save_total_limit=1,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to='wandb',
    # The collator reads raw text columns, so they must not be stripped.
    remove_unused_columns=False,
    gradient_checkpointing=True,
    optim="adamw_8bit"
)

In [24]:
# Open the W&B run that the Trainer reports to.
run = wandb.init(project='dalip-stackoverflow-answer-ranking', tags=['cross-encoder', 'ranking'])

# Record the experiment settings alongside the run.
dataset_config = {'pairs_sampling_strategy': PAIRS_SAMPLING_STRATEGY, 'n': N_SAMPLES}
vectorizer_config = {'vectorization_type': 'embeddings', 'max_length': MAX_LENGTH}
wandb.config.update({
    'preprocessing': preprocessor.__dict__,
    'dataset': dataset_config,
    'model_name': MODEL_NAME,
    'vectorizer': vectorizer_config,
    'loss_fn': LOSS
})

# Training consumes answer pairs; evaluation scores the pointwise test split.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'data_collator': data_collator,
    'train_dataset': train_pairs_dataset,
    'eval_dataset': hf_dataset['test'],
    'compute_loss_func': trainer_loss_fn,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbunnynobugs[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Always close the W&B run: on failure or manual interruption mark it as
# failed and re-raise, so the run is not left open in the kernel.
try:
    trainer.train()
except BaseException:
    run.finish(exit_code=1)
    raise
else:
    run.finish()



Step,Training Loss,Validation Loss
