# Bi-Encoder model

## Data loading

In [1]:
from datasets import load_from_disk
from src.utils.config_management import CONFIG

In [2]:
hf_dataset = load_from_disk(CONFIG['paths']['data']['dalip_hf_dataset'])

In [3]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 

## Data preprocessing

In [4]:
from src.utils.text_preprocessing import Preprocessor

In [5]:
TARGET_COL = 'answer_normalized_score'

In [6]:
preprocessor = Preprocessor(preserve_html_tags=['code'])

In [7]:
hf_dataset = hf_dataset.map(preprocessor, batched=True)

In [8]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted', 'question_text', 'answer_text'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_cr

## Model inference

In [9]:
from transformers import MegatronBertModel, PreTrainedTokenizerFast
from tqdm import tqdm
from torch.utils.data import DataLoader
import torch
import pandas as pd

2025-04-18 19:03:58.907277: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-18 19:03:58.975137: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-18 19:03:58.975194: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-18 19:03:58.978202: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-18 19:03:58.991847: I tensorflow/core/platform/cpu_feature_guar

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

MODEL_PATH = 'mmukh/SOBertBase'
MODEL_NAME = MODEL_PATH.split('/')[-1]
BATCH_SIZE = 4
EMBEDDINGS_POOLING = 'mean'

In [11]:
model = MegatronBertModel.from_pretrained(MODEL_PATH).to(device)
model.eval();

Some weights of MegatronBertModel were not initialized from the model checkpoint at mmukh/SOBertBase and are newly initialized: ['embeddings.token_type_embeddings.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_PATH)
if not tokenizer.pad_token:
    pad_token_id = model.embeddings.word_embeddings.padding_idx
    print(f'Setting pad token id to {pad_token_id}...')
    tokenizer.pad_token_id = pad_token_id
    print(f'Pad token set to {tokenizer.pad_token}')

Setting pad token id to 0...
Pad token set to <unk>


In [13]:
def get_sentence_embeddings(tokenized_inputs, pooling):
    with torch.no_grad():
        outputs = model(**tokenized_inputs)
    
    attention_mask = tokenized_inputs['attention_mask']
    last_hidden_state = outputs.last_hidden_state
    
    if pooling == 'mean':
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size())
        sum_embeddings = (last_hidden_state * attention_mask_expanded).sum(dim=1)
        sum_mask = attention_mask_expanded.sum(dim=1)
        pooled = sum_embeddings / sum_mask
        
    return pooled.cpu()

In [14]:
def tokenize_texts(texts):
    outputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
    
    return outputs

In [15]:
def compute_question_embeddings(dataset_split):
    def get_unique_questions_dataset(dataset_split):
        unique_question_ids = []
        unique_question_row_ids = []

        curr_question_id = None
        for row_id, row in enumerate(dataset_split):
            question_id = row['question_id']
            if question_id != curr_question_id:
                unique_question_ids.append(question_id)
                unique_question_row_ids.append(row_id)
                curr_question_id = question_id

        unique_questions_dataset = dataset_split.select(unique_question_row_ids)

        return unique_questions_dataset

    unique_questions_dataset = get_unique_questions_dataset(dataset_split)

    questions_dataloader = DataLoader(unique_questions_dataset['question_text'],
                                      batch_size=BATCH_SIZE, collate_fn=tokenize_texts)

    unique_question_embeddings = []

    for batch in tqdm(questions_dataloader):
        batch_embeddings = get_sentence_embeddings(batch, pooling=EMBEDDINGS_POOLING)

        unique_question_embeddings.append(batch_embeddings)

    unique_question_embeddings = torch.cat(unique_question_embeddings, dim=0)
    unique_question_embeddings = pd.DataFrame({'question_id': unique_questions_dataset['question_id'],
                                               'embedding': unique_question_embeddings.unbind(dim=0)})

    question_embeddings = pd.DataFrame({'question_id': dataset_split['question_id']})
    question_embeddings = pd.merge(question_embeddings, unique_question_embeddings, on='question_id')
    question_embeddings = torch.stack(question_embeddings['embedding'].tolist(), dim=0)

    return question_embeddings

In [16]:
train_question_embeddings = compute_question_embeddings(hf_dataset['train'])

100%|██████████| 1944/1944 [04:43<00:00,  6.87it/s]


In [17]:
test_question_embeddings = compute_question_embeddings(hf_dataset['test'])

100%|██████████| 487/487 [01:16<00:00,  6.40it/s]


In [18]:
# test_questions_dataloader = DataLoader(hf_dataset['test']['question_text'], batch_size=BATCH_SIZE, collate_fn=tokenize_texts)
# 
# question_embeddings = []
# 
# for batch in tqdm(test_questions_dataloader):
#     batch_embeddings = get_sentence_embeddings(batch, pooling=EMBEDDINGS_POOLING)
# 
#     question_embeddings.append(batch_embeddings)
# 
# question_embeddings = torch.cat(question_embeddings, dim=0)

In [19]:
def compute_answer_embeddings(dataset_split):
    answers_dataloader = DataLoader(dataset_split['answer_text'], batch_size=BATCH_SIZE, collate_fn=tokenize_texts)

    answer_embeddings = []

    for batch in tqdm(answers_dataloader):
        batch_embeddings = get_sentence_embeddings(batch, pooling=EMBEDDINGS_POOLING)

        answer_embeddings.append(batch_embeddings)

    answer_embeddings = torch.cat(answer_embeddings, dim=0)

    return answer_embeddings

In [20]:
train_answer_embeddings = compute_answer_embeddings(hf_dataset['train'])

100%|██████████| 10675/10675 [14:43<00:00, 12.08it/s]


In [21]:
test_answer_embeddings = compute_answer_embeddings(hf_dataset['test'])

100%|██████████| 2641/2641 [03:48<00:00, 11.56it/s]


### Dump embeddings 

In [22]:
import pickle
import os

In [23]:
embeddings = {
    'train': {
        'question_embeddings': train_question_embeddings,
        'answer_embeddings': train_answer_embeddings
    },
    'test': {
        'question_embeddings': test_question_embeddings,
        'answer_embeddings': test_answer_embeddings
    }
}

In [24]:
embeddings_path = os.path.join(CONFIG['paths']['models']['dalip_embeddings'],
                    f"embeddings_{MODEL_NAME}_{EMBEDDINGS_POOLING}.pickle")

In [25]:
with open(embeddings_path, 'wb') as f:
    pickle.dump(embeddings, f)

In [26]:
# test_question_ids = hf_dataset['test']['question_id']
#
# a = pd.DataFrame({'id': test_question_ids, 'emb': question_embeddings.unbind(dim=0)})
# a['emb'] = a['emb'].apply(lambda x: tuple(x.tolist()))

In [27]:
# a.groupby('id')['emb'].nunique().value_counts()

In [28]:
# with open(test_embeddings_path, 'rb') as f:
#     test_embeddings = pickle.load(f)

## Evaluation

In [29]:
import wandb
from collections import defaultdict
from src.evaluation import RankingEvaluator
import pandas as pd

In [30]:
SIMILARITY_METRIC = 'cosine_similarity'

In [31]:
if SIMILARITY_METRIC == 'cosine_similarity':
    predictions = torch.cosine_similarity(
        test_question_embeddings, test_answer_embeddings).cpu().numpy()

In [32]:
wandb_config = defaultdict(dict)

wandb_config['preprocessing'] = preprocessor.__dict__

wandb_config['vectorizer']['vectorization_type'] = 'embeddings'
wandb_config['vectorizer']['embeddings_pooling'] = EMBEDDINGS_POOLING

wandb_config['regression_model'] = defaultdict(dict)
wandb_config['regression_model']['regressor_type'] = SIMILARITY_METRIC

wandb_config['model_name'] = MODEL_NAME

wandb_config

defaultdict(dict,
            {'preprocessing': {'question_columns': ['question_title',
               'question_body'],
              'answer_columns': ['answer_body'],
              'preserve_html_tags': ['code']},
             'vectorizer': {'vectorization_type': 'embeddings',
              'embeddings_pooling': 'mean'},
             'regression_model': defaultdict(dict,
                         {'regressor_type': 'cosine_similarity'}),
             'model_name': 'SOBertBase'})

In [33]:
run = wandb.init(
    project='dalip-stackoverflow-answer-ranking',
    config=wandb_config,
    tags=['embeddings', 'similarity']
)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbunnynobugs[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Compute metrics

In [34]:
targets = hf_dataset['test'][TARGET_COL]

In [35]:
test_question_ids = hf_dataset['test']['question_id']

In [36]:
evaluator = RankingEvaluator(ndcg_k=list(range(1, 11)), 
                             ndcg_gain_func='exponential', ndcg_discount_func='logarithmic')

In [37]:
metrics = evaluator(targets, predictions, test_question_ids)
if TARGET_COL == 'answer_log_normalized_score':
    metrics.pop('mae')
metrics

{'ndcg@1_g.exponential_d.logarithmic': 0.43623295567056014,
 'ndcg@2_g.exponential_d.logarithmic': 0.5405199551966923,
 'ndcg@3_g.exponential_d.logarithmic': 0.6283210018946189,
 'ndcg@4_g.exponential_d.logarithmic': 0.7030367653538673,
 'ndcg@5_g.exponential_d.logarithmic': 0.7323741428370554,
 'ndcg@6_g.exponential_d.logarithmic': 0.7431773178434113,
 'ndcg@7_g.exponential_d.logarithmic': 0.7477800642209043,
 'ndcg@8_g.exponential_d.logarithmic': 0.7503184786095484,
 'ndcg@9_g.exponential_d.logarithmic': 0.7526999141663308,
 'ndcg@10_g.exponential_d.logarithmic': 0.7531754923844403,
 'mae': 17.620280573472012,
 'hit_rate@1': 0.3079691516709512}

In [38]:
wandb.log(metrics)

### Log predictions

In [39]:
predictions_df = pd.DataFrame()
predictions_df['answer_id'] = hf_dataset['test']['answer_id']
predictions_df['answer_normalized_score'] = hf_dataset['test']['answer_normalized_score']
predictions_df['predicted_score'] = predictions

In [40]:
wandb.log({'predictions_table': wandb.Table(dataframe=predictions_df)})

In [41]:
run.finish()

0,1
hit_rate@1,▁
mae,▁
ndcg@10_g.exponential_d.logarithmic,▁
ndcg@1_g.exponential_d.logarithmic,▁
ndcg@2_g.exponential_d.logarithmic,▁
ndcg@3_g.exponential_d.logarithmic,▁
ndcg@4_g.exponential_d.logarithmic,▁
ndcg@5_g.exponential_d.logarithmic,▁
ndcg@6_g.exponential_d.logarithmic,▁
ndcg@7_g.exponential_d.logarithmic,▁

0,1
hit_rate@1,0.30797
mae,17.62028
ndcg@10_g.exponential_d.logarithmic,0.75318
ndcg@1_g.exponential_d.logarithmic,0.43623
ndcg@2_g.exponential_d.logarithmic,0.54052
ndcg@3_g.exponential_d.logarithmic,0.62832
ndcg@4_g.exponential_d.logarithmic,0.70304
ndcg@5_g.exponential_d.logarithmic,0.73237
ndcg@6_g.exponential_d.logarithmic,0.74318
ndcg@7_g.exponential_d.logarithmic,0.74778
