# Bi-Encoder ranking model

## Data loading

In [1]:
from datasets import load_from_disk
from src.utils.config_management import CONFIG

In [2]:
# Load the Hugging Face dataset saved on disk at the location configured under
# CONFIG['paths']['data']['dalip_hf_dataset'].
hf_dataset = load_from_disk(CONFIG['paths']['data']['dalip_hf_dataset'])

In [3]:
# Show the dataset summary (splits, feature names, row counts) before preprocessing.
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 

## Data preprocessing

In [4]:
from src.utils.text_preprocessing import Preprocessor

In [5]:
# Project text preprocessor, configured to preserve <code> HTML tags.
# NOTE(review): what happens to other tags is decided inside Preprocessor — confirm there.
preprocessor = Preprocessor(preserve_html_tags=['code'])

In [6]:
# Apply the preprocessor batch-wise to every split. `hf_dataset` is rebound to the mapped
# result, so re-running this cell without reloading maps the already-mapped dataset again.
hf_dataset = hf_dataset.map(preprocessor, batched=True)

In [7]:
# Show the dataset summary again; the output below lists the added
# 'question_text' and 'answer_text' features.
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted', 'question_text', 'answer_text'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_cr

## Model training

In [8]:
# Identifier of the encoder whose cached embeddings are used ("<owner>/<model>").
MODEL_PATH = 'mmukh/SOBertLarge'
# Bare model name: everything after the last '/' of MODEL_PATH.
MODEL_NAME = MODEL_PATH.rsplit('/', 1)[-1]
# Pooling variant; it is part of the embeddings pickle filename built further down.
EMBEDDINGS_POOLING = 'mean'
# NOTE(review): only referenced by the commented-out normalization code in the
# embedding-loading cell, so this setting currently has no effect.
EMBEDDINGS_POSTPROCESSING = 'normalize'

### Load embeddings

In [9]:
import pickle
import os

In [10]:
# Location of the cached embeddings: one pickle per (model name, pooling) combination
# inside the configured embeddings directory.
embeddings_path = os.path.join(
    CONFIG['paths']['models']['dalip_embeddings'],
    f"embeddings_{MODEL_NAME}_{EMBEDDINGS_POOLING}.pickle",
)

In [11]:
# Load the precomputed embeddings. Later cells index the result as
# embeddings[<split>]['question_embeddings' | 'answer_embeddings'] for 'train' and 'test'.
# NOTE(security): pickle.load can execute arbitrary code while deserializing; only load
# files produced by this project's own embedding pipeline.
with open(embeddings_path, 'rb') as f:
    embeddings = pickle.load(f)

# NOTE(review): normalization is disabled below although EMBEDDINGS_POSTPROCESSING is set to
# 'normalize', so that setting is not applied. Re-enabling this block also requires `torch`
# to be imported before this cell (it is currently imported in a later cell).
# if EMBEDDINGS_POSTPROCESSING == 'normalize':
#     embeddings['train']['question_embeddings'] = torch.nn.functional.normalize(embeddings['train']['question_embeddings'])
#     embeddings['train']['answer_embeddings'] = torch.nn.functional.normalize(embeddings['train']['answer_embeddings'])
#     embeddings['test']['question_embeddings'] = torch.nn.functional.normalize(embeddings['test']['question_embeddings'])
#     embeddings['test']['answer_embeddings'] = torch.nn.functional.normalize(embeddings['test']['answer_embeddings'])

### Train ranker

In [12]:
import torch
import pandas as pd
import xgboost as xgb

In [13]:
# Build the training targets frame, keeping only the non-object columns
# (object-dtype columns are dropped).
train_targets_df = (
    pd.DataFrame(hf_dataset['train'])
    .select_dtypes(exclude=['object'])
)

# Row order that groups each question's answers together. The same index is reused
# later to reorder the embedding matrix so features and targets stay aligned.
# NOTE(review): XGBRanker expects samples ordered by query id — see the XGBoost docs.
train_idxs_sorted = train_targets_df.sort_values(by='question_id').index
train_targets_df = train_targets_df.loc[train_idxs_sorted]

In [14]:
# Feature matrix: question embedding and answer embedding concatenated per row.
X_train = torch.cat(
    [embeddings['train']['question_embeddings'], embeddings['train']['answer_embeddings']],
    dim=1,
)
# The cached embeddings must correspond 1:1 to the dataset rows; otherwise the reordering
# below would silently pair features with the wrong targets.
assert X_train.shape[0] == len(train_targets_df), (
    f"train embeddings have {X_train.shape[0]} rows, dataset has {len(train_targets_df)}"
)
# Reorder rows to match train_targets_df (sorted by question_id). An explicit long tensor
# is used as the index instead of a raw pandas Index.
X_train = X_train[torch.as_tensor(train_idxs_sorted.to_numpy(), dtype=torch.long)]

# The test split keeps the dataset's original row order.
X_test = torch.cat(
    [embeddings['test']['question_embeddings'], embeddings['test']['answer_embeddings']],
    dim=1,
)
assert X_test.shape[0] == len(hf_dataset['test']), (
    f"test embeddings have {X_test.shape[0]} rows, dataset has {len(hf_dataset['test'])}"
)

In [15]:
# Alternative ranking target: dense rank of each answer's score within its question,
# ascending, so the lowest-scored answer gets rank 1 and ties share a rank.
# NOTE(review): the training cell sets TARGET_COL to 'answer_normalized_score', so this
# column is computed but not used there.
train_targets_df['answer_reverse_rank'] = (
    train_targets_df
    .groupby(by='question_id')['answer_score']
    .rank(method='dense', ascending=True)
)

In [17]:
# Column of train_targets_df used as the relevance label.
TARGET_COL = 'answer_normalized_score'

# LambdaRank-style ranker optimizing NDCG over the question groups.
# NOTE(review): ndcg_exp_gain=False disables the exponential NDCG gain during training,
# while the evaluation cell uses ndcg_gain_func='exponential' — confirm this is intended.
model = xgb.XGBRanker(
    n_estimators=100,
    objective='rank:ndcg',
    lambdarank_pair_method='mean',
    lambdarank_num_pair_per_sample=100,
    ndcg_exp_gain=False,
    # Use the GPU when one is visible; otherwise fall back to CPU so the cell still runs
    # on machines without CUDA instead of failing.
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
# X_train and train_targets_df share the same question_id-sorted row order, so features,
# labels and query ids line up row by row.
model.fit(X_train, train_targets_df[TARGET_COL], qid=train_targets_df['question_id'])

## Evaluation

In [18]:
import wandb
from collections import defaultdict
from src.evaluation import RankingEvaluator
import pandas as pd

In [93]:
# Experiment configuration logged to Weights & Biases.
wandb_config = defaultdict(dict)

# Snapshot of the preprocessor settings. A shallow copy is stored so the run config does
# not alias the live instance __dict__.
wandb_config['preprocessing'] = dict(vars(preprocessor))

wandb_config['vectorizer']['vectorization_type'] = 'embeddings'
wandb_config['vectorizer']['embeddings_pooling'] = EMBEDDINGS_POOLING

# Nested defaultdict so that ['params'][<ranker name>] can be assigned without creating
# the intermediate 'params' dict by hand.
wandb_config['ranking_model'] = defaultdict(dict)
wandb_config['ranking_model']['ranker_type'] = type(model).__name__
wandb_config['ranking_model']['n_features'] = model.n_features_in_
# Record which label column the ranker was fit on, so runs trained on different targets
# can be told apart.
wandb_config['ranking_model']['target_col'] = TARGET_COL
wandb_config['ranking_model']['params'][type(model).__name__] = model.get_params()

wandb_config['model_name'] = MODEL_NAME

wandb_config

defaultdict(dict,
            {'preprocessing': {'question_columns': ['question_title',
               'question_body'],
              'answer_columns': ['answer_body'],
              'preserve_html_tags': ['code']},
             'vectorizer': {'vectorization_type': 'embeddings',
              'embeddings_pooling': 'mean'},
             'ranking_model': defaultdict(dict,
                         {'ranker_type': 'XGBRanker',
                          'n_features': 3072,
                          'params': {'XGBRanker': {'objective': 'rank:ndcg',
                            'base_score': None,
                            'booster': None,
                            'callbacks': None,
                            'colsample_bylevel': None,
                            'colsample_bynode': None,
                            'colsample_bytree': None,
                            'device': 'cuda',
                            'early_stopping_rounds': None,
                            'enable_categ

In [94]:
# Open a W&B run for this experiment, attaching the configuration assembled above.
# The returned handle is closed with run.finish() at the end of the notebook.
run = wandb.init(project='dalip-stackoverflow-answer-ranking', config=wandb_config, tags=['embeddings', 'ranking'])

### Compute metrics

In [19]:
# Question id of every test row; passed to the evaluator together with the targets.
test_question_ids = hf_dataset['test']['question_id']
# Ground-truth relevance for the test split: the same column the ranker was trained on.
targets = hf_dataset['test']['answer_normalized_score']

In [20]:
# Score every test row; X_test is in the dataset's original row order, so the predictions
# line up with `targets` and `test_question_ids`.
predictions = model.predict(X_test)

In [21]:
# Ranking evaluator configured for NDCG@1..10 with exponential gain and logarithmic discount.
evaluator = RankingEvaluator(
    ndcg_k=[*range(1, 11)],
    ndcg_gain_func='exponential',
    ndcg_discount_func='logarithmic',
)

In [22]:
# Compute the ranking metrics over the test questions.
metrics = evaluator(targets, predictions, test_question_ids)
# Drop the 'mae' entry so it is neither displayed nor logged.
del metrics['mae']
metrics

{'ndcg@1_g.exponential_d.logarithmic': 0.4954621748626572,
 'ndcg@2_g.exponential_d.logarithmic': 0.5943246291952936,
 'ndcg@3_g.exponential_d.logarithmic': 0.6781826386292512,
 'ndcg@4_g.exponential_d.logarithmic': 0.7385195164982931,
 'ndcg@5_g.exponential_d.logarithmic': 0.7645322460729602,
 'ndcg@6_g.exponential_d.logarithmic': 0.773293290672338,
 'ndcg@7_g.exponential_d.logarithmic': 0.778274797257841,
 'ndcg@8_g.exponential_d.logarithmic': 0.7811336633630334,
 'ndcg@9_g.exponential_d.logarithmic': 0.7824710555974302,
 'ndcg@10_g.exponential_d.logarithmic': 0.7829575835770114,
 'hit_rate@1': 0.3712082262210797}

In [99]:
# Send the metrics dict (without 'mae') to the active W&B run.
wandb.log(metrics)

### Log predictions

In [100]:
# Per-answer table pairing each test answer's id and true normalized score with the
# score predicted by the ranker.
predictions_df = pd.DataFrame({
    'answer_id': hf_dataset['test']['answer_id'],
    'answer_normalized_score': hf_dataset['test']['answer_normalized_score'],
    'predicted_score': predictions,
})

In [101]:
# Upload the per-answer predictions as a W&B table under the key 'predictions_table'.
wandb.log({'predictions_table': wandb.Table(dataframe=predictions_df)})

In [102]:
# Close the W&B run; the run history and summary tables appear in the cell output below.
run.finish()

0,1
hit_rate@1,▁
ndcg@10_g.exponential_d.logarithmic,▁
ndcg@1_g.exponential_d.logarithmic,▁
ndcg@2_g.exponential_d.logarithmic,▁
ndcg@3_g.exponential_d.logarithmic,▁
ndcg@4_g.exponential_d.logarithmic,▁
ndcg@5_g.exponential_d.logarithmic,▁
ndcg@6_g.exponential_d.logarithmic,▁
ndcg@7_g.exponential_d.logarithmic,▁
ndcg@8_g.exponential_d.logarithmic,▁

0,1
hit_rate@1,0.38663
ndcg@10_g.exponential_d.logarithmic,0.78761
ndcg@1_g.exponential_d.logarithmic,0.50439
ndcg@2_g.exponential_d.logarithmic,0.60609
ndcg@3_g.exponential_d.logarithmic,0.68523
ndcg@4_g.exponential_d.logarithmic,0.74619
ndcg@5_g.exponential_d.logarithmic,0.76728
ndcg@6_g.exponential_d.logarithmic,0.77824
ndcg@7_g.exponential_d.logarithmic,0.78333
ndcg@8_g.exponential_d.logarithmic,0.78551
