# Bi-Encoder model

## Data loading

In [2]:
from datasets import load_from_disk
from src.utils.config_management import CONFIG

In [3]:
# Load the saved Hugging Face dataset from the path configured under
# paths -> data -> dalip_hf_dataset.
hf_dataset = load_from_disk(
    CONFIG['paths']['data']['dalip_hf_dataset']
)

In [4]:
# Show the loaded DatasetDict (splits, feature names, row counts).
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 

## Data preprocessing

In [5]:
from src.utils.text_preprocessing import Preprocessor

In [6]:
# Dataset column used as the regression target; it is read when building `y_train`
# and the test `targets`.
TARGET_COL = 'answer_normalized_score'

In [7]:
# Text preprocessor that is mapped over the dataset in the next cell.
# NOTE(review): presumably `preserve_html_tags=['code']` keeps <code> elements while other
# HTML markup is cleaned — confirm in src.utils.text_preprocessing.
preprocessor = Preprocessor(preserve_html_tags=['code'])

In [8]:
# Apply the preprocessor to the dataset in batched mode. Per the dataset printouts, the
# result gains the 'question_text' and 'answer_text' columns.
# NOTE(review): this rebinds `hf_dataset`, so re-running the cell feeds the already
# processed dataset through the preprocessor again — confirm that is harmless.
hf_dataset = hf_dataset.map(preprocessor, batched=True)

In [9]:
# Show the dataset again to verify the columns added by preprocessing.
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted', 'question_text', 'answer_text'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_cr

## Model training

In [66]:
# Identifier of the embedding model (looks like a Hugging Face Hub id).
MODEL_PATH = 'mmukh/SOBertBase'
# Last path component of MODEL_PATH; used in the embeddings file name and the W&B config.
MODEL_NAME = MODEL_PATH.rsplit('/', 1)[-1]
# Pooling tag; part of the embeddings file name and logged to W&B.
EMBEDDINGS_POOLING = 'mean'
# NOTE(review): not read by any active code visible in this notebook — its only use is
# inside a commented-out normalization block.
EMBEDDINGS_POSTPROCESSING = 'normalize'

### Load embeddings

In [17]:
import pickle
import os
import torch

In [12]:
# Location of the pickled embeddings for the selected model and pooling strategy,
# inside the embeddings directory taken from the project config.
embeddings_path = os.path.join(
    CONFIG['paths']['models']['dalip_embeddings'],
    f"embeddings_{MODEL_NAME}_{EMBEDDINGS_POOLING}.pickle",
)

In [90]:
# Read the stored embeddings from disk.
# NOTE: pickle.load can execute arbitrary code contained in the file — only load
# artifacts produced by this project.
with open(embeddings_path, 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)

# NOTE(review): EMBEDDINGS_POSTPROCESSING is set to 'normalize', but the normalization
# below is disabled, so the embeddings are used exactly as stored. Either re-enable it
# or change the constant so the two agree.
# if EMBEDDINGS_POSTPROCESSING == 'normalize':
#     embeddings['train']['question_embeddings'] = torch.nn.functional.normalize(embeddings['train']['question_embeddings'])
#     embeddings['train']['answer_embeddings'] = torch.nn.functional.normalize(embeddings['train']['answer_embeddings'])
#     embeddings['test']['question_embeddings'] = torch.nn.functional.normalize(embeddings['test']['question_embeddings'])
#     embeddings['test']['answer_embeddings'] = torch.nn.functional.normalize(embeddings['test']['answer_embeddings'])

### Fit regression

In [140]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import torch

In [126]:
# Feature matrices: question and answer embeddings concatenated along dim=1, built the
# same way for the 'train' and 'test' splits (in that order).
X_train, X_test = (
    torch.cat(
        [embeddings[split]['question_embeddings'], embeddings[split]['answer_embeddings']],
        dim=1,
    )
    for split in ('train', 'test')
)
# NOTE: torch.nn.functional.normalize is not applied to the concatenated features.

# Regression target values for the training split.
y_train = hf_dataset['train'][TARGET_COL]

In [127]:
# Pick 'cuda' when a CUDA device is available, otherwise 'cpu'.
# NOTE(review): `device` is not referenced by any other cell visible in this notebook.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [147]:
%%time

# MLP regressor with two hidden layers (512 and 128 units) and a fixed seed.
# With early_stopping=True scikit-learn sets aside part of the training data as a
# validation set and stops training once the validation score stops improving.
regression_model = MLPRegressor(
    hidden_layer_sizes=(512, 128),
    early_stopping=True,
    random_state=42,
)
regression_model.fit(X_train, y_train)

Wall time: 3min 25s


## Evaluation

In [112]:
import wandb
from collections import defaultdict
from src.evaluation import RankingEvaluator
import pandas as pd

In [18]:
# Run configuration logged to Weights & Biases: preprocessing settings, how the text was
# vectorized, and a description of the fitted regressor.
# NOTE(review): the saved output of this cell shows LinearRegression() while the training
# cell fits an MLPRegressor — the cell was executed out of order; re-run it after fitting.
regressor_repr = str(regression_model)

wandb_config = defaultdict(dict, {
    # Instance attributes of the preprocessor (the same dict object as its __dict__).
    'preprocessing': vars(preprocessor),
    'vectorizer': {
        'vectorization_type': 'embeddings',
        'embeddings_pooling': EMBEDDINGS_POOLING,
    },
    'regression_model': defaultdict(dict, {
        'regressor_type': regressor_repr,
        'n_features': regression_model.n_features_in_,
        # Estimator parameters, keyed by the estimator's string representation.
        'params': {regressor_repr: regression_model.get_params()},
    }),
    'model_name': MODEL_NAME,
})

wandb_config

defaultdict(dict,
            {'preprocessing': {'question_columns': ['question_title',
               'question_body'],
              'answer_columns': ['answer_body'],
              'preserve_html_tags': ['code']},
             'vectorizer': {'vectorization_type': 'embeddings',
              'embeddings_pooling': 'mean'},
             'regression_model': defaultdict(dict,
                         {'regressor_type': 'LinearRegression()',
                          'n_features': 1536,
                          'params': {'LinearRegression()': {'copy_X': True,
                            'fit_intercept': True,
                            'n_jobs': None,
                            'positive': False}}}),
             'model_name': 'SOBertBase'})

In [19]:
# Start a W&B run in the project named below, attaching the assembled config and tags.
run = wandb.init(
    project='dalip-stackoverflow-answer-ranking',
    config=wandb_config,
    tags=['embeddings', 'regression']
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbunnynobugs[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Compute metrics

In [148]:
# Ground-truth values of the target column for the test split.
targets = hf_dataset['test'][TARGET_COL]

In [149]:
# Model predictions for the test feature matrix.
predictions = regression_model.predict(X_test)

In [150]:
# Question id of every test row; passed to the evaluator together with targets and predictions.
# NOTE(review): presumably used to group answers per question — confirm in RankingEvaluator.
test_question_ids = hf_dataset['test']['question_id']

In [151]:
# Ranking evaluator configured with NDCG cutoffs 1 through 10, the 'exponential' gain
# function and the 'logarithmic' discount function.
evaluator = RankingEvaluator(
    ndcg_k=list(range(1, 11)),
    ndcg_gain_func='exponential',
    ndcg_discount_func='logarithmic',
)

In [152]:
# Score the test predictions against the targets, passing the question id of each row.
metrics = evaluator(targets, predictions, test_question_ids)
if TARGET_COL == 'answer_log_normalized_score':
    # MAE is removed when the target is the log-normalized score.
    # NOTE(review): presumably because it is not comparable with MAE on the raw
    # normalized score — confirm.
    # A default is supplied so a result without an 'mae' entry does not raise KeyError.
    metrics.pop('mae', None)
metrics

{'ndcg@1_g.exponential_d.logarithmic': 0.4830207522798136,
 'ndcg@2_g.exponential_d.logarithmic': 0.5882704991821809,
 'ndcg@3_g.exponential_d.logarithmic': 0.6734547567129939,
 'ndcg@4_g.exponential_d.logarithmic': 0.7356214891675508,
 'ndcg@5_g.exponential_d.logarithmic': 0.7582557664929953,
 'ndcg@6_g.exponential_d.logarithmic': 0.7694040576011398,
 'ndcg@7_g.exponential_d.logarithmic': 0.7736315586532834,
 'ndcg@8_g.exponential_d.logarithmic': 0.7757015575461668,
 'ndcg@9_g.exponential_d.logarithmic': 0.7772503209255999,
 'ndcg@10_g.exponential_d.logarithmic': 0.7777891596216847,
 'mae': 2.548447059053511,
 'hit_rate@1': 0.3593830334190231}

In [25]:
# Send the metrics dict to the active W&B run.
# NOTE(review): the execution counts show this cell ran before the metrics cell was last
# re-executed, and the run summary values differ from the metrics displayed above —
# re-run in order so the logged numbers match the current model.
wandb.log(metrics)

### Log predictions

In [26]:
# Per-answer table for the test split: reference scores next to the model predictions.
test_split = hf_dataset['test']
predictions_df = pd.DataFrame({
    'answer_id': test_split['answer_id'],
    'answer_normalized_score': test_split['answer_normalized_score'],
    # Also log the column the model was trained on, so 'predicted_score' always has a
    # reference on the same scale. When TARGET_COL is 'answer_normalized_score' this is
    # the same key as above, so no extra column is created.
    TARGET_COL: test_split[TARGET_COL],
    'predicted_score': predictions,
})

In [27]:
# Upload the predictions DataFrame to the run as a W&B table under 'predictions_table'.
wandb.log({'predictions_table': wandb.Table(dataframe=predictions_df)})

In [28]:
# Close the W&B run started above.
run.finish()

0,1
hit_rate@1,▁
mae,▁
ndcg@10_g.exponential_d.logarithmic,▁
ndcg@1_g.exponential_d.logarithmic,▁
ndcg@2_g.exponential_d.logarithmic,▁
ndcg@3_g.exponential_d.logarithmic,▁
ndcg@4_g.exponential_d.logarithmic,▁
ndcg@5_g.exponential_d.logarithmic,▁
ndcg@6_g.exponential_d.logarithmic,▁
ndcg@7_g.exponential_d.logarithmic,▁

0,1
hit_rate@1,0.36504
mae,3.03446
ndcg@10_g.exponential_d.logarithmic,0.7806
ndcg@1_g.exponential_d.logarithmic,0.48894
ndcg@2_g.exponential_d.logarithmic,0.58875
ndcg@3_g.exponential_d.logarithmic,0.67326
ndcg@4_g.exponential_d.logarithmic,0.73987
ndcg@5_g.exponential_d.logarithmic,0.76191
ndcg@6_g.exponential_d.logarithmic,0.77085
ndcg@7_g.exponential_d.logarithmic,0.77542
