# TF-IDF baseline

## Data loading

In [28]:
from datasets import load_from_disk
from src.utils.config_management import CONFIG

In [29]:
# Load the saved DatasetDict (train/test splits, see the listing below) from the
# path stored in the project config under paths -> data -> dalip_hf_dataset.
hf_dataset = load_from_disk(CONFIG['paths']['data']['dalip_hf_dataset'])

In [30]:
# Show the available splits, their columns and row counts.
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 

## Data preprocessing

In [31]:
from src.utils.text_preprocessing import Preprocessor

In [32]:
# Project text preprocessor with its default settings; its attributes are logged
# to W&B in the evaluation section.
preprocessor = Preprocessor()

In [33]:
# Run the preprocessor batch-wise over every split. Per the listing below this
# adds the `question_text` and `answer_text` columns used for vectorization.
hf_dataset = hf_dataset.map(preprocessor, batched=True)

In [34]:
# Confirm the new text columns are present after preprocessing.
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_creation_date', 'question_score', 'question_view_count', 'question_body', 'question_last_edit_date', 'question_last_activity_date', 'question_title', 'question_tags', 'question_answer_count', 'question_comment_count', 'question_favorite_count', 'question_closed_date', 'question_community_owned_date', 'answer_accepted', 'question_text', 'answer_text'],
        num_rows: 42700
    })
    test: Dataset({
        features: ['answer_id', 'question_id', 'answer_creation_date', 'answer_score', 'answer_normalized_score', 'answer_log_normalized_score', 'answer_body', 'answer_last_edit_date', 'answer_last_activity_date', 'answer_comment_count', 'answer_community_owned_date', 'question_cr

In [35]:
# Order the training rows by question so that all answers to one question are
# contiguous. This must happen before vectorization: the feature matrix, the
# targets and the `qid` array passed to the ranker are all read from this split
# and have to share one row order.
# NOTE(review): presumably required because XGBoost expects `qid` to be sorted — confirm.
hf_dataset['train'] = hf_dataset['train'].sort('question_id')

## Model training

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import pandas as pd
import xgboost as xgb

In [37]:
# TF-IDF with no vocabulary-size cap and no stop-word filtering (both left as None).
vectorizer = TfidfVectorizer(max_features=None, stop_words=None)

In [38]:
def vectorize_qa_pairs(vectorizer, hf_dataset):
    """Fit ``vectorizer`` on the training texts and build train/test feature matrices.

    The vectorizer is fitted once on the training questions followed by the
    training answers, so both text fields share a single vocabulary. If the
    training split contains any non-empty question text, each row is the
    question vector horizontally stacked with the answer vector (question
    columns first). If every training question is empty, only the answer
    vectors are returned.

    Args:
        vectorizer: Object exposing ``fit(texts)`` and ``transform(texts)``
            (the ``TfidfVectorizer`` in this notebook). It is fitted in place.
        hf_dataset: Mapping with ``'train'`` and ``'test'`` splits, each
            providing ``'question_text'`` and ``'answer_text'`` columns.

    Returns:
        Tuple ``(X_train, X_test)`` of feature matrices whose rows follow the
        row order of the corresponding split.
    """
    train_split = hf_dataset['train']
    test_split = hf_dataset['test']

    # Read each column exactly once and force a plain list: repeated column
    # access on a dataset split may rebuild the column every time, and list
    # concatenation below needs real lists.
    train_questions = list(train_split['question_text'])
    train_answers = list(train_split['answer_text'])
    test_answers = list(test_split['answer_text'])

    # Shared vocabulary over questions and answers of the training split only.
    vectorizer.fit(train_questions + train_answers)

    X_train = vectorizer.transform(train_answers)
    X_test = vectorizer.transform(test_answers)

    # Use question features when *any* training question has text. Looking at
    # the first row only would silently drop all question features whenever
    # that single question happens to be empty.
    if any(train_questions):
        test_questions = list(test_split['question_text'])
        X_train = scipy.sparse.hstack([vectorizer.transform(train_questions), X_train])
        X_test = scipy.sparse.hstack([vectorizer.transform(test_questions), X_test])

    return X_train, X_test

In [39]:
# Fit the TF-IDF vocabulary on the training split and build both feature
# matrices, then display the training matrix summary (shape / sparsity).
X_train, X_test = vectorize_qa_pairs(vectorizer, hf_dataset)
X_train

<42700x193582 sparse matrix of type '<class 'numpy.float64'>'
	with 5116675 stored elements in Compressed Sparse Row format>

In [40]:
# Materialize the (sorted) training split as a DataFrame and drop every
# object-dtype column, keeping the non-object fields from which the target and
# `question_id` are read.
train_targets_df = pd.DataFrame(hf_dataset['train']).select_dtypes(exclude=['object'])

In [41]:
# Dense rank of each answer's score within its question, ascending: the
# lowest-scored answer gets rank 1, so a larger value means a better answer.
# This is an alternative training target; the run below uses TARGET_COL instead.
train_targets_df['answer_reverse_rank'] = train_targets_df.groupby('question_id')['answer_score'].rank(method='dense', ascending=True)

In [42]:
# Column of train_targets_df used as the relevance label for training.
TARGET_COL = 'answer_normalized_score'

# LambdaMART ranker optimizing NDCG.
# - lambdarank_pair_method='mean' with lambdarank_num_pair_per_sample=10:
#   sample 10 pairs per document when building pairwise gradients.
# - ndcg_exp_gain=False: use the label value itself as the gain instead of
#   2**label - 1 (presumably because the target is a continuous score — confirm).
# - device='cuda': training needs a CUDA-capable GPU.
# NOTE(review): no random_state is set; with sampled pairs the run is
# presumably not reproducible — consider fixing the seed.
model = xgb.XGBRanker(
    n_estimators=500,
    objective='rank:ndcg',
    lambdarank_pair_method='mean',
    lambdarank_num_pair_per_sample=10,
    ndcg_exp_gain=False,
    device='cuda',
)
# qid groups the rows by question; it is read from the same sorted split that
# X_train and train_targets_df were built from, so the three stay aligned.
model.fit(X_train, train_targets_df[TARGET_COL], qid=hf_dataset['train']['question_id'])

## Evaluation

In [43]:
import pandas as pd
import wandb
from collections import defaultdict
from src.evaluation import RankingEvaluator

In [44]:
# Gather everything that defines this run (preprocessing, vectorizer, ranker,
# target) into one nested config that is passed to W&B.
wandb_config = defaultdict(dict)

# Store a shallow copy of the preprocessor's attributes rather than the live
# __dict__, so later changes to the config cannot alter the preprocessor
# (and vice versa).
wandb_config['preprocessing'] = dict(vars(preprocessor))

wandb_config['vectorizer'] = vectorizer.get_params()

# Nested defaultdict so that ['params'][<ranker class name>] can be assigned
# without creating the intermediate dict first.
wandb_config['ranking_model'] = defaultdict(dict)
wandb_config['ranking_model']['ranker_type'] = type(model).__name__
wandb_config['ranking_model']['n_features'] = model.n_features_in_
wandb_config['ranking_model']['params'][type(model).__name__] = model.get_params()

wandb_config['target'] = TARGET_COL

wandb_config

defaultdict(dict,
            {'preprocessing': {'question_columns': ['question_title',
               'question_body'],
              'answer_columns': ['answer_body'],
              'preserve_html_tags': None},
             'vectorizer': {'analyzer': 'word',
              'binary': False,
              'decode_error': 'strict',
              'dtype': numpy.float64,
              'encoding': 'utf-8',
              'input': 'content',
              'lowercase': True,
              'max_df': 1.0,
              'max_features': None,
              'min_df': 1,
              'ngram_range': (1, 1),
              'norm': 'l2',
              'preprocessor': None,
              'smooth_idf': True,
              'stop_words': None,
              'strip_accents': None,
              'sublinear_tf': False,
              'token_pattern': '(?u)\\b\\w\\w+\\b',
              'tokenizer': None,
              'use_idf': True,
              'vocabulary': None},
             'ranking_model': defaultdict(

In [45]:
# Start a W&B run for this experiment, attaching the config assembled above
# and tags identifying it as the TF-IDF ranking baseline.
run = wandb.init(
    project='dalip-stackoverflow-answer-ranking',
    config=wandb_config,
    tags=['tfidf', 'baseline', 'ranking']
)

### Compute metrics

In [46]:
# Ground-truth labels and per-row question ids for the test split.
# NOTE(review): the label column is hard-coded rather than taken from
# TARGET_COL — presumably so runs trained on other targets are still scored
# against the same label; confirm this is intended.
targets = hf_dataset['test']['answer_normalized_score']
test_question_ids = hf_dataset['test']['question_id']

In [47]:
# Ranking scores for every test row, in the row order of the test split.
predictions = model.predict(X_test)

In [48]:
# Sanity-check the distribution of predicted scores.
# NOTE(review): the recorded output shows a heavy negative tail (min ~ -30030,
# mean ~ -4713) while the median is ~ 0.02 — worth confirming this is expected.
pd.Series(predictions).describe()

count    10563.000000
mean     -4713.014160
std       7400.028320
min     -30030.658203
25%      -9101.183594
50%          0.021742
75%          0.075802
max         33.917713
dtype: float64

In [49]:
# Project evaluator configured for NDCG at cut-offs 1..10 with exponential gain
# and logarithmic discount (both appear in the metric names below).
evaluator = RankingEvaluator(ndcg_k=list(range(1, 11)), 
                             ndcg_gain_func='exponential', ndcg_discount_func='logarithmic')

In [50]:
# Score the predictions, grouped by question.
metrics = evaluator(targets, predictions, test_question_ids)
# MAE is not reported for this run (presumably because ranker scores are not on
# the target's scale — confirm). The default keeps the cell from failing with a
# KeyError if the evaluator returns no 'mae' entry.
metrics.pop('mae', None)
metrics

{'ndcg@1_g.exponential_d.logarithmic': 0.4260181936164547,
 'ndcg@2_g.exponential_d.logarithmic': 0.528274823619074,
 'ndcg@3_g.exponential_d.logarithmic': 0.6118411130234743,
 'ndcg@4_g.exponential_d.logarithmic': 0.6866535107106003,
 'ndcg@5_g.exponential_d.logarithmic': 0.7169681603441735,
 'ndcg@6_g.exponential_d.logarithmic': 0.7305662411954977,
 'ndcg@7_g.exponential_d.logarithmic': 0.7371760003930563,
 'ndcg@8_g.exponential_d.logarithmic': 0.7412628444369044,
 'ndcg@9_g.exponential_d.logarithmic': 0.7430130050558686,
 'ndcg@10_g.exponential_d.logarithmic': 0.7438344636526573,
 'hit_rate@1': 0.33830334190231365}

In [51]:
# Send the metric dictionary to the active W&B run.
wandb.log(metrics)

### Log predictions

In [52]:
# One row per test answer: its id, the ground-truth normalized score and the
# score predicted by the ranker (column order matters for the logged table).
predictions_df = pd.DataFrame({
    'answer_id': hf_dataset['test']['answer_id'],
    'answer_normalized_score': hf_dataset['test']['answer_normalized_score'],
    'predicted_score': predictions,
})

In [53]:
# Upload the per-answer predictions as a W&B table under 'predictions_table'.
wandb.log({'predictions_table': wandb.Table(dataframe=predictions_df)})

In [54]:
# Close the W&B run; the run history and summary of the logged metrics are shown below.
run.finish()

0,1
hit_rate@1,▁
ndcg@10_g.exponential_d.logarithmic,▁
ndcg@1_g.exponential_d.logarithmic,▁
ndcg@2_g.exponential_d.logarithmic,▁
ndcg@3_g.exponential_d.logarithmic,▁
ndcg@4_g.exponential_d.logarithmic,▁
ndcg@5_g.exponential_d.logarithmic,▁
ndcg@6_g.exponential_d.logarithmic,▁
ndcg@7_g.exponential_d.logarithmic,▁
ndcg@8_g.exponential_d.logarithmic,▁

0,1
hit_rate@1,0.3383
ndcg@10_g.exponential_d.logarithmic,0.74383
ndcg@1_g.exponential_d.logarithmic,0.42602
ndcg@2_g.exponential_d.logarithmic,0.52827
ndcg@3_g.exponential_d.logarithmic,0.61184
ndcg@4_g.exponential_d.logarithmic,0.68665
ndcg@5_g.exponential_d.logarithmic,0.71697
ndcg@6_g.exponential_d.logarithmic,0.73057
ndcg@7_g.exponential_d.logarithmic,0.73718
ndcg@8_g.exponential_d.logarithmic,0.74126
