## Install further packages

In [6]:
!pip install sentence-transformers
!pip install rank_bm25

[0mCollecting rank_bm25
  Using cached rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
[0m

## Setup

In [51]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
import os

In [15]:
# The corpus and both query splits are read as JSON Lines files (one record per line).
corpus = pd.read_json("data/corpus.jsonl", lines=True)

# Baseline-scenario queries for the train and dev splits.
baseline_queries_train = pd.read_json(
    "data/baseline-queries/queries_train.jsonl",
    orient="records",
    lines=True,
)
baseline_queries_dev = pd.read_json(
    "data/baseline-queries/queries_dev.jsonl",
    orient="records",
    lines=True,
)

## SBERT Baseline
Encode the plain text of the arguments and queries with a multilingual SBERT model, then rank the arguments for each query by cosine similarity.

In [18]:
# Instantiate the sentence-embedding model identified by this name; the same encoder is
# used below for both the corpus arguments and the queries so their vectors are comparable.
sbert_encoder = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

In [27]:
# Embed every argument text, then store the result as one vector per corpus row.
corpus_embeddings = sbert_encoder.encode(corpus["argument"].values)
corpus["sbert_embeddings"] = [vector for vector in corpus_embeddings]

In [28]:
# Train split: embed the query texts and keep one vector per query row.
query_train_embeddings = sbert_encoder.encode(baseline_queries_train["text"].values)
baseline_queries_train["sbert_embeddings"] = [vector for vector in query_train_embeddings]

# Dev split: identical treatment.
query_dev_embeddings = sbert_encoder.encode(baseline_queries_dev["text"].values)
baseline_queries_dev["sbert_embeddings"] = [vector for vector in query_dev_embeddings]

In [29]:
# Pairwise cosine similarity between query embeddings (X) and corpus embeddings (Y);
# each result row belongs to one query and each column to one corpus argument.
train_similarities = cosine_similarity(
    X=list(baseline_queries_train["sbert_embeddings"].values),
    Y=list(corpus["sbert_embeddings"].values),
)
dev_similarities = cosine_similarity(
    X=list(baseline_queries_dev["sbert_embeddings"].values),
    Y=list(corpus["sbert_embeddings"].values),
)

In [57]:
# For every query, order all corpus arguments by descending SBERT cosine similarity
# and keep the ids of the 1000 best-scoring ones.
train_predictions = [
    {
        "query_id": baseline_queries_train.iloc[i]["query_id"],
        # Look up all ids in one vectorised step instead of building a row per candidate.
        "relevant_candidates": corpus["argument_id"].to_numpy()[
            candidates.argsort()[::-1][:1000]
        ].tolist(),
    }
    for i, candidates in enumerate(train_similarities)
]

dev_predictions = [
    {
        # Row i of dev_similarities belongs to dev query i, so the id has to come from
        # the dev frame (reading it from the train frame mislabels every dev prediction).
        "query_id": baseline_queries_dev.iloc[i]["query_id"],
        "relevant_candidates": corpus["argument_id"].to_numpy()[
            candidates.argsort()[::-1][:1000]
        ].tolist(),
    }
    for i, candidates in enumerate(dev_similarities)
]

# Persist one JSON record per query for the evaluation script.
pd.DataFrame(train_predictions).to_json("sbert_train_predictions.jsonl", orient="records", lines=True)
pd.DataFrame(dev_predictions).to_json("sbert_dev_predictions.jsonl", orient="records", lines=True)

## BM25

In [83]:
# Whitespace-tokenise each argument and build the BM25 index over the token lists.
corpus["bm25_tokens"] = corpus["argument"].str.split()
bm25_corpus = BM25Okapi(corpus=corpus["bm25_tokens"].values)

In [86]:
# Score every corpus document against each whitespace-tokenised query (the same
# tokenisation as the indexed corpus); stacking the per-query score vectors gives one
# row per query.
# numpy is imported as `np` in this notebook; the bare name `numpy` is never bound and
# raises NameError on a fresh kernel.
train_similarities = np.array(
    [bm25_corpus.get_scores(query.split()) for query in baseline_queries_train["text"]]
)
dev_similarities = np.array(
    [bm25_corpus.get_scores(query.split()) for query in baseline_queries_dev["text"]]
)

In [87]:
# For every query, order all corpus arguments by descending BM25 score and keep the
# ids of the 1000 best-scoring ones.
train_predictions = [
    {
        "query_id": baseline_queries_train.iloc[i]["query_id"],
        # Look up all ids in one vectorised step instead of building a row per candidate.
        "relevant_candidates": corpus["argument_id"].to_numpy()[
            candidates.argsort()[::-1][:1000]
        ].tolist(),
    }
    for i, candidates in enumerate(train_similarities)
]

dev_predictions = [
    {
        # Row i of dev_similarities belongs to dev query i, so the id has to come from
        # the dev frame (reading it from the train frame mislabels every dev prediction).
        "query_id": baseline_queries_dev.iloc[i]["query_id"],
        "relevant_candidates": corpus["argument_id"].to_numpy()[
            candidates.argsort()[::-1][:1000]
        ].tolist(),
    }
    for i, candidates in enumerate(dev_similarities)
]

# Persist one JSON record per query for the evaluation script.
pd.DataFrame(train_predictions).to_json("bm25_train_predictions.jsonl", orient="records", lines=True)
pd.DataFrame(dev_predictions).to_json("bm25_dev_predictions.jsonl", orient="records", lines=True)

## Evaluation
Evaluate train and dev baseline predictions for sbert and bm25.

In [89]:
# Run the evaluation script once per (method, split) pair on the prediction files
# written above; its tables are printed to the cell output.
for baseline_method in ("sbert", "bm25"):
    for split in ("train", "dev"):
        os.system(
            f"python3 scripts/evaluation.py --data ./data --scenario baseline --split {split} "
            f"--predictions  {baseline_method}_{split}_predictions.jsonl "
            f"--output_dir results/{baseline_method} --diversity True"
        )

100%|██████████| 105/105 [00:00<00:00, 3429.52it/s]


+----+-----+----------+---------------+
|    |   k |   ndcg@k |   precision@k |
|----+-----+----------+---------------|
|  0 |   4 | 0.957958 |      0.957143 |
|  1 |   8 | 0.953721 |      0.95119  |
|  2 |  16 | 0.93929  |      0.930952 |
|  3 |  20 | 0.932585 |      0.922381 |
+----+-----+----------+---------------+


100%|██████████| 105/105 [00:05<00:00, 19.29it/s]


+----+-----+----------------+-------------------+
|    |   k |   alpha_ndcg@k |   kl_divergence@k |
|----+-----+----------------+-------------------|
|  0 |   4 |       0.875765 |         0.157709  |
|  1 |   8 |       0.879157 |         0.141179  |
|  2 |  16 |       0.884422 |         0.108081  |
|  3 |  20 |       0.886017 |         0.0990265 |
+----+-----+----------------+-------------------+


100%|██████████| 30/30 [00:00<00:00, 3537.31it/s]


+----+-----+----------+---------------+
|    |   k |   ndcg@k |   precision@k |
|----+-----+----------+---------------|
|  0 |   4 | 0.96837  |      0.975    |
|  1 |   8 | 0.96466  |      0.966667 |
|  2 |  16 | 0.95687  |      0.954167 |
|  3 |  20 | 0.953713 |      0.95     |
+----+-----+----------+---------------+


100%|██████████| 30/30 [00:01<00:00, 19.45it/s]


+----+-----+----------------+-------------------+
|    |   k |   alpha_ndcg@k |   kl_divergence@k |
|----+-----+----------------+-------------------|
|  0 |   4 |       0.877648 |         0.150732  |
|  1 |   8 |       0.880229 |         0.136596  |
|  2 |  16 |       0.891939 |         0.107244  |
|  3 |  20 |       0.896645 |         0.0996513 |
+----+-----+----------------+-------------------+


100%|██████████| 105/105 [00:00<00:00, 3619.14it/s]


+----+-----+----------+---------------+
|    |   k |   ndcg@k |   precision@k |
|----+-----+----------+---------------|
|  0 |   4 | 0.746408 |      0.728571 |
|  1 |   8 | 0.69606  |      0.665476 |
|  2 |  16 | 0.632391 |      0.589881 |
|  3 |  20 | 0.610755 |      0.565714 |
+----+-----+----------+---------------+


100%|██████████| 105/105 [00:06<00:00, 15.41it/s]


+----+-----+----------------+-------------------+
|    |   k |   alpha_ndcg@k |   kl_divergence@k |
|----+-----+----------------+-------------------|
|  0 |   4 |       0.688805 |         0.1527    |
|  1 |   8 |       0.6558   |         0.136317  |
|  2 |  16 |       0.615283 |         0.102595  |
|  3 |  20 |       0.600987 |         0.0935152 |
+----+-----+----------------+-------------------+


100%|██████████| 30/30 [00:00<00:00, 3685.46it/s]


+----+-----+----------+---------------+
|    |   k |   ndcg@k |   precision@k |
|----+-----+----------+---------------|
|  0 |   4 | 0.674276 |      0.633333 |
|  1 |   8 | 0.640405 |      0.604167 |
|  2 |  16 | 0.591176 |      0.552083 |
|  3 |  20 | 0.569195 |      0.526667 |
+----+-----+----------+---------------+


100%|██████████| 30/30 [00:01<00:00, 19.77it/s]


+----+-----+----------------+-------------------+
|    |   k |   alpha_ndcg@k |   kl_divergence@k |
|----+-----+----------------+-------------------|
|  0 |   4 |       0.637264 |         0.144039  |
|  1 |   8 |       0.610142 |         0.12915   |
|  2 |  16 |       0.577216 |         0.100056  |
|  3 |  20 |       0.561662 |         0.0924095 |
+----+-----+----------------+-------------------+
