## Install further packages

In [None]:
!pip install sentence-transformers
!pip install rank_bm25

## Setup

In [None]:
import os
import subprocess
import sys

import numpy
import pandas as pd
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from scripts.utils import read_gold_data

In [None]:
# Load the gold data once and unpack the pieces used by the baselines below:
# the argument corpus and the train/dev queries of the "baseline" entry.
data = read_gold_data("data-release")
baseline_queries = data["baseline"]

corpus = data["corpus"]
baseline_queries_train, baseline_queries_dev = (
    baseline_queries["train"],
    baseline_queries["dev"],
)


## SBERT Baseline
Encode the plain text of the arguments and queries using a multilingual SBERT model, then rank the arguments for each query by cosine similarity.

In [None]:
# Name of the pretrained sentence-transformers checkpoint used for both
# arguments and queries.
sbert_model_name = "paraphrase-multilingual-mpnet-base-v2"
sbert_encoder = SentenceTransformer(sbert_model_name)

In [None]:
# Embed every argument text and store one embedding per corpus row so it
# stays aligned with the row's other columns.
argument_texts = corpus["argument"].values
corpus_embeddings = sbert_encoder.encode(argument_texts)
corpus["sbert_embeddings"] = [embedding for embedding in corpus_embeddings]

In [None]:
# Embed the query texts of both splits with the same encoder used for the corpus.
train_query_texts = baseline_queries_train["text"].values
dev_query_texts = baseline_queries_dev["text"].values

query_train_embeddings = sbert_encoder.encode(train_query_texts)
query_dev_embeddings = sbert_encoder.encode(dev_query_texts)

# Keep one embedding per query row, next to the query it belongs to.
baseline_queries_train["sbert_embeddings"] = [embedding for embedding in query_train_embeddings]
baseline_queries_dev["sbert_embeddings"] = [embedding for embedding in query_dev_embeddings]

In [None]:
# Cosine similarity of every query embedding against every corpus embedding;
# each row of the result holds the scores of one query over the whole corpus.
corpus_vectors = list(corpus["sbert_embeddings"].values)
train_query_vectors = list(baseline_queries_train["sbert_embeddings"].values)
dev_query_vectors = list(baseline_queries_dev["sbert_embeddings"].values)

train_similarities = cosine_similarity(train_query_vectors, corpus_vectors)
dev_similarities = cosine_similarity(dev_query_vectors, corpus_vectors)

In [None]:
# Turn the SBERT similarity matrices into ranked predictions and export them.
#
# For each query, the corpus is sorted by descending similarity and the ids of
# the best `top_k` arguments are kept, in rank order.
top_k = 1000  # maximum number of candidates written per query

# Extract the id column once. Indexing a plain array with the ranked positions
# replaces a `corpus.iloc[...]` lookup per candidate, which materialises a full
# row Series each time (up to top_k times per query).
argument_ids = corpus["argument_id"].to_numpy()

train_predictions = [
    {
        "query_id": query_id,
        "relevant_candidates": argument_ids[scores.argsort()[::-1][:top_k]].tolist(),
    }
    for query_id, scores in zip(baseline_queries_train["query_id"].tolist(), train_similarities)
]

dev_predictions = [
    {
        "query_id": query_id,
        "relevant_candidates": argument_ids[scores.argsort()[::-1][:top_k]].tolist(),
    }
    for query_id, scores in zip(baseline_queries_dev["query_id"].tolist(), dev_similarities)
]

# One JSON object per line and per query, as passed to the evaluation script.
for split_name, split_predictions in (("train", train_predictions), ("dev", dev_predictions)):
    pd.DataFrame(split_predictions).to_json(
        f"sbert_{split_name}_predictions.jsonl", orient="records", lines=True
    )

## BM25

In [None]:
# Tokenise each argument by splitting on whitespace (no lowercasing or other
# normalisation) and build the BM25 index over the tokenised corpus.
tokenized_arguments = corpus["argument"].str.split()
corpus["bm25_tokens"] = tokenized_arguments

bm25_corpus = BM25Okapi(corpus["bm25_tokens"].values)

In [None]:
# Score every query against the BM25 index. Queries are tokenised by
# whitespace splitting, the same way as the corpus. Each entry collected
# below is the score vector of one query over the corpus.
train_scores = []
for query_text in baseline_queries_train["text"]:
    train_scores.append(bm25_corpus.get_scores(query_text.split()))

dev_scores = []
for query_text in baseline_queries_dev["text"]:
    dev_scores.append(bm25_corpus.get_scores(query_text.split()))

train_similarities = numpy.array(train_scores)
dev_similarities = numpy.array(dev_scores)

In [None]:
# Turn the BM25 score matrices into ranked predictions and export them.
#
# For each query, the corpus is sorted by descending BM25 score and the ids of
# the best `top_k` arguments are kept, in rank order.
top_k = 1000  # maximum number of candidates written per query

# Extract the id column once. Indexing a plain array with the ranked positions
# replaces a `corpus.iloc[...]` lookup per candidate, which materialises a full
# row Series each time (up to top_k times per query).
argument_ids = corpus["argument_id"].to_numpy()

train_predictions = [
    {
        "query_id": query_id,
        "relevant_candidates": argument_ids[scores.argsort()[::-1][:top_k]].tolist(),
    }
    for query_id, scores in zip(baseline_queries_train["query_id"].tolist(), train_similarities)
]

dev_predictions = [
    {
        "query_id": query_id,
        "relevant_candidates": argument_ids[scores.argsort()[::-1][:top_k]].tolist(),
    }
    for query_id, scores in zip(baseline_queries_dev["query_id"].tolist(), dev_similarities)
]

# One JSON object per line and per query, as passed to the evaluation script.
for split_name, split_predictions in (("train", train_predictions), ("dev", dev_predictions)):
    pd.DataFrame(split_predictions).to_json(
        f"bm25_{split_name}_predictions.jsonl", orient="records", lines=True
    )

## Evaluation
Evaluate the train and dev baseline predictions for SBERT and BM25.

In [None]:
# Run the evaluation script once per (method, split) pair.
#
# subprocess.run is used instead of os.system so that the script's output is
# shown in the notebook and a failing run is not silently ignored.
# sys.executable makes the script run with the same interpreter (and installed
# packages) as this kernel rather than whatever `python3` is on PATH.
#
# NOTE(review): the data were loaded from "data-release" earlier in this
# notebook, but the script is pointed at "./data" -- confirm which is intended.
for baseline_method in ["sbert", "bm25"]:
    for split in ["train", "dev"]:
        command = [
            sys.executable,
            "scripts/evaluation.py",
            "--data", "./data",
            "--scenario", "baseline",
            "--split", split,
            "--predictions", f"{baseline_method}_{split}_predictions.jsonl",
            "--output_dir", f"results/{baseline_method}",
            "--diversity", "True",
            "--implicit", "False",
        ]
        completed = subprocess.run(command, capture_output=True, text=True)

        # Echo whatever the script reported so it appears in the cell output.
        print(completed.stdout, end="")
        print(completed.stderr, end="", file=sys.stderr)

        if completed.returncode != 0:
            raise RuntimeError(
                f"Evaluation failed for {baseline_method}/{split} "
                f"(exit code {completed.returncode})"
            )