In [5]:
import pandas as pd
import pickle
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sentence_transformers import SentenceTransformer, util
import torch
from nltk.tokenize import word_tokenize
import nltk

In [6]:
df = pd.read_csv('lcr_input_final.csv')
abstracts = list(set(df['abstract']))

In [7]:
nltk.download('punkt')

# Prepare the BM25 model
tokenized_abstracts = [word_tokenize(str(abstract)) for abstract in abstracts]
bm25 = BM25Okapi(tokenized_abstracts)


scincl = SentenceTransformer('/Users/borankahraman/ITU/lcr/lcr_yeni/fine-tuned-scincl-2')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/borankahraman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
No sentence-transformers model found with name /Users/borankahraman/ITU/lcr/lcr_yeni/fine-tuned-scincl-2. Creating a new one with MEAN pooling.


In [8]:
# Query and rerank function
def query_and_rerank(query, bm25, scincl_model, top_k=10):
    # Tokenize the query
    tokenized_query = query.split()
    
    # Get BM25 top_k results
    bm25_scores = bm25.get_scores(tokenized_query)
    top_indices = bm25_scores.argsort()[-top_k:][::-1]
    top_abstracts = [abstracts[i] for i in top_indices]

    # Encode the query and abstracts
    query_embedding = scincl_model.encode(query, convert_to_tensor=True)
    abstract_embeddings = scincl_model.encode(top_abstracts, convert_to_tensor=True)
    
    # Compute cosine similarities
    cosine_scores = util.pytorch_cos_sim(query_embedding, abstract_embeddings)[0]
    
    # Sort by similarity scores
    sorted_indices = torch.argsort(cosine_scores, descending=True)
    
    # Collect the top 10 similar abstracts
    reranked_top_abstracts = [top_abstracts[i] for i in sorted_indices[:10]]

    return reranked_top_abstracts

# Example query
query = "The political system of a country defines the set of formal legal institutions that constitute a government or a state and establishes the distribution of power and resources among its citizens and government officials."

# Get the most suitable abstract for the query
most_suitable_abstract = query_and_rerank(query, bm25, scincl)


In [9]:
df = pd.read_csv('eval_dataset.csv')

In [10]:
context_l = df['context'].tolist()
eval_abstract = df['abstract'].tolist()

In [27]:
tp_1 = 0
tp_5 = 0
tp_10 = 0

In [28]:
# Evaluate the model by calculating precision@1, precision@5, and precision@10

for i in range(len(context_l)):
    most_suitable_abstracts = query_and_rerank(context_l[i], bm25, scincl)

    most_suitable_abstract = most_suitable_abstracts[0]

    if eval_abstract[i] == most_suitable_abstract:
        tp_1 += 1
    
    if eval_abstract[i] in most_suitable_abstracts[:5]:
        tp_5 += 1

    if eval_abstract[i] in most_suitable_abstracts:
        tp_10 += 1

query_count = len(context_l)

print(f"Precision @ 1: {tp_1/query_count}")
print(f"Precision @ 5: {tp_5/query_count}")
print(f"Precision @ 10: {tp_10/query_count}")

Precision @ 1: 0.021806853582554516
Precision @ 5: 0.11682242990654206
Precision @ 10: 0.2071651090342679


In [38]:
# Calculate MRR

mrr = 0

for i in range(len(context_l)):
    most_suitable_abstracts = query_and_rerank(context_l[i], bm25, scincl)

    if eval_abstract[i] in most_suitable_abstracts:
        mrr += 1/(most_suitable_abstracts.index(eval_abstract[i]) + 1) 
        

print(f"MRR: {mrr/query_count}")

MRR: 0.06262671215942245
