In [None]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
import ir_datasets
import random

  from .autonotebook import tqdm as notebook_tqdm


# Collecting data

In [2]:
dataset = ir_datasets.load("neuclir/1/multi/trec-2023")

In [None]:
# Sample documents data because of the size of the dataset
def random_sample(iterator, sample_size, total_count):
    sample_probability = sample_size / total_count
    sampled_data = []

    for item in iterator:
        if random.random() < sample_probability:
            sampled_data.append(item)
        if len(sampled_data) >= sample_size:
            break

    return pd.DataFrame(sampled_data)

qrels_df = pd.DataFrame(dataset.qrels_iter())
documents_df = random_sample(dataset.docs_iter(), 2000000, dataset.docs_count())
queries_df = pd.DataFrame(dataset.queries_iter())
qrels_doc_ids = set(qrels_df["doc_id"])
documents_df = documents_df[documents_df["doc_id"].isin(qrels_doc_ids)]

In [8]:
documents_df.head()

Unnamed: 0.1,Unnamed: 0,doc_id,title,text,url,time,cc_file
0,163,14a0a786-c707-487c-af0a-16ef97e1d023,越捷航空連三年獲評為「最佳超低廉航空公司」 - 生活,AirlineRatings頒發越捷航空 2020「最佳超低廉航空公司」殊榮。圖：越捷提供\...,https://www.chinatimes.com/realtimenews/201911...,2019-11-27T15:44:07+08:00,crawl-data/CC-NEWS/2019/11/CC-NEWS-20191127083...
1,232,fc24998f-6ed8-43f5-93aa-f71244cd50af,精品包用酒精消毒小心壞掉！快來學4招正確、實用的「清潔技巧」,疫情期間只要一出門，回到家就要把隨身物品消毒一次，那麼大家都是如何幫心愛的精品包消毒呢？其實...,https://fashion.ettoday.net/news/2024734?from=...,2021-07-08T09:09:00+08:00,crawl-data/CC-NEWS/2021/07/CC-NEWS-20210708003...
2,250,b6c8165f-b19a-4d47-84b9-0c7c7e450cf6,一室枯山水 喝一杯手冲咖啡忘憂 - 20210708 - 副刊,手搖凍啡 入口柔順\n\n一般禪味的café均以茶品為主，不過店家主打咖啡，設有手冲咖啡菜單...,https://news.mingpao.com/pns/%E5%89%AF%E5%88%8...,,crawl-data/CC-NEWS/2021/07/CC-NEWS-20210708003...
3,301,42dfc137-e3b8-4ea5-ba96-6914dc6afc1f,《電子零件》車用、儲能雙引擎 系統電6月營收月增5.97％ - 財經,系統電(5309)因車用及儲能產品需求強勁，6月合併營收為2.13億元，月成長5.97%，系...,https://www.chinatimes.com/realtimenews/202107...,2021-07-08T09:44:29+08:00,crawl-data/CC-NEWS/2021/07/CC-NEWS-20210708022...
4,306,e6887499-98df-4408-a6aa-c002a76cc801,富商遇死劫！小20歲嫩妻崩潰痛哭 遭警識破「全是演戲」,記者張寧倢／編譯\n\n印尼巴布亞省（Papua）於6月28日發生一起劫車殺人的嚴重社會案件...,https://www.ettoday.net/news/20210708/2025480....,2021-07-08T10:40:00+08:00,crawl-data/CC-NEWS/2021/07/CC-NEWS-20210708041...


In [9]:
queries_df.head()

Unnamed: 0,query_id,title,description,narrative,fa_mt_title,fa_mt_description,fa_mt_narrative,ru_mt_title,ru_mt_description,ru_mt_narrative,zh_mt_title,zh_mt_description,zh_mt_narrative
0,200,Corruption Bribery Sports Federation Olympics,Are there cases of Institutional Corruption an...,We are Looking for articles that contain a cas...,المپیک فدراسیون ورزشی رشوه خواری فساد,آیا مواردی از فساد نهادی و رشوه خواری در ورزش ...,ما به دنبال مقالاتی هستیم که حاوی یک مورد فساد...,Олимпийские игры Федерации спортивных игр корр...,Существуют ли случаи институциональной коррупц...,"Мы ищем статьи, которые содержат случай финанс...",腐败贿赂体育联合会奥运会,体育中是否存在机构腐败和贿赂的案例？,我们正在寻找在机构层面上包含金融腐败或贿赂案件的文章。奥运会和体育联合会中的贿赂案件是相关的...
1,201,China investment in Iran,In what fields or industries China’s investmen...,We need articles that specify at least one fie...,سرمایه گذاری چین در ایران,در کدام زمینه ها یا صنایع سرمایه گذاری چین در ...,ما به مقالاتی نیاز داریم که حداقل یک زمینه سرم...,Китайские инвестиции в Иран,В каких областях или промышленности инвестиции...,"Нам нужны статьи, в которых указано хотя бы од...",中国对伊朗的投资,在哪些领域或行业中，中国对伊朗的投资是？,我们需要至少指定一个投资领域的文章，无论是皮带和道路倡议，石油，水坝等。货币的投资金额不提及...
2,202,"Emerging technologies, precision farming, smar...",What are some of the latest emerging technolog...,Find articles on emerging technologies in prec...,فن آوری های نوظهور ، کشاورزی دقیق ، کشاورزی هو...,برخی از جدیدترین فن آوری های نوظهور که صنعت کش...,مقالاتی در مورد فن آوری های نوظهور در کشاورزی ...,"Новые технологии, точное сельское хозяйство, у...",Каковы некоторые из последних новых технологий...,Найдите статьи о новых технологиях в области т...,新兴技术，精确农业，智能农业，农业,哪些最新的新兴技术正在改变农业行业？,查找有关精确农业和农业中新兴技术的文章，这些文章提供了这些技术如何用于提高农作物产量并提高农...
3,203,Ever Given's Stuck,Find information about the ship Ever Given bei...,Find articles on the event when the container ...,تا به حال گیر کرده است,اطلاعات مربوط به کشتی را که در کانال سوئز گیر ...,مقالاتی را در مورد این رویداد پیدا کنید که کشت...,Когда -либо дал застрял,"Найдите информацию о корабле, когда -либо заст...","Найдите статьи на мероприятии, когда контейнер...",曾经被卡住了,查找有关被困在苏伊士运河的船的信息,在2021年春季遇到的集装箱船被困在苏伊士运河中时，请查找有关事件的事件的文章。包括有关事件...
4,204,Penalty doping sports stop,What strategies are used to reduce/stop doping...,Any mention of the monetary penalties or exclu...,مجازات دوپینگ ورزش توقف,از چه استراتژی هایی برای کاهش/متوقف کردن دوپین...,هرگونه ذکر مجازات های پولی یا محرومیت تیم ها ی...,Спортивная остановка пенальти,Какие стратегии используются для уменьшения/ос...,Любое упоминание о денежных наказаниях или иск...,点球兴奋剂运动停止,哪些策略用于减少运动中的掺杂？,包括任何提及货币罚款或排除在团队中或夺回奖牌的罚款。接受掺杂的测试将不被视为预防措施。


In [10]:
qrels_df.head()

Unnamed: 0,query_id,doc_id,relevance,iteration
0,200,00258365-6d48-49cc-901f-578b883c8226,1,fas
1,200,010fd82c-4423-41a6-ac56-4d036ccf0524,2,fas
2,200,013c3243-d1b5-47e6-bcf5-26c092ac9ff5,0,fas
3,200,01aed8a4-f1b9-4729-97f0-525338029268,0,fas
4,200,01e59322-b610-4d44-9ecd-81bfad2f2b5e,0,fas


# Ranking Documents

### Load model

In [11]:
model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
st_model = SentenceTransformer(model_name)

### Genearte embeddings

In [12]:
def generate_embeddings(texts, model, batch_size=16):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=True, show_progress_bar=False)
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)

In [13]:
documents_df = documents_df[documents_df['text'].notnull() & documents_df['text'].str.strip() != '']

In [14]:
# Generate embeddings for queries
print("Generating query embeddings...")
query_texts = queries_df['title'].tolist()
query_embeddings = generate_embeddings(query_texts, st_model)

# Generate embeddings for documents
print("Generating document embeddings...")
doc_texts = documents_df['text'].tolist()
doc_ids = documents_df['doc_id'].tolist()
doc_embeddings = generate_embeddings(doc_texts, st_model)

Generating query embeddings...
Generating document embeddings...


### Rank Documents

In [15]:
from tqdm import tqdm
from sentence_transformers import util

ranked_results = {}

for idx, query_embedding in tqdm(enumerate(query_embeddings), total=len(query_embeddings), desc="Processing queries"):
    query_id = queries_df.iloc[idx]['query_id']
    similarities = util.cos_sim(query_embedding, doc_embeddings)[0]
    ranked_indices = similarities.argsort(descending=True)
    ranked_results[query_id] = [{"doc_id": doc_ids[i], "score": similarities[i].item()} for i in ranked_indices]


Processing queries: 100%|██████████| 76/76 [07:47<00:00,  6.15s/it]


In [16]:
# Print the first 5
for query_id, results in list(ranked_results.items())[:5]:
    print(f"Query ID: {query_id}")
    for result in results[:5]:  # Print top 5 results for each query
        print(f"  Doc ID: {result['doc_id']}, Score: {result['score']}")

Query ID: 200
  Doc ID: 9946790d-e91f-4064-9ac5-67846b13d4a4, Score: 0.5723366737365723
  Doc ID: b039d113-560f-47cf-8196-3532134d5662, Score: 0.5650008916854858
  Doc ID: 67c1da92-1899-40dd-adb9-08d3aeccdc1a, Score: 0.5575284957885742
  Doc ID: fc18d54f-7185-4a57-8353-e139f98eb3f7, Score: 0.5438524484634399
  Doc ID: 3267a3dc-bbd7-4da4-9496-f8a428a1a706, Score: 0.5426214933395386
Query ID: 201
  Doc ID: 5a1b782e-cdad-49e1-8d94-c644fc3ad176, Score: 0.5919296741485596
  Doc ID: 32c4d668-da51-4ee6-a2bc-2391d318342c, Score: 0.5831509232521057
  Doc ID: 5dd60f12-902f-4aa0-b7cd-944ddf2f00d2, Score: 0.5809510350227356
  Doc ID: f4270163-4d40-4e1a-b9eb-7c74b354ea70, Score: 0.5633082389831543
  Doc ID: d0f2db63-03af-4057-8ae9-b35518c8b491, Score: 0.5630148649215698
Query ID: 202
  Doc ID: 9b51254a-639e-4862-9782-91779f59a0b0, Score: 0.5216076374053955
  Doc ID: e7bd849e-8ef5-438e-bb84-8a17f48043a8, Score: 0.5212656259536743
  Doc ID: 0844afd9-f8f4-44ed-84f5-ba50fa8d1e82, Score: 0.5177650451660

## Evaluation

In [None]:
valid_doc_ids = set(documents_df['doc_id'])
qrels_df = pd.DataFrame(dataset.qrels_iter())
qrels_df = qrels_df[qrels_df['doc_id'].isin(valid_doc_ids)]

In [17]:
import ir_measures
from ir_measures import ScoredDoc, Qrel


for query_id, docs in ranked_results.items():
    ranked_results[query_id] = sorted(docs, key=lambda x: x['score'], reverse=True)

qrels = [
    Qrel(query_id=row['query_id'], doc_id=row['doc_id'], relevance=int(row['relevance']))
    for _, row in qrels_df.iterrows()
]

results = [
    ScoredDoc(query_id=query_id, doc_id=doc['doc_id'], score=float(doc['score']))
    for query_id, docs in ranked_results.items()
    for doc in docs
]

metrics = [
    ir_measures.nDCG @ 20,   # nDCG@20
    ir_measures.AP,          # Average Precision
    ir_measures.RBP(rel=1),  # RBP with relevance=1
    ir_measures.R @ 100,     # Recall@100
    ir_measures.R @ 1000     # Recall@1000
]

evaluation = ir_measures.calc_aggregate(metrics, qrels, results)

print("Evaluation Metrics:")
for metric, value in evaluation.items():
    print(f"{metric}: {value:.4f}")


Evaluation Metrics:
R@1000: 0.3391
R@100: 0.1809
RBP(rel=1): 0.1715
AP: 0.0678
nDCG@20: 0.1380
