# Topic Coverage Experiment
In this notebook, I test whether my Topic Coverage query-generation method translates into higher retrieval recall.

In [None]:
import pandas as pd

# Read the NFCorpus test relevance judgements and keep only the
# (query, document) pairs with a positive relevance score.
qrels = pd.read_csv(
    "/home/guest/r12922050/GitHub/d2qplus/data/nfcorpus/qrels/test.tsv",
    sep="\t",
).query("score > 0")
qrels

In [None]:
# Map every document id to the list of query ids judged relevant to it.
doc_to_queries = {
    doc_id: query_ids.tolist()
    for doc_id, query_ids in qrels.groupby("corpus-id")["query-id"]
}
# The keys are exactly the documents that have at least one relevant query.
eligible_docs = set(doc_to_queries.keys())
print(f"{len(eligible_docs):,} documents have at least one relevant query")

3,128 documents have at least one relevant query


In [None]:
#!/usr/bin/env python
# sample_docs_no_pathlib.py
import json
import random
import pickle
import pandas as pd

# ---------- file names ----------
# Inputs: the corpus and queries are JSON-lines files (one JSON object per line);
# the qrels file is read below as tab-separated text.
CORPUS_PATH   = "/home/guest/r12922050/GitHub/d2qplus/data/nfcorpus/corpus.jsonl"
QRELS_PATH    = "/home/guest/r12922050/GitHub/d2qplus/data/nfcorpus/qrels/test.tsv"          # read with sep="\t"; change that separator if the file is comma-separated
QUERIES_PATH = "/home/guest/r12922050/GitHub/d2qplus/data/nfcorpus/queries.jsonl"
# Output: one JSON object per sampled document, written relative to the working directory.
OUT_DOCS      = "./sampled_docs.jsonl"

# ---------- parameters ----------
SAMPLE_SIZE   = 1000                 # number of eligible documents to draw
RANDOM_SEED   = 42                   # reproducible sampling

# ---------- 1. load qrels ----------
# Read the judgements, force the score column to int, and keep only the
# relevant (score > 0) query/document pairs.
qrels = (
    pd.read_csv(QRELS_PATH, sep="\t")
      .assign(score=lambda frame: frame["score"].astype(int))
      .loc[lambda frame: frame["score"] > 0]
)

# document id -> list of ids of the queries judged relevant to it
doc_to_queries_full = qrels.groupby("corpus-id")["query-id"].agg(list).to_dict()

eligible_docs = set(doc_to_queries_full.keys())
print(f"{len(eligible_docs):,} documents with ≥1 relevant query")

# ---------- 2. stream-read the corpus & keep eligible ----------
# Parse the corpus one line at a time and retain only the documents that
# have at least one relevant query.
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    docs = [
        record
        for record in map(json.loads, f)
        if record["_id"] in eligible_docs
    ]

print(f"Loaded {len(docs):,} eligible documents")

# random sample documents
# random.sample raises ValueError when asked for more items than the population
# holds, so clamp the request to the number of eligible documents and say so.
random.seed(RANDOM_SEED)
n_to_sample = min(SAMPLE_SIZE, len(docs))
if n_to_sample < SAMPLE_SIZE:
    print(f"Only {len(docs):,} eligible documents available; sampling all of them instead of {SAMPLE_SIZE:,}")
sampled_docs = random.sample(docs, n_to_sample)

# construct qid -> query_text mapping
# `queries` keeps every parsed query record; `query_to_text` indexes the text by id.
queries = []
with open(QUERIES_PATH, "r", encoding="utf-8") as f:
    for raw_line in f:
        queries.append(json.loads(raw_line))

query_to_text = {}
for query_record in queries:
    query_to_text[query_record["_id"]] = query_record["text"]

# One output record per sampled document: its id, title, text, and the text of
# every query judged relevant to it.
out = [
    {
        "doc_id": doc["_id"],
        "title": doc["title"],
        "text": doc["text"],
        "queries": [query_to_text[qid] for qid in doc_to_queries_full[doc["_id"]]],
    }
    for doc in sampled_docs
]

# Write the records as JSON lines, keeping non-ASCII characters unescaped.
with open(OUT_DOCS, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in out)
print(f"Saved {len(out):,} sampled documents to {OUT_DOCS}")
    

3,128 documents with ≥1 relevant query
Loaded 3,128 eligible documents
Saved 1,000 sampled documents to ./sampled_docs.jsonl


In [12]:
# Peek at the first eligible corpus document to check its fields (_id, title, text).
docs[0]

{'_id': 'MED-10',
 'title': 'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland',
 'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years

## RQ: Do the BERTopic topics I extracted line up with the real (test-set) queries that were judged relevant for each document?

In [1]:
import pandas as pd
# Load the pickled topic table for NFCorpus. Despite the name, the displayed
# output is a DataFrame with one row per topic and the columns Topic, Count,
# Name, Representation, Representative_Docs and Enhanced_Topic.
# NOTE(review): topic -1 is presumably the BERTopic outlier bucket — confirm.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
topic_model = pd.read_pickle("/home/guest/r12922050/GitHub/d2qplus/topics/nfcorpus/topic_model_sentence_enhanced.pickle")
topic_model

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Enhanced_Topic
0,-1,10153,-1_diet_were_fat_these,"[diet, were, fat, these, and, was, for, the, a...","[For the 0, 300, and 600 mg/day SDG groups, re...",Effects of SDG on Prostate Symptoms and Mortality
1,0,280,0_no_any_there_differences,"[no, any, there, differences, observed, signif...",[There were no significant changes in body wei...,No significant changes observed in study groups
2,1,250,1_antioxidant_antioxidants_capacity_assay,"[antioxidant, antioxidants, capacity, assay, r...",[DESIGN: The ferric-reducing ability of plasma...,Antioxidant capacity and activity assays
3,2,207,2_cholesterol_lipoprotein_hdl_ldl,"[cholesterol, lipoprotein, hdl, ldl, density, ...",[Before the intervention and at 4 and 8 weeks ...,Lipid profile changes after intervention
4,3,151,3_ncds_communicable_ncd_public,"[ncds, communicable, ncd, public, world, polic...",[Effective approaches for large-scale NCD prev...,Non-communicable disease prevention and contro...
...,...,...,...,...,...,...
1283,1282,5,1282_spl_c22_5n_ps,"[spl, c22, 5n, ps, pe, omnivores, pc, sphingo,...",[The concentrations of the fatty acids 20:3n-6...,Effects of dietary fat intake on n-3 and n-6 f...
1284,1283,5,1283_plentiful_ironically_easy_access,"[plentiful, ironically, easy, access, produces...",[To assess the direct effect of postprandial t...,Effects of high-fat diet on atherosclerosis
1285,1284,5,1284_offensive_guard_rear_essence,"[offensive, guard, rear, essence, strategy, de...","[In essence, this is an offensive strategy., T...",Military Defensive Strategy
1286,1285,5,1285_ra_offal_877_1968,"[ra, offal, 877, 1968, correlations, 1978, lin...",[It is hypothesized that meat and offal may be...,Association between meat consumption and rheum...


In [2]:
# Index the topic table by topic id, keeping only the fields used downstream:
#   Representation       - list of words
#   Representative_Docs  - list of sentences
#   Enhanced_Topic       - the enhanced topic label
_topic_fields = ('Representation', 'Representative_Docs', 'Enhanced_Topic')
topic_model_info = {
    row['Topic']: {field: row[field] for field in _topic_fields}
    for _, row in topic_model.iterrows()
}

In [None]:
import torch
from sentence_transformers import SentenceTransformer

model_name = "sentence-transformers/all-mpnet-base-v2"

# The experiment was pinned to the third GPU ("cuda:2"). Keep that preference when
# the device exists, but fall back to any GPU, then the CPU, so the cell still runs
# on machines with fewer devices instead of failing at model load.
if torch.cuda.device_count() > 2:
    embed_device = "cuda:2"
elif torch.cuda.is_available():
    embed_device = "cuda"
else:
    embed_device = "cpu"
embedder = SentenceTransformer(model_name, device=embed_device)


def embed(texts):
    """Encode `texts` with the shared sentence-transformer model.

    Args:
        texts: a list of strings to encode.

    Returns:
        A NumPy array with one L2-normalised embedding per input text, so the dot
        product of two returned vectors is their cosine similarity.
    """
    return embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)


# embed for each topic
# Build the three text variants for all topics first and encode each variant in a
# single batched call, instead of three one-item encode calls per topic.
_topic_infos = list(topic_model_info.values())
_rep_texts = [" ".join(info['Representation']) for info in _topic_infos]
_enhanced_texts = [info['Enhanced_Topic'] for info in _topic_infos]
_rep_enhanced_texts = [rep + " " + enhanced for rep, enhanced in zip(_rep_texts, _enhanced_texts)]

for info, rep_vec, enhanced_vec, rep_enhanced_vec in zip(
    _topic_infos,
    embed(_rep_texts),
    embed(_enhanced_texts),
    embed(_rep_enhanced_texts),
):
    info['Representation_Embed'] = rep_vec            # keyword list joined into one string
    info['Enhanced_Topic_Embed'] = enhanced_vec       # enhanced topic label only
    info['Rep_Enhanced_Embed'] = rep_enhanced_vec     # keywords followed by the enhanced label

In [17]:
import json  # imported here as well so this cell does not depend on an earlier cell having run

# Reload the sampled documents (doc_id, title, text, queries) from disk.
# The sampling cell writes its output as UTF-8 with ensure_ascii=False, so read it
# back with the same encoding rather than the platform default.
# NOTE(review): this absolute path is presumably the same file as the sampling
# cell's "./sampled_docs.jsonl" — confirm.
with open("/home/guest/r12922050/GitHub/d2qplus/playground/topic-modeling/sampled_docs.jsonl", "r", encoding="utf-8") as f:
    sampled_docs = [json.loads(line) for line in f]
sampled_docs[0]

{'doc_id': 'MED-4635',
 'title': 'Globalization, diet, and health: an example from Tonga.',
 'text': "The increased flow of goods, people, and ideas associated with globalization have contributed to an increase in noncommunicable diseases in much of the world. One response has been to encourage lifestyle changes with educational programmes, thus controlling the lifestyle-related disease. Key assumptions with this approach are that people's food preferences are linked to their consumption patterns, and that consumption patterns can be transformed through educational initiatives. To investigate these assumptions, and policies that derive from it, we undertook a broad-based survey of food-related issues in the Kingdom of Tonga using a questionnaire. Data on the relationships between food preferences, perception of nutritional value, and frequency of consumption were gathered for both traditional and imported foods. The results show that the consumption of health-compromising imported food

In [21]:
# calculate average cosine similarity of queries to topic representation
with open("/home/guest/r12922050/GitHub/d2qplus/topics/nfcorpus/enhanced_rep_sentence.jsonl", "r") as f:
    doc_topic_info = [json.loads(line) for line in f]
doc_topic_info_dict = {doc["doc_id"]: doc['topics'] for doc in doc_topic_info}
doc_topic_info_dict['MED-10']

[{'topic_id': 526, 'weight': 0.428571},
 {'topic_id': 96, 'weight': 0.142857},
 {'topic_id': 1021, 'weight': 0.428571}]

In [None]:
import numpy as np

# For each sampled document, average the cosine similarity between every relevant
# query and every topic assigned to the document, once per topic text variant.
# Topic weights are not used: every (query, topic) pair counts equally.
#
# output key -> field of topic_model_info holding the topic embedding to compare against
_TOPIC_SIM_FIELDS = {
    'avg_cosine_sim_rep': 'Representation_Embed',
    'avg_cosine_sim_enhanced': 'Enhanced_Topic_Embed',
    'avg_cosine_sim_rep_enhanced': 'Rep_Enhanced_Embed',
}

avg_cosine_sim = {} # doc_id -> avg cosine similarity
for doc in sampled_docs:
    doc_id = doc['doc_id']
    queries = doc['queries'] # list of queries
    topic_ids = [t['topic_id'] for t in doc_topic_info_dict[doc_id]]

    if not queries or not topic_ids:
        # No (query, topic) pair to score. Record NaN explicitly instead of taking the
        # mean of an empty list, which yields the same NaN plus RuntimeWarnings.
        avg_cosine_sim[doc_id] = {key: np.nan for key in _TOPIC_SIM_FIELDS}
        continue

    # Embed the document's queries once, in one batch, rather than once per topic.
    query_embeds = embed(queries)                      # one row per query

    doc_scores = {}
    for key, embed_field in _TOPIC_SIM_FIELDS.items():
        topic_embeds = np.stack([topic_model_info[tid][embed_field] for tid in topic_ids])
        # embed() returns L2-normalised vectors, so the dot product is the cosine
        # similarity. The matrix holds one entry per (query, topic) pair.
        doc_scores[key] = np.mean(query_embeds @ topic_embeds.T)
    avg_cosine_sim[doc_id] = doc_scores
avg_cosine_sim

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


{'MED-4635': {'avg_cosine_sim_rep': 0.07250403,
  'avg_cosine_sim_enhanced': 0.06542132,
  'avg_cosine_sim_rep_enhanced': 0.06145745},
 'MED-1578': {'avg_cosine_sim_rep': 0.17333111,
  'avg_cosine_sim_enhanced': 0.17362395,
  'avg_cosine_sim_rep_enhanced': 0.16370457},
 'MED-1150': {'avg_cosine_sim_rep': 0.15503637,
  'avg_cosine_sim_enhanced': 0.100239694,
  'avg_cosine_sim_rep_enhanced': 0.14868991},
 'MED-5152': {'avg_cosine_sim_rep': 0.15824738,
  'avg_cosine_sim_enhanced': 0.16387501,
  'avg_cosine_sim_rep_enhanced': 0.15147369},
 'MED-2421': {'avg_cosine_sim_rep': 0.11859123,
  'avg_cosine_sim_enhanced': 0.09196148,
  'avg_cosine_sim_rep_enhanced': 0.10999618},
 'MED-2260': {'avg_cosine_sim_rep': 0.18008839,
  'avg_cosine_sim_enhanced': 0.13388796,
  'avg_cosine_sim_rep_enhanced': 0.14655413},
 'MED-2152': {'avg_cosine_sim_rep': 0.19772732,
  'avg_cosine_sim_enhanced': 0.13955253,
  'avg_cosine_sim_rep_enhanced': 0.15693872},
 'MED-1723': {'avg_cosine_sim_rep': 0.100573815,
  'av

In [27]:

# Corpus-level summary: average each per-document score over all sampled documents,
# skipping documents whose score is NaN.
_topic_metrics = (
    "avg_cosine_sim_rep",
    "avg_cosine_sim_enhanced",
    "avg_cosine_sim_rep_enhanced",
)
results = {
    metric: np.nanmean([doc_scores[metric] for doc_scores in avg_cosine_sim.values()])
    for metric in _topic_metrics
}
results

{'avg_cosine_sim_rep': 0.1112142515903303,
 'avg_cosine_sim_enhanced': 0.10548924442992584,
 'avg_cosine_sim_rep_enhanced': 0.10203854355672345}

In [30]:
import numpy as np

# Baseline for the topic comparison: per document, how similar the relevant queries
# are to the document's own text and title, and how similar text and title are to
# each other.
# NOTE: this rebinds `avg_cosine_sim` with a different set of keys than the
# query-vs-topic cell; re-run that cell before re-running its summary.
avg_cosine_sim = {} # doc_id -> avg cosine similarity
for doc in sampled_docs:
    doc_id = doc['doc_id']
    doc_text = doc['text']
    doc_title = doc['title']
    doc_text_embed = embed([doc_text])[0]
    doc_title_embed = embed([doc_title])[0]
    queries = doc['queries'] # list of queries

    # embed() returns L2-normalised vectors, so dot products are cosine similarities.
    # The text-title similarity does not depend on the queries: compute it once.
    cos_sim_text_title = np.float64(np.dot(doc_text_embed, doc_title_embed))

    if queries:
        # Embed all of the document's queries in a single batch (one row per query).
        query_embeds = embed(queries)
        avg_query_text = np.mean((query_embeds @ doc_text_embed).astype(np.float64))
        avg_query_title = np.mean((query_embeds @ doc_title_embed).astype(np.float64))
    else:
        # No queries to score; NaN is skipped by the nanmean summary downstream.
        avg_query_text = np.nan
        avg_query_title = np.nan

    avg_cosine_sim[doc_id] = {
        'avg_cos_sim_text_title': cos_sim_text_title,
        'avg_cos_sim_query_text': avg_query_text,
        'avg_cos_sim_query_title': avg_query_title,
    }
avg_cosine_sim

{'MED-4635': {'avg_cos_sim_text_title': 0.8181056976318359,
  'avg_cos_sim_query_text': 0.13240932952612638,
  'avg_cos_sim_query_title': 0.16894018603488803},
 'MED-1578': {'avg_cos_sim_text_title': 0.8104920387268066,
  'avg_cos_sim_query_text': 0.2982138991355896,
  'avg_cos_sim_query_title': 0.30337146297097206},
 'MED-1150': {'avg_cos_sim_text_title': 0.8568789958953857,
  'avg_cos_sim_query_text': 0.2798227866490682,
  'avg_cos_sim_query_title': 0.24127397686243057},
 'MED-5152': {'avg_cos_sim_text_title': 0.8424941897392273,
  'avg_cos_sim_query_text': 0.23720304667949677,
  'avg_cos_sim_query_title': 0.2830433249473572},
 'MED-2421': {'avg_cos_sim_text_title': 0.8563762903213501,
  'avg_cos_sim_query_text': 0.23848023265600204,
  'avg_cos_sim_query_title': 0.10720447823405266},
 'MED-2260': {'avg_cos_sim_text_title': 0.9218974113464355,
  'avg_cos_sim_query_text': 0.12088546560456355,
  'avg_cos_sim_query_title': 0.09137757929662864},
 'MED-2152': {'avg_cos_sim_text_title': 0.7

In [32]:
# Corpus-level summary of the document baseline: average each per-document score
# over all sampled documents, skipping NaN entries.
# summary key -> per-document key it aggregates
_summary_to_doc_key = {
    "avg_cosine_text_title": "avg_cos_sim_text_title",
    "avg_cos_sim_query_text": "avg_cos_sim_query_text",
    "avg_cos_sim_query_title": "avg_cos_sim_query_title",
}
results = {
    summary_key: np.nanmean([doc_scores[doc_key] for doc_scores in avg_cosine_sim.values()])
    for summary_key, doc_key in _summary_to_doc_key.items()
}
results

{'avg_cosine_text_title': 0.8202642796039581,
 'avg_cos_sim_query_text': 0.22855729575399447,
 'avg_cos_sim_query_title': 0.2111831443273495}