# Get Few-Shot Prompts (Promptagator / InPars)

In [4]:
import json
import pandas as pd

# NFCorpus inputs: corpus and queries are JSON Lines files, the dev relevance
# judgments (qrels) are a tab-separated table.
CORPUS_PATH = "/home/guest/r12922050/GitHub/d2qplus/data/nfcorpus/corpus.jsonl"
QUERIES_PATH = "/home/guest/r12922050/GitHub/d2qplus/data/nfcorpus/queries.jsonl"
QRELS_PATH = "/home/guest/r12922050/GitHub/d2qplus/data/nfcorpus/qrels/dev.tsv"


def _read_jsonl(path):
    """Parse a JSON Lines file into a list with one decoded object per non-blank line."""
    # The context manager closes the handle as soon as parsing finishes; a bare
    # `open(...)` inside a comprehension leaves that to the garbage collector.
    # UTF-8 is requested explicitly so decoding does not depend on the locale.
    with open(path, 'r', encoding='utf-8') as handle:
        # Blank lines (e.g. a trailing newline at EOF) are skipped instead of
        # crashing json.loads.
        return [json.loads(line) for line in handle if line.strip()]


nf_corpus = _read_jsonl(CORPUS_PATH)
nf_queries = _read_jsonl(QUERIES_PATH)
dev_qrels = pd.read_csv(QRELS_PATH, sep="\t")

print(f"Number of documents in corpus: {len(nf_corpus)}")
print(nf_corpus[0])  # Print first document to check structure

print(f"Number of queries: {len(nf_queries)}")
print(nf_queries[0])  # Print first query to check structure
print(f"Number of qrels: {len(dev_qrels)}")
display(dev_qrels)


Number of documents in corpus: 3633
{'_id': 'MED-10', 'title': 'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland', 'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after t

Unnamed: 0,query-id,corpus-id,score
0,PLAIN-1,MED-2421,2
1,PLAIN-1,MED-2422,2
2,PLAIN-1,MED-2416,2
3,PLAIN-1,MED-2423,2
4,PLAIN-1,MED-2417,2
...,...,...,...
11380,PLAIN-3471,MED-5338,2
11381,PLAIN-3471,MED-5339,2
11382,PLAIN-3471,MED-5340,2
11383,PLAIN-3471,MED-5341,2


In [11]:
def get_few_shot_examples(corpus, queries, qrels, num_examples=8, seed=None):
    """Pick ``num_examples`` (query, relevant document) pairs as few-shot examples.

    Queries are visited in the key order of the grouped qrels. For each query
    one relevant document is drawn uniformly at random from those that have not
    been used by an earlier example, so every returned pair has a distinct
    query and a distinct document. A query whose relevant documents are all
    taken already (or missing from ``corpus``), or whose text is missing from
    ``queries``, is skipped and the next query is tried instead.

    Args:
        corpus: Iterable of dicts with ``'_id'`` and ``'text'`` keys (documents).
        queries: Iterable of dicts with ``'_id'`` and ``'text'`` keys (queries).
        qrels: pandas DataFrame with ``'query-id'``, ``'corpus-id'`` and
            ``'score'`` columns. Only rows with ``score > 1`` count as relevant.
        num_examples: Number of pairs to return.
        seed: Optional seed for a private random generator. When ``None`` the
            module-level ``random`` state is used, as before.

    Returns:
        A list of ``num_examples`` dicts with the keys ``query_id``,
        ``query_text``, ``doc_id`` and ``doc_text``.

    Raises:
        ValueError: If fewer than ``num_examples`` queries have a relevant
            document, or if fewer than ``num_examples`` pairs with distinct
            documents can be formed.
    """
    # Kept function-local so the definition does not rely on another cell's imports.
    import random

    rng = random if seed is None else random.Random(seed)

    corpus_dict = {doc['_id']: doc['text'] for doc in corpus}
    queries_dict = {query['_id']: query['text'] for query in queries}

    qrels_pos = qrels[qrels['score'] > 1]
    qrels_dict = qrels_pos.groupby('query-id')['corpus-id'].apply(list).to_dict()
    # At this point qrels_dict looks roughly like:
    # {
    #   'PLAIN-1': ['MED-2421', 'MED-2422', 'MED-2416', 'MED-2423', 'MED-2417', ...],
    #   'PLAIN-2': ['...', ...],
    #   ...
    # }

    query_ids = list(qrels_dict.keys())
    if len(query_ids) < num_examples:
        raise ValueError(f"Not enough queries with positive relevance in qrels. Found {len(query_ids)}, but need {num_examples}.")

    examples = []
    selected_doc_ids = set()

    for query_id in query_ids:
        if len(examples) >= num_examples:
            break
        if query_id not in queries_dict:
            # Judged query without text in `queries`: cannot build an example from it.
            continue

        # Draw only from documents that are still unused and present in the
        # corpus. Re-drawing until an unused id appears would never terminate
        # once every relevant document of this query has been taken.
        candidates = [
            doc_id for doc_id in qrels_dict[query_id]
            if doc_id not in selected_doc_ids and doc_id in corpus_dict
        ]
        if not candidates:
            continue

        doc_id = rng.choice(candidates)
        selected_doc_ids.add(doc_id)
        examples.append({
            "query_id": query_id,
            "query_text": queries_dict[query_id],
            "doc_id": doc_id,
            "doc_text": corpus_dict[doc_id],
        })

    if len(examples) < num_examples:
        raise ValueError(f"Not enough unique examples found. Found {len(examples)}, but need {num_examples}.")

    return examples


In [12]:
# Draw eight (query, document) pairs from the dev-split relevance judgments.
examples = get_few_shot_examples(
    nf_corpus,
    nf_queries,
    dev_qrels,
    num_examples=8,
)

# Persist the pairs as JSON Lines: one serialized example per line.
FEW_SHOT_EXAMPLES_OUTPUT_PATH = "/home/guest/r12922050/GitHub/d2qplus/prompts/promptagator/few_shot_examples.jsonl"
with open(FEW_SHOT_EXAMPLES_OUTPUT_PATH, mode='w') as f:
    for example in examples:
        # print() appends the newline that terminates each record.
        print(json.dumps(example), file=f)
print(f"Few-shot examples saved to {FEW_SHOT_EXAMPLES_OUTPUT_PATH}")

Few-shot examples saved to /home/guest/r12922050/GitHub/d2qplus/prompts/promptagator/few_shot_examples.jsonl
