In [2]:
import json

# Location of the NFCorpus document collection in JSON Lines format
# (one JSON object per line).
CORPUS_PATH = "/home/guest/r12922050/GitHub/d2qplus/data/nfcorpus/corpus.jsonl"

# Decode explicitly as UTF-8: the documents contain non-ASCII characters
# (e.g. en dashes in year ranges), so relying on the platform's default
# encoding can fail or garble text on machines with a non-UTF-8 locale.
# Blank lines (such as a trailing newline at end of file) are skipped because
# json.loads raises on an empty string.
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    corpus = [json.loads(line) for line in f if line.strip()]

# Sanity check: number of documents loaded and the first record.
print(len(corpus))
print(corpus[0])

3633
{'_id': 'MED-10', 'title': 'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland', 'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 ye

# Keyword Extraction using KeyBERT + TF-IDF

In [None]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Build the per-document input once (title followed by body text); the same
# list feeds both the TF-IDF fit and the KeyBERT extraction below.
all_texts = [doc["title"] + " " + doc["text"] for doc in corpus]

# Vectorizer that proposes candidate phrases for KeyBERT.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),     # unigrams through trigrams
    max_df=0.9,             # drop terms present in more than 90% of documents
    min_df=2,               # drop terms present in fewer than 2 documents
    stop_words="english",
)

tfidf.fit(all_texts)

# mp_net = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device="cuda:2")
sci_bert = SentenceTransformer("allenai/scibert_scivocab_uncased", device="cuda:2")


kw_model = KeyBERT(model=sci_bert)

extract_params = {
    # NOTE(review): the KeyBERT documentation states that
    # `keyphrase_ngram_range` and `stop_words` are not used when a custom
    # `vectorizer` is passed; the n-gram range and stop words configured on
    # `tfidf` above are then the ones that take effect. Confirm against the
    # installed KeyBERT version.
    "keyphrase_ngram_range": (1, 3),  # unigrams through trigrams
    "stop_words": "english",          # default English stop words
    "use_mmr": True,                  # use Maximal Marginal Relevance to increase diversity
    "diversity": 0.6,                 # diversity trade-off between relevance vs novelty
    "top_n": 15,                      # extract up to 15 keyphrases per document
    "vectorizer": tfidf,              # candidate phrases come from the TF-IDF vectorizer
}

results = kw_model.extract_keywords(all_texts, **extract_params)
print(f"# of results generated:{len(results)}")

# Corpus records store their identifier under "_id" (not "doc_id").
doc_ids = [doc["_id"] for doc in corpus]

# Pair each document id with its keyword list. The pairing is positional, so
# fail loudly if the extractor did not return exactly one entry per document.
assert len(results) == len(doc_ids), (
    f"expected {len(doc_ids)} keyword lists, got {len(results)}"
)
keywords_per_doc = dict(zip(doc_ids, results))

# save to jsonl
import json
OUTPUT_PATH = "/home/guest/r12922050/GitHub/d2qplus/augmented-data/nfcorpus/keywords/scibert_1_3_gram.jsonl"
with open(OUTPUT_PATH, "w") as f:
    for doc_id, keywords in keywords_per_doc.items():
        f.write(json.dumps({"doc_id": doc_id, "keywords": keywords}) + "\n")
print(f"saved keywords to {OUTPUT_PATH}")

No sentence-transformers model found with name allenai/scibert_scivocab_uncased. Creating a new one with mean pooling.


# of results generated:3633


In [8]:
# Write one JSON line per document holding its id, title and the keyword
# list found at the same position in `results`.
OUTPUT_PATH = "/home/guest/r12922050/GitHub/d2qplus/augmented-data/nfcorpus/keywords/scibert_1_3_gram.jsonl"
with open(OUTPUT_PATH, "w") as f:
    for idx, doc in enumerate(corpus):
        record = {
            "doc_id": doc["_id"],
            "title": doc["title"],
            "keywords": results[idx],
        }
        f.write(json.dumps(record) + "\n")
print(f"saved keywords to {OUTPUT_PATH}")

saved keywords to /home/guest/r12922050/GitHub/d2qplus/augmented-data/nfcorpus/keywords/scibert_1_3_gram.jsonl


In [6]:
# Display the first corpus record to check its fields (`_id`, `title`, `text`).
corpus[0]

{'_id': 'MED-10',
 'title': 'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland',
 'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years

In [21]:
# Read the saved keyword file back (one JSON object per line) and show the
# record count together with the first record as a quick check.
with open("/home/guest/r12922050/GitHub/d2qplus/augmented-data/nfcorpus/keywords/scibert_1_3_gram.jsonl", "r") as f:
    corpus_keywords = list(map(json.loads, f))
print(len(corpus_keywords), corpus_keywords[0])

3633 {'MED-10': [['possible causal effect', 0.5809], ['nationwide cohort study', 0.5649], ['discontinue statin use', 0.5625], ['cancer registry information', 0.5324], ['characteristics treatment selection', 0.5257], ['breast cancer', 0.5231], ['specific mortality', 0.518], ['tumor characteristics', 0.517], ['95 ci 44', 0.4994], ['clinical trial testing', 0.4993], ['patients finland', 0.4434], ['users population based', 0.4357], ['2003 31', 0.404], ['ci', 0.4002], ['54', 0.3239]]}
