In [2]:
import json

CORPUS_PATH = "/home/guest/r12922050/GitHub/d2qplus/data/nfcorpus/corpus.jsonl"
with open(CORPUS_PATH, "r") as f:
    corpus = [json.loads(line) for line in f]
print(len(corpus))
print(corpus[0])

3633
{'_id': 'MED-10', 'title': 'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland', 'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 ye

# Keyword Extraction using KeyBERT + TF-IDF

In [None]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

tfidf = TfidfVectorizer(
    ngram_range=(1,3),      # or (1,1)/(2,2) depending on your n-gram needs
    max_df=0.9,             # drop very frequent tokens
    min_df=2,               # drop extremely rare tokens
    stop_words="english"
)

tfidf.fit([doc["title"] + " " + doc["text"] for doc in corpus])

# mp_net = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device="cuda:2")
sci_bert = SentenceTransformer("allenai/scibert_scivocab_uncased", device="cuda:2")


kw_model = KeyBERT(model=sci_bert)

extract_params = {
    "keyphrase_ngram_range": (1, 3),  # unigrams + bigrams
    "stop_words": "english",          # default English stop words
    "use_mmr": True,                  # use Maximal Marginal Relevance to increase diversity
    "diversity": 0.6,                 # diversity trade-off between relevance vs novelty
    "top_n": 15,                       # extract up to 10 keyphrases per document
    "vectorizer": tfidf,             # use the fitted TF-IDF vectorizer
}

all_texts = [doc["title"] + " " + doc["text"] for doc in corpus]
results = kw_model.extract_keywords(all_texts, **extract_params)
print(f"# of results generated:{len(results)}")

doc_ids = [doc["doc_id"] for doc in corpus]
# zip doc_ids with results



# save to jsonl
import json
OUTPUT_PATH = "/home/guest/r12922050/GitHub/d2qplus/augmented-data/nfcorpus/keywords/scibert_1_3_gram.jsonl"
with open(OUTPUT_PATH, "w") as f:
    for doc_id, keywords in keywords_per_doc.items():
        f.write(json.dumps({"doc_id": doc_id, "keywords": keywords}) + "\n")
print(f"saved keywords to {OUTPUT_PATH}")

No sentence-transformers model found with name allenai/scibert_scivocab_uncased. Creating a new one with mean pooling.


# of results generated:3633


In [8]:
OUTPUT_PATH = "/home/guest/r12922050/GitHub/d2qplus/augmented-data/nfcorpus/keywords/scibert_1_3_gram.jsonl"
with open(OUTPUT_PATH, "w") as f:
    for i in range(len(corpus)):
        doc_id = corpus[i]["_id"]
        title = corpus[i]["title"]
        keywords = results[i]
        f.write(json.dumps({"doc_id": doc_id, "title": title, "keywords": keywords}) + "\n")
print(f"saved keywords to {OUTPUT_PATH}")

saved keywords to /home/guest/r12922050/GitHub/d2qplus/augmented-data/nfcorpus/keywords/scibert_1_3_gram.jsonl


In [6]:
corpus[0]

{'_id': 'MED-10',
 'title': 'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland',
 'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years

In [21]:
with open(f"/home/guest/r12922050/GitHub/d2qplus/augmented-data/nfcorpus/keywords/scibert_1_3_gram.jsonl", "r") as f:
    corpus_keywords = [json.loads(line) for line in f]
print(len(corpus_keywords), corpus_keywords[0])

3633 {'MED-10': [['possible causal effect', 0.5809], ['nationwide cohort study', 0.5649], ['discontinue statin use', 0.5625], ['cancer registry information', 0.5324], ['characteristics treatment selection', 0.5257], ['breast cancer', 0.5231], ['specific mortality', 0.518], ['tumor characteristics', 0.517], ['95 ci 44', 0.4994], ['clinical trial testing', 0.4993], ['patients finland', 0.4434], ['users population based', 0.4357], ['2003 31', 0.404], ['ci', 0.4002], ['54', 0.3239]]}


## Core keyword extraction

> CCQGen

Core phrases identification. From each document, we
identify core phrases used to describe its concepts. These phrases
offer fine-grained details not captured at the topic level. We note
that not all phrases in the document are equally important. Core
phrases should describe concepts strongly relevant to the document
but not frequently covered by other documents with similar topics.
For example, among documents about ‘recommender system’ topic,
the phrase ‘user-item interaction’ is very commonly used, and less
likely to represent the most important concepts of the document.

所以我們也應該需要考慮 keywords relevant to this document but not frequently covered by other documents with similar topics. 而不是直接使用整個 topic 的 keyword list 來去 guide generation


In [1]:
DOC_TOPICS_PATH = "/home/guest/r12922050/GitHub/d2qplus/augmented-data/CSFCube-1.1/topics/0609-pritamdeka_scibert-biobert-pos-keybert-mmr/doc_topics.jsonl"
TOPIC_INFO_PKL = "/home/guest/r12922050/GitHub/d2qplus/augmented-data/CSFCube-1.1/topics/0609-pritamdeka_scibert-biobert-pos-keybert-mmr/topic_info_dataframe_enhanced.pkl"
CORPUS_PATH = "/home/guest/r12922050/GitHub/d2qplus/data/CSFCube-1.1/corpus.jsonl"


In [5]:
import json
with open(DOC_TOPICS_PATH, "r") as f:
    doc_topics = [json.loads(line) for line in f]
print(len(doc_topics), doc_topics[3])

4207 {'doc_id': '16421850', 'topics': [{'topic_id': 89, 'weight': 0.25}, {'topic_id': 36, 'weight': 0.25}, {'topic_id': 72, 'weight': 0.25}, {'topic_id': 23, 'weight': 0.25}]}


In [None]:
from bertopic import BERTopic
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Sample documents
docs = [doc['text'] for doc in all_docs][:50]

# Step 1: Use BERTopic to identify topics
topic_model = BERTopic(calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(docs)

# Step 2: Group documents by topic
topic_docs = {}
for doc, topic in zip(docs, topics):
    if topic not in topic_docs:
        topic_docs[topic] = []
    topic_docs[topic].append(doc)

# Step 3: Initialize KeyBERT
kw_model = KeyBERT()

# Step 4: Extract topic-level keywords to understand the topic's general content
topic_keywords = {}
for topic, topic_doc_list in topic_docs.items():
    if topic == -1:  # Skip outliers
        continue
    # Combine all documents in the topic to get topic-level keywords
    combined_text = " ".join(topic_doc_list)
    topic_keywords[topic] = kw_model.extract_keywords(
        combined_text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=10
    )

# Step 5: Extract document-level keywords and filter based on topic context
document_keywords = []
for doc_idx, (doc, topic) in enumerate(zip(docs, topics)):
    if topic == -1:  # Skip outliers
        document_keywords.append((doc, []))
        continue
    
    # Extract document-level keywords
    doc_keywords = kw_model.extract_keywords(
        doc,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=5,
        use_mmr=True,
        diversity=0.5
    )
    
    # Get topic-level keywords for comparison
    topic_kw = topic_keywords.get(topic, [])
    topic_kw_set = set(kw[0] for kw in topic_kw)
    
    # Filter document keywords: keep only those that are not too common in the topic
    filtered_keywords = []
    for keyword, score in doc_keywords:
        # Check if the keyword is in the top topic keywords (i.e., too common in the topic)
        if keyword not in topic_kw_set:
            filtered_keywords.append((keyword, score))
        else:
            # Optionally, reduce the score of common keywords instead of filtering them out
            adjusted_score = score * 0.5  # Reduce score for common keywords
            filtered_keywords.append((keyword, adjusted_score))
    
    # Sort by adjusted score and take top 5
    filtered_keywords = sorted(filtered_keywords, key=lambda x: x[1], reverse=True)[:5]
    document_keywords.append((doc, filtered_keywords))

# Step 6: Output the results
for doc, keywords in document_keywords:
    print(f"Document: {doc}")
    print(f"Keywords: {keywords}\n")

In [6]:
import numpy as np
import json
from keybert import KeyBERT
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from collections import defaultdict
from tqdm import tqdm

class CorePhraseExtractor:
    """
    Extract core phrases using the CCQGen methodology with existing topic assignments.
    Core phrases are relevant to the document but distinctive within its topic group.
    """
    
    def __init__(self, embedding_model="allenai/scibert_scivocab_uncased", device="cuda:2"):
        """Initialize the extractor with KeyBERT and embedding model."""
        embedder = SentenceTransformer(embedding_model, device=device)
        self.kw_model = KeyBERT(model=embedder)
        
    def extract_core_phrases(self, corpus, doc_topics, 
                           top_n_candidates=20, 
                           selection_ratio=0.2,
                           min_phrases_per_doc=1,
                           max_phrases_per_doc=8,
                           keyphrase_ngram_range=(1, 3),
                           use_mmr=True,
                           diversity=0.7):
        """
        Extract core phrases for each document using the CCQGen distinctiveness score.
        
        Args:
            corpus: List of documents with '_id' and 'text' keys
            doc_topics: List of topic assignments with 'doc_id' and 'topics' keys
            top_n_candidates: Number of candidate phrases to extract initially
            selection_ratio: Ratio of candidates to select as core phrases (0.2 = top 20%)
            min_phrases_per_doc: Minimum number of phrases per document
            max_phrases_per_doc: Maximum number of phrases per document
            keyphrase_ngram_range: N-gram range for phrase extraction
            use_mmr: Use Maximal Marginal Relevance for diversity
            diversity: Diversity parameter for MMR (higher = more diverse)
        """
        
        # Create document lookup
        doc_lookup = {doc['_id']: doc for doc in corpus}
        topic_assignment = {item['doc_id']: item['topics'] for item in doc_topics}
        
        # Group documents by their assigned topics (weighted by topic strength)
        print("Grouping documents by topics...")
        topic_to_docs = self._group_docs_by_topics(topic_assignment)
        
        # Create BM25 models for each topic
        print("Creating BM25 models for each topic...")
        topic_bm25_models = self._create_topic_bm25_models(topic_to_docs, doc_lookup)
        
        # Extract core phrases for each document
        print("Extracting core phrases...")
        doc_core_phrases = {}
        
        for doc_item in tqdm(doc_topics, desc="Processing documents"):
            doc_id = doc_item['doc_id']
            doc_topics_list = doc_item['topics']
            
            if doc_id not in doc_lookup:
                continue
                
            doc_text = doc_lookup[doc_id]['text']
            
            # Extract candidate phrases with improved KeyBERT parameters
            candidate_phrases = self.kw_model.extract_keywords(
                doc_text,
                keyphrase_ngram_range=keyphrase_ngram_range,
                stop_words='english',
                top_n=top_n_candidates,
                use_mmr=use_mmr,
                diversity=diversity,  # Higher diversity to avoid redundant phrases
                use_maxsum=False,     # MMR is generally better than MaxSum
                nr_candidates=top_n_candidates * 2  # More candidates for MMR to choose from
            )
            
            if not candidate_phrases:
                doc_core_phrases[doc_id] = []
                continue
            
            # Calculate distinctiveness scores for each candidate
            distinctive_phrases = self._calculate_distinctiveness_scores(
                candidate_phrases, doc_id, doc_topics_list, 
                topic_to_docs, topic_bm25_models, doc_lookup
            )
            
            # Select top phrases based on distinctiveness
            num_to_select = max(
                min_phrases_per_doc,
                min(max_phrases_per_doc, int(len(distinctive_phrases) * selection_ratio))
            )
            
            distinctive_phrases.sort(key=lambda x: x[1], reverse=True)
            selected_phrases = distinctive_phrases[:num_to_select]
            
            doc_core_phrases[doc_id] = [
                {"phrase": phrase, "distinctiveness_score": score} 
                for phrase, score in selected_phrases
            ]
        
        return doc_core_phrases
    
    def _group_docs_by_topics(self, topic_assignment):
        """Group documents by their assigned topics, considering topic weights."""
        topic_to_docs = defaultdict(list)
        
        for doc_id, topics in topic_assignment.items():
            for topic_info in topics:
                topic_id = topic_info['topic_id']
                weight = topic_info['weight']
                topic_to_docs[topic_id].append({
                    'doc_id': doc_id,
                    'weight': weight
                })
        
        return topic_to_docs
    
    def _create_topic_bm25_models(self, topic_to_docs, doc_lookup):
        """Create BM25 models for each topic."""
        topic_bm25_models = {}
        
        for topic_id, doc_list in topic_to_docs.items():
            # Get document texts for this topic
            docs_in_topic = []
            for doc_info in doc_list:
                doc_id = doc_info['doc_id']
                if doc_id in doc_lookup:
                    docs_in_topic.append(doc_lookup[doc_id]['text'])
            
            if len(docs_in_topic) > 1:  # Need at least 2 docs for BM25 comparison
                # Tokenize documents for BM25
                tokenized_corpus = [doc.lower().split() for doc in docs_in_topic]
                topic_bm25_models[topic_id] = {
                    'model': BM25Okapi(tokenized_corpus),
                    'doc_ids': [doc_info['doc_id'] for doc_info in doc_list if doc_info['doc_id'] in doc_lookup]
                }
        
        return topic_bm25_models
    
    def _calculate_distinctiveness_scores(self, candidate_phrases, current_doc_id, 
                                        doc_topics_list, topic_to_docs, 
                                        topic_bm25_models, doc_lookup):
        """Calculate distinctiveness scores using CCQGen methodology."""
        distinctive_phrases = []
        
        for phrase, relevance_score in candidate_phrases:
            # Calculate distinctiveness across all topics this document belongs to
            total_distinctiveness = 0.0
            total_weight = 0.0
            
            for topic_info in doc_topics_list:
                topic_id = topic_info['topic_id']
                topic_weight = topic_info['weight']
                
                if topic_id not in topic_bm25_models:
                    # If topic has insufficient docs for BM25, use relevance score only
                    distinctiveness = relevance_score
                else:
                    bm25_info = topic_bm25_models[topic_id]
                    bm25_model = bm25_info['model']
                    topic_doc_ids = bm25_info['doc_ids']
                    
                    # Find current document's position in the topic
                    try:
                        current_doc_idx = topic_doc_ids.index(current_doc_id)
                    except ValueError:
                        # Document not found in topic (shouldn't happen)
                        distinctiveness = relevance_score
                        continue
                    
                    # Calculate BM25 scores for the phrase across all docs in topic
                    tokenized_phrase = phrase.lower().split()
                    all_scores = bm25_model.get_scores(tokenized_phrase)
                    
                    # Current document's BM25 score
                    current_score = all_scores[current_doc_idx]
                    
                    # Sum of exp(BM25) for other documents in topic
                    other_scores = np.concatenate([
                        all_scores[:current_doc_idx], 
                        all_scores[current_doc_idx+1:]
                    ])
                    sum_exp_others = np.sum(np.exp(other_scores))
                    
                    # CCQGen distinctiveness formula
                    distinctiveness = np.exp(current_score) / (1 + sum_exp_others)
                
                total_distinctiveness += distinctiveness * topic_weight
                total_weight += topic_weight
            
            # Weight-averaged distinctiveness score
            final_distinctiveness = total_distinctiveness / total_weight if total_weight > 0 else relevance_score
            distinctive_phrases.append((phrase, final_distinctiveness))
        
        return distinctive_phrases

# Usage example with your data
def run_core_phrase_extraction():
    """Run core phrase extraction on your corpus and doc_topics data."""
    CORPUS_PATH = "/home/guest/r12922050/GitHub/d2qplus/data/CSFCube-1.1/corpus.jsonl"
    DOC_TOPICS_PATH = "/home/guest/r12922050/GitHub/d2qplus/augmented-data/CSFCube-1.1/topics/0609-pritamdeka_scibert-biobert-pos-keybert-mmr/doc_topics.jsonl"

    # Load your corpus
    with open(CORPUS_PATH, "r") as f:
        corpus = [json.loads(line) for line in f]
    
    with open(DOC_TOPICS_PATH, "r") as f:
        doc_topics = [json.loads(line) for line in f]

    print(f"Processing {len(corpus)} documents with {len(doc_topics)} topic assignments")
    
    # Initialize extractor
    extractor = CorePhraseExtractor(
        embedding_model="pritamdeka/S-Scibert-snli-multinli-stsb",
        device="cuda:1"
    )
    
    # Extract core phrases
    core_phrases = extractor.extract_core_phrases(
        corpus=corpus,
        doc_topics=doc_topics,
        top_n_candidates=50,      # More candidates for better selection
        selection_ratio=0.25,     # Select top 25% most distinctive
        min_phrases_per_doc=2,    # At least 2 phrases per document
        max_phrases_per_doc=6,    # At most 6 phrases per document
        keyphrase_ngram_range=(1, 3),
        use_mmr=True,            # Use MMR for diversity
        diversity=0.6            # Moderate diversity (0.0=no diversity, 1.0=max diversity)
    )
    
    # Save results
    output_path = "/home/guest/r12922050/GitHub/d2qplus/augmented-data/CSFCube-1.1/keywords/core_phrases_ccqgen.jsonl"
    with open(output_path, "w") as f:
        for doc_id, phrases in core_phrases.items():
            f.write(json.dumps({"doc_id": doc_id, "core_phrases": phrases}) + "\n")
    
    print(f"Core phrases saved to {output_path}")
    
    # Show some examples
    print("\nExample results:")
    for i, (doc_id, phrases) in enumerate(list(core_phrases.items())[:3]):
        print(f"\nDocument {doc_id}:")
        for phrase_info in phrases:
            print(f"  - '{phrase_info['phrase']}' (score: {phrase_info['distinctiveness_score']:.4f})")
    
    return core_phrases

# Run the extraction
core_phrases_results = run_core_phrase_extraction()

  from .autonotebook import tqdm as notebook_tqdm


Processing 4207 documents with 4207 topic assignments
Grouping documents by topics...
Creating BM25 models for each topic...
Grouping documents by topics...
Creating BM25 models for each topic...
Extracting core phrases...
Extracting core phrases...


Processing documents: 100%|██████████| 4207/4207 [08:46<00:00,  7.99it/s]



  from .autonotebook import tqdm as notebook_tqdm


Processing 4207 documents with 4207 topic assignments
Grouping documents by topics...
Creating BM25 models for each topic...
Grouping documents by topics...
Creating BM25 models for each topic...
Extracting core phrases...
Extracting core phrases...


Processing documents: 100%|██████████| 4207/4207 [08:46<00:00,  7.99it/s]



Core phrases saved to /home/guest/r12922050/GitHub/d2qplus/augmented-data/CSFCube-1.1/keywords/core_phrases_ccqgen.jsonl

Example results:

Document 7632414:
  - 'endpoint projection type' (score: 47828.8412)
  - 'parameterised mpi programs' (score: 44512.6552)
  - 'dimensions parameterised protocols' (score: 26746.3935)
  - 'pabble guarantee safety' (score: 18875.8686)
  - 'protocols type checking' (score: 9923.5282)
  - 'local protocols type' (score: 7771.4462)

Document 143814895:
  - 'stylistic variation archaeological' (score: 8.4367)
  - 'distinction functional stylistic' (score: 8.4367)
  - 'variation defined functional' (score: 7.3520)
  - 'conclude case neutral' (score: 5.9623)
  - 'simulation processes cultural' (score: 4.1694)
  - 'suggested neutral models' (score: 4.1694)

Document 62097085:
  - 'education projects realized' (score: 26862.4643)
  - 'higher education projects' (score: 24264.0732)
  - 'innovative ideas lack' (score: 4767.3662)
  - 'lack corresponding evaluati

## Core Phrase Extraction using CCQGen Methodology

This implementation extracts **distinctive core phrases** for each document based on the CCQGen paper methodology. The goal is to identify phrases that are:
1. **Relevant** to the document (high semantic similarity)
2. **Distinctive** within the document's topic group (not commonly used by similar documents)

### Key Components:

**1. Topic-based Document Grouping**
- Uses existing topic assignments (no need to re-run BERTopic)
- Groups documents by shared topics, weighted by topic strength
- Handles multi-topic documents through weighted averaging

**2. BM25-based Distinctiveness Scoring**
- For each candidate phrase, calculates BM25 relevance across topic group
- Applies CCQGen formula: `exp(BM25_current) / (1 + sum(exp(BM25_others)))`
- Higher scores = phrase is more distinctive to this document vs. topic peers

**3. Enhanced KeyBERT Parameters**
- `use_mmr=True` with `diversity=0.6` for non-redundant phrase selection
- `nr_candidates=2x` gives MMR more options to choose diverse phrases
- N-gram range (1,3) captures single words to 3-word phrases

### Output Format:
```json
{
  "doc_id": "123",
  "core_phrases": [
    {"phrase": "neural architecture search", "distinctiveness_score": 0.847},
    {"phrase": "automl", "distinctiveness_score": 0.723}
  ]
}
```

**Why this matters:** Standard keyword extraction might return "deep learning" for all ML papers, but this approach finds phrases like "federated learning" or "graph neural networks" that are specific to individual documents within the ML topic.

In [8]:
with open("/home/guest/r12922050/GitHub/d2qplus/augmented-data/CSFCube-1.1/keywords/core_phrases_ccqgen.jsonl", "r") as f:
    core_phrases = [json.loads(line) for line in f]

CORPUS_PATH = "/home/guest/r12922050/GitHub/d2qplus/data/CSFCube-1.1/corpus.jsonl"
with open(CORPUS_PATH, "r") as f:
    corpus = [json.loads(line) for line in f]

docid2text = {doc["_id"]: doc["text"] for doc in corpus}

# random pick 5 objects in core_phrases
import random
random.seed(42)  # For reproducibility
sampled_core_phrases = random.sample(core_phrases, 5)
for item in sampled_core_phrases:
    doc_id = item["doc_id"]
    phrases = item["core_phrases"]
    print(f"Document ID: {doc_id}")
    print(f"Text: {docid2text[doc_id]}")  # Print first 200 chars of text
    print("Core Phrases:")
    for phrase_info in phrases:
        print(f"  - '{phrase_info['phrase']}' (score: {phrase_info['distinctiveness_score']:.4f})")
    print("\n")

Document ID: 1177419
Text: We present an architecture and an on-line learning algorithm and apply it to the problem of part-of-speech tagging. The architecture presented, SNOW, is a network of linear separators in the feature space, utilizing the Winnow update algorithm.Multiplicative weight-update algorithms such as Winnow have been shown to have exceptionally good behavior when applied to very high dimensional problems, and especially when the target concepts depend on only a small subset of the features in the feature space. In this paper we describe an architecture that utilizes this mistake-driven algorithm for multi-class prediction-selecting the part of speech of a word. The experimental analysis presented here provides more evidence to that these algorithms are suitable for natural language problems.The algorithm used is an on-line algorithm: every example is used by the algorithm only once, and is then discarded. This has significance in terms of efficiency, as well as quick a

In [13]:
with open("/home/guest/r12922050/GitHub/d2qplus/augmented-data/CSFCube-1.1/keywords/core_phrases_ccqgen.jsonl", "r") as f:
    core_phrases = [json.loads(line) for line in f]

CORPUS_PATH = "/home/guest/r12922050/GitHub/d2qplus/data/CSFCube-1.1/corpus.jsonl"
with open(CORPUS_PATH, "r") as f:
    corpus = [json.loads(line) for line in f]

docid2phrases = {item["doc_id"]: item["core_phrases"] for item in core_phrases}

for doc in corpus:
    doc_id = doc.pop("_id")
    doc['id'] = doc_id
    phrases = docid2phrases.get(doc_id, [])
    predicted_queries = []
    for phrase_info in phrases:
        predicted_queries.append(phrase_info['phrase'])
    doc["predicted_queries"] = predicted_queries
# Save the updated corpus with predicted queries
OUTPUT_PATH = "/home/guest/r12922050/GitHub/d2qplus/gen/CSFCube-1.1/text_add_phrase_mining_keywords.jsonl"
with open(OUTPUT_PATH, "w") as f:
    for doc in corpus:
        f.write(json.dumps(doc) + "\n")
print(f"Saved updated corpus with predicted queries to {OUTPUT_PATH}")

Saved updated corpus with predicted queries to /home/guest/r12922050/GitHub/d2qplus/gen/CSFCube-1.1/text_add_phrase_mining_keywords.jsonl


In [10]:
core_phrases[0]['core_phrases']

[{'phrase': 'endpoint projection type',
  'distinctiveness_score': 47828.84120350538},
 {'phrase': 'parameterised mpi programs',
  'distinctiveness_score': 44512.655180433576},
 {'phrase': 'dimensions parameterised protocols',
  'distinctiveness_score': 26746.3934717538},
 {'phrase': 'pabble guarantee safety',
  'distinctiveness_score': 18875.86860069233},
 {'phrase': 'protocols type checking',
  'distinctiveness_score': 9923.528223115984},
 {'phrase': 'local protocols type',
  'distinctiveness_score': 7771.446169015394}]