In [1]:
import json
CORPUS_PATH = "/home/guest/r12922050/GitHub/d2qplus/data/nfcorpus/corpus.jsonl"
corpus_text = {}
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    corpus_text = {json.loads(line).get("_id"): json.loads(line).get("text", "") for line in f}

print(f"Loaded {len(corpus_text):,} documents from '{CORPUS_PATH}'")

Loaded 3,633 documents from '/home/guest/r12922050/GitHub/d2qplus/data/nfcorpus/corpus.jsonl'


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# 1) Suppose you have `corpus_texts = [doc_0_str, doc_1_str, …]`

corpus_texts = list(corpus_text.values())

# 2) Build a document‐term matrix
vectorizer = CountVectorizer(stop_words="english", max_df=0.9, min_df=5)
dtm = vectorizer.fit_transform(corpus_texts)  # (n_docs, n_terms)

# 3) Fit LDA with α = 0.8 (relatively “wide” Dirichlet prior so docs show ≥ 3 topics)
n_topics = 100
lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method="batch",
    doc_topic_prior=0.8,     # try 0.5, 0.8, 1.0 to see how it affects sparsity
    topic_word_prior=0.01,
    max_iter=10,
    random_state=42,
)
lda.fit(dtm)

# 4) Get doc–topic distributions ⟶ each row sums to 1
doc_topic_dists = lda.transform(dtm)            # shape: (n_docs, n_topics)

# 5) For each document, pick **exactly 5** topics with highest probability
top5_per_doc = np.argsort(doc_topic_dists, axis=1)[:, ::-1][:, :5]


In [6]:
feature_names = vectorizer.get_feature_names_out()
topic_word_list = []
n_top_words   = 10
for topic_idx, word_counts in enumerate(lda.components_):
    top_word_ids = word_counts.argsort()[::-1][:n_top_words]
    top_words    = [feature_names[i] for i in top_word_ids]
    topic_word_list.append(top_words)
    print(f"Topic {topic_idx:02d}: {', '.join(top_words)}")

Topic 00: workers, cases, exposure, exposed, air, occupational, birth, head, prevalence, poultry
Topic 01: sites, lead, scientific, public, samples, site, absence, regulatory, pops, testing
Topic 02: cells, cell, induced, expression, cancer, apoptosis, activity, human, growth, tumor
Topic 03: meat, beef, pork, human, ige, meats, sodium, emerging, test, tests
Topic 04: fish, consumption, apple, women, levels, size, dm, dried, choices, pufas
Topic 05: compounds, phytochemicals, flavonoids, interactions, heat, flavonoid, quercetin, grape, garlic, exogenous
Topic 06: hormone, hormones, testosterone, prostate, estradiol, yr, hormonal, sex, respiratory, control
Topic 07: oil, acid, juice, absorption, dha, orange, omega, epa, fatty, bioavailability
Topic 08: beta, essential, methionine, strategy, production, human, origin, concentrations, tract, arginine
Topic 09: vegetarian, vegetarians, diet, diets, dietary, patterns, bmi, non, lower, vegan
Topic 10: concentration, pesticide, collected, sam

In [None]:
for doc, topic_ids in zip(cor)

array([[44, 38, 28, 18, 16],
       [44, 38, 11, 41, 18],
       [ 8,  7, 14, 38, 23],
       ...,
       [ 5,  1, 11, 43, 22],
       [11, 18, 41, 40, 24],
       [43, 46, 14, 27, 18]])