In [31]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Load dataset (remove metadata for cleaner data)
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
documents = newsgroups.data



In [32]:
# Preprocess text (TF-IDF vectorization)
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)

# For count-based methods (LDA)
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
count = count_vectorizer.fit_transform(documents)

1. Basic Techniques

In [33]:
from sklearn.decomposition import LatentDirichletAllocation

# Train LDA
lda = LatentDirichletAllocation(
    n_components=10,  # Number of topics
    learning_method='online',
    random_state=42
)
lda.fit(count)

# Display topics
def print_topics(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

print_topics(lda, count_vectorizer.get_feature_names_out())

Topic 0:
god people say believe does jesus think don know evidence
Topic 1:
like just don know use good time ve does way
Topic 2:
00 edu com 1993 space ca 04 university cs 10
Topic 3:
game team year games play season hockey players league win
Topic 4:
file edu program image available files ftp use data window
Topic 5:
10 25 12 20 15 11 14 16 13 17
Topic 6:
armenian israel armenians people war jews turkish said israeli killed
Topic 7:
ax max b8f g9v a86 pl 145 1d9 0t 1t
Topic 8:
dos drive windows card disk scsi mac pc ms thanks
Topic 9:
people think government don right said time going did law


2. Non- negative matrix Factorization

In [34]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=10, random_state=42)
nmf.fit(tfidf)

print_topics(nmf, tfidf_vectorizer.get_feature_names_out())

Topic 0:
don just like think know good ve time really want
Topic 1:
windows dos file program files window use using run running
Topic 2:
god jesus bible believe christ faith christian christians sin church
Topic 3:
drive scsi ide disk card controller hard drives bus floppy
Topic 4:
key chip encryption clipper keys government escrow use algorithm phone
Topic 5:
thanks does know mail advance hi info looking information help
Topic 6:
00 new 10 sale car price 50 20 shipping offer
Topic 7:
game games team year hockey baseball season players play espn
Topic 8:
edu geb dsl cadre n3jxp chastity pitt skepticism intellect shameful
Topic 9:
people government israel armenian jews armenians gun state did children


3.1 Hierarchical Dirichlet Process (HDP)


In [35]:
from gensim.models import HdpModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess

# Preprocess for Gensim
texts = [simple_preprocess(doc, deacc=True) for doc in documents]
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=2, no_above=0.95)  # Filter extremes
corpus = [dictionary.doc2bow(text) for text in texts]

# Train HDP
hdp = HdpModel(corpus=corpus, id2word=dictionary)

# Show topics
hdp.print_topics()



[(0,
  '0.054*the + 0.027*to + 0.024*of + 0.022*and + 0.021*ax + 0.016*in + 0.015*is + 0.014*that + 0.012*it + 0.010*for'),
 (1,
  '0.049*the + 0.026*to + 0.019*of + 0.018*and + 0.014*is + 0.014*it + 0.014*in + 0.013*that + 0.011*you + 0.010*for'),
 (2,
  '0.043*the + 0.021*to + 0.018*of + 0.016*and + 0.013*in + 0.013*is + 0.012*that + 0.011*it + 0.008*for + 0.007*this'),
 (3,
  '0.032*the + 0.014*to + 0.013*of + 0.012*and + 0.011*in + 0.008*is + 0.007*it + 0.006*at + 0.006*for + 0.006*that'),
 (4,
  '0.029*the + 0.022*of + 0.013*and + 0.011*to + 0.010*in + 0.006*by + 0.005*is + 0.005*that + 0.004*istanbul + 0.004*st'),
 (5,
  '0.033*the + 0.017*to + 0.014*of + 0.013*and + 0.011*in + 0.009*is + 0.009*that + 0.008*it + 0.006*you + 0.006*for'),
 (6,
  '0.022*the + 0.013*of + 0.010*to + 0.009*and + 0.008*is + 0.006*in + 0.006*that + 0.005*for + 0.004*it + 0.004*you'),
 (7,
  '0.020*the + 0.012*of + 0.009*to + 0.007*is + 0.006*and + 0.005*in + 0.004*that + 0.004*you + 0.004*it + 0.004*for'

Tried using GuidedLDA, it doesn't work good.

Evaluation

In [36]:
# ===================================================================
# Data Preparation for Gensim
# ===================================================================
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
from gensim.utils import simple_preprocess

# Preprocess texts for Gensim
texts = [simple_preprocess(doc, deacc=True) for doc in documents]

# Create dictionary and corpus
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=2, no_above=0.95)  # Match sklearn's min_df/max_df
corpus = [dictionary.doc2bow(text) for text in texts]

# ===================================================================
# Corrected LDA Implementation with Gensim
# ===================================================================


In [37]:
# Train Gensim LDA
lda_gensim = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    random_state=42,
    passes=5,
    alpha='auto'
)

# Display topics from Gensim model
def print_gensim_topics(model, num_words=10):
    for idx, topic in model.print_topics(-1, num_words):
        print(f"Topic {idx}: {topic}\n")

print_gensim_topics(lda_gensim)

# ===================================================================
# Corrected Coherence Calculation
# ===================================================================


Topic 0: 0.091*"the" + 0.053*"of" + 0.032*"in" + 0.032*"and" + 0.018*"to" + 0.012*"by" + 0.011*"were" + 0.008*"from" + 0.007*"was" + 0.007*"their"

Topic 1: 0.654*"ax" + 0.048*"max" + 0.007*"pl" + 0.006*"di" + 0.006*"tm" + 0.005*"ei" + 0.005*"wm" + 0.005*"bhj" + 0.005*"giz" + 0.004*"ey"

Topic 2: 0.045*"the" + 0.033*"to" + 0.020*"it" + 0.019*"and" + 0.017*"is" + 0.016*"you" + 0.016*"of" + 0.016*"for" + 0.014*"in" + 0.011*"on"

Topic 3: 0.063*"the" + 0.034*"to" + 0.026*"that" + 0.025*"of" + 0.025*"and" + 0.018*"in" + 0.017*"it" + 0.016*"is" + 0.015*"you" + 0.010*"they"

Topic 4: 0.052*"the" + 0.038*"of" + 0.037*"and" + 0.021*"in" + 0.019*"for" + 0.014*"to" + 0.010*"on" + 0.008*"space" + 0.007*"by" + 0.007*"from"

Topic 5: 0.049*"the" + 0.022*"to" + 0.022*"and" + 0.016*"for" + 0.015*"with" + 0.015*"is" + 0.014*"it" + 0.011*"have" + 0.010*"of" + 0.009*"on"

Topic 6: 0.043*"the" + 0.024*"to" + 0.023*"and" + 0.018*"of" + 0.017*"is" + 0.014*"in" + 0.014*"for" + 0.009*"it" + 0.009*"you" + 0.0

In [38]:
coherence_model = CoherenceModel(
    model=lda_gensim,
    texts=texts,  # Use the preprocessed texts
    dictionary=dictionary,
    coherence='c_v'
)

coherence = coherence_model.get_coherence()
print(f"Coherence Score: {coherence:.4f}")

# ===================================================================
# Updated sklearn LDA Section (Alternative Approach)
# ===================================================================
# If you want to keep sklearn LDA for comparison


Coherence Score: 0.5022


In [39]:
from sklearn.decomposition import LatentDirichletAllocation

# sklearn LDA requires the count matrix from CountVectorizer
sklearn_lda = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    random_state=42
)
sklearn_lda.fit(count)  # 'count' from original CountVectorizer

# Different visualization for sklearn LDA
def print_sklearn_topics(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

print_sklearn_topics(sklearn_lda, count_vectorizer.get_feature_names_out())

# ===================================================================
# Affected Code Fixes
# ===================================================================
# 1. Hierarchical Dirichlet Process (HDP) now uses same preprocessed corpus
from gensim.models import HdpModel

hdp = HdpModel(corpus=corpus, id2word=dictionary)
hdp.print_topics()





Topic 0:
god people say believe does jesus think don know evidence
Topic 1:
like just don know use good time ve does way
Topic 2:
00 edu com 1993 space ca 04 university cs 10
Topic 3:
game team year games play season hockey players league win
Topic 4:
file edu program image available files ftp use data window
Topic 5:
10 25 12 20 15 11 14 16 13 17
Topic 6:
armenian israel armenians people war jews turkish said israeli killed
Topic 7:
ax max b8f g9v a86 pl 145 1d9 0t 1t
Topic 8:
dos drive windows card disk scsi mac pc ms thanks
Topic 9:
people think government don right said time going did law




[(0,
  '0.054*the + 0.027*to + 0.024*of + 0.022*and + 0.016*is + 0.016*in + 0.013*that + 0.012*it + 0.011*for + 0.009*you'),
 (1,
  '0.059*the + 0.029*to + 0.027*of + 0.024*and + 0.017*that + 0.017*in + 0.015*is + 0.013*it + 0.011*you + 0.009*for'),
 (2,
  '0.419*ax + 0.030*max + 0.007*pl + 0.004*ei + 0.004*di + 0.004*tm + 0.003*wm + 0.003*bhj + 0.003*giz + 0.003*cx'),
 (3,
  '0.044*the + 0.021*to + 0.021*of + 0.016*and + 0.013*in + 0.012*is + 0.011*that + 0.009*it + 0.009*for + 0.007*you'),
 (4,
  '0.030*the + 0.013*to + 0.012*of + 0.011*and + 0.010*is + 0.008*in + 0.008*it + 0.006*for + 0.006*that + 0.006*dos'),
 (5,
  '0.031*the + 0.016*to + 0.013*of + 0.011*and + 0.010*is + 0.010*in + 0.008*that + 0.008*it + 0.006*you + 0.005*this'),
 (6,
  '0.025*the + 0.012*to + 0.011*and + 0.011*of + 0.007*for + 0.007*in + 0.007*is + 0.006*that + 0.006*it + 0.005*you'),
 (7,
  '0.013*pit + 0.011*det + 0.009*bos + 0.009*tor + 0.009*chi + 0.009*van + 0.007*la + 0.007*que + 0.007*nyr + 0.006*buf'),

In [42]:
# 2. pyLDAvis visualization for sklearn LDA remains valid
import pyLDAvis
import pyLDAvis.lda_model

# pyLDAvis.lda_model.prepare

pyLDAvis.enable_notebook()
vis = pyLDAvis.lda_model.prepare(sklearn_lda, count, count_vectorizer)
vis

In [None]:
!pip install pyLDAvis.sklearn
# error

In [None]:
!pip install gensim pyLDAvis guidedlda bertopic top2vec