In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Load three categories of the 20 Newsgroups corpus.
# Headers, signature blocks and quoted replies are stripped so that the topic
# model is driven by what each post actually says rather than by mail
# metadata (addresses, "Subject:"/"Organization:" lines) or repeated quotes.
categories = ['alt.atheism', 'comp.graphics', 'rec.motorcycles']
dataset = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=2017,  # fixed seed keeps the shuffled order reproducible
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# Report how many documents and how many categories were loaded
n_documents = len(dataset.data)
n_categories = len(dataset.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)

In [None]:
# Vectorize the text data as raw term counts.
# LDA is a generative model over word counts, so it is fitted on a bag-of-words
# count matrix rather than on TF-IDF weights (which are re-scaled, non-count
# values and are better suited to models such as NMF).
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(dataset.data)
# X.shape is (number of documents, vocabulary size)
print("n_samples: %d, n_features: %d" % X.shape)

In [None]:
# Configure the LDA estimator (fitting happens in a later cell)
total_topics = 3
lda_settings = dict(
    n_components=total_topics,
    max_iter=100,
    learning_method='online',
    learning_offset=50.,
    random_state=2017,
)
lda = LatentDirichletAllocation(**lda_settings)

In [None]:
# Fit the configured LDA model on the vectorized documents in X.
# The call is left as a bare expression so the notebook shows whatever
# fit() returns as this cell's output.
lda.fit(X)

In [None]:
# Vocabulary terms learned by the vectorizer, held as a NumPy array so they
# can be looked up by integer position later on
feature_names = np.asarray(
    vectorizer.get_feature_names_out()
)

In [None]:
# For every fitted topic, print its highest-weighted vocabulary terms
n_top_terms = 20
for topic_idx, topic in enumerate(lda.components_):
    # Ascending sort, reversed, then truncated -> indices in descending weight order
    top_term_ids = np.argsort(topic)[::-1][:n_top_terms]
    top_terms = feature_names[top_term_ids]
    print(f"Topic #{topic_idx}:")
    print(" ".join(top_terms))