In [3]:
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
documents = newsgroups_train.data
labels = newsgroups_train.target
label_names = newsgroups_train.target_names

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(doc):
    words = [lemmatizer.lemmatize(word.lower()) for word in doc.split() if word.isalpha() and word.lower() not in stop_words]
    return ' '.join(words)

preprocessed_documents = [preprocess_text(doc) for doc in documents]

vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)
X = vectorizer.fit_transform(preprocessed_documents)
vocab = vectorizer.get_feature_names_out()

n_topics = 20
alpha = 1.0
beta = 1.0
n_iterations = 100

n_docs, n_words = X.shape
word_topic_counts = np.zeros((n_words, n_topics))
doc_topic_counts = np.zeros((n_docs, n_topics))
topic_totals = np.zeros(n_topics)
doc_topic_assignments = []

for d in range(n_docs):
    doc = X[d].indices
    topics = []
    for w in doc:
        topic = random.randint(0, n_topics - 1)
        word_topic_counts[w, topic] += 1
        doc_topic_counts[d, topic] += 1
        topic_totals[topic] += 1
        topics.append(topic)
    doc_topic_assignments.append(topics)
for it in range(n_iterations):
    if it % 10 == 0:
        print(f"Done: {it}/{n_iterations} iterations")
    for d in range(n_docs):
        doc = X[d].indices
        for i, w in enumerate(doc):
            current_topic = doc_topic_assignments[d][i]
            word_topic_counts[w, current_topic] -= 1
            doc_topic_counts[d, current_topic] -= 1
            topic_totals[current_topic] -= 1

            topic_probs = (word_topic_counts[w] + beta) * (doc_topic_counts[d] + alpha) / (topic_totals + beta * n_words)
            topic_probs /= topic_probs.sum()

            new_topic = np.random.choice(np.arange(n_topics), p=topic_probs)
            word_topic_counts[w, new_topic] += 1
            doc_topic_counts[d, new_topic] += 1
            topic_totals[new_topic] += 1
            doc_topic_assignments[d][i] = new_topic

def get_top_words(word_topic_counts, vocab, n_top_words=10):
    topics = []
    for topic_idx in range(n_topics):
        top_words_idx = word_topic_counts[:, topic_idx].argsort()[::-1][:n_top_words]
        topics.append([vocab[i] for i in top_words_idx])
    return topics

topics = get_top_words(word_topic_counts, vocab)

for i, topic in enumerate(topics):
    print(f"Topic #{i + 1}: {', '.join(topic)}")

topic_assignments = np.argmax(doc_topic_counts, axis=1)
topic_to_labels = {i: [] for i in range(n_topics)}

for doc_idx, topic in enumerate(topic_assignments):
    topic_to_labels[topic].append(labels[doc_idx])

used_labels = set()
final_topic_labels = {}
for topic_idx, label_list in topic_to_labels.items():
    label_counts = np.bincount(label_list, minlength=len(label_names))
    if label_counts.sum() > 0:
        for label_idx in label_counts.argsort()[::-1]:
            if label_idx not in used_labels:
                used_labels.add(label_idx)
                final_topic_labels[topic_idx] = label_idx
                break

unused_labels = set(range(len(label_names))) - used_labels
for topic_idx in range(n_topics):
    if topic_idx not in final_topic_labels and unused_labels:
        final_topic_labels[topic_idx] = unused_labels.pop()

for topic_idx, label_idx in final_topic_labels.items():
    print(f"Topic #{topic_idx + 1}: {label_names[label_idx]} ({len(topic_to_labels[topic_idx])} documents)")
    print(f"Top words: {', '.join(topics[topic_idx])}\n")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/trxxlxrd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/trxxlxrd/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Done: 0/100 iterations
Done: 10/100 iterations
Done: 20/100 iterations
Done: 30/100 iterations
Done: 40/100 iterations
Done: 50/100 iterations
Done: 60/100 iterations
Done: 70/100 iterations
Done: 80/100 iterations
Done: 90/100 iterations
Topic #1: since, cause, different, difference, point, less, many, actually, likely, find
Topic #2: one, would, also, like, go, think, good, even, thing, take
Topic #3: time, back, went, said, took, day, got, told, put, next
Topic #4: would, one, make, take, go, like, think, seems, people, know
Topic #5: space, university, research, new, april, information, science, center, year, office
Topic #6: state, law, government, right, public, american, gun, people, federal, crime
Topic #7: would, one, think, like, see, really, people, good, make, get
Topic #8: car, good, new, get, much, price, buy, pay, cost, need
Topic #9: get, like, think, one, would, going, know, pretty, want, even
Topic #10: one, get, like, would, know, look, might, could, bit, got
Topic #