In [8]:
import numpy as np
import re
from sklearn.datasets import fetch_20newsgroups
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
documents = newsgroups_train.data
target_names = newsgroups_train.target_names

def func(text):
    text = re.sub(r'\S*@\S*\s?', '', text) 
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\'', '', text) 
    
    lemmatizer = WordNetLemmatizer()
    t = text.lower().split()
    
    english_stopwords = set(stopwords.words('english'))
    delete = {'may','also','do', 'used','use','want','would', 'get', 'like', 'one', 'know', 'dont', 'could', 'think', 'make'}
    all = english_stopwords.union(delete)
    
    t = [
        lemmatizer.lemmatize(word) 
        for word in t 
        if word.isalpha() and word not in all and len(word) > 2
    ]
    return t

new_doc = [func(doc) for doc in documents]

dictionary = Dictionary(new_doc)

dictionary.filter_extremes(no_below=5, no_above=0.85)

corpus = [dictionary.doc2bow(doc) for doc in new_doc]

n1 = 20
n2 = 100
n3 = 10

lda_model = LdaModel(
    corpus=corpus,
    num_topics=n1,
    id2word=dictionary,
    alpha=1.0,
    eta=1.0,
    iterations=n2,
    random_state=42,
    eval_every=None
)

print("Топ-10 слов")

for k in range(n1):
    terms = lda_model.get_topic_terms(k, topn=n3)
    words = [dictionary[term_id] for term_id, weight in terms]
    print(f"Тема #{k+1}: {' '.join(words)}")

print("\n Исходные темы")
newsgroups_train.target_names

Топ-10 слов
Тема #1: people see anyone good much take never thats work thing
Тема #2: team game player hockey play last year league first new
Тема #3: god people say jesus believe christian many even see thing
Тема #4: people government state right law gun president armenian new going
Тема #5: car water much good problem ive new anyone time many
Тема #6: thing please good anyone need catholic take people ive much
Тема #7: entry key chip bit rule number must line input build
Тема #8: van det pit bos chi tor que buf stl nyi
Тема #9: space launch satellite nasa earth first mission system lunar orbit
Тема #10: file window program key available using system image data information
Тема #11: wire ground good need circuit much problem many time usually
Тема #12: anyone problem good thanks please see two using take need
Тема #13: do anyone san two people much time good year ive
Тема #14: good new anyone excellent annual comic thanks cover please rider
Тема #15: medical health child number cente

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']