In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem import WordNetLemmatizer 
from tqdm import tqdm



In [2]:
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
nameOfTag = newsgroups_train.target_names

In [3]:
lemmatizer = WordNetLemmatizer()

for text in newsgroups_train.data:
    temp_text = nltk.word_tokenize(text)
    text = " ".join([lemmatizer.lemmatize(w) for w in temp_text])


In [4]:
n_features = 4500
n_components = 20
n_top_words = 10


vectorizer = CountVectorizer(
                    lowercase=True, stop_words=_stop_words.ENGLISH_STOP_WORDS,
                    analyzer='word', binary=True,
                    max_df=0.95, min_df=2,
                    max_features=n_features
)
# одновременно создали словарь и преобразовали строку в вектор
X_train = vectorizer.fit_transform(newsgroups_train.data).toarray()

In [17]:
len(vectorizer.vocabulary_)

4500

In [5]:
class customLDA:
    def __init__(self, n_components=10, alpha=None, beta=None, max_iter=10):
        self._n_components = n_components
        self._max_iter = max_iter
        self._alpha = alpha
        self._beta = beta
        self._n_k = None
        self._n_k_w = None
        self._n_d_k = None

        self._fit_Is = False

    def fit(self, main_matrix):
        self._n_k = np.zeros(self._n_components)                              # колво слов в теге k по всем документам
        self._n_k_w = np.zeros((self._n_components, main_matrix.shape[1]))          # колво раз сколько слово w было в теге k
        self._n_d_k = np.zeros((main_matrix.shape[0], self._n_components))        # количество вхождений тега k в документе d

        if self._alpha == None:
            self._alpha = np.ones(self._n_components)
        if self._beta == None:
            self._beta = np.ones(main_matrix.shape[1])

        documn_, word_ =  main_matrix.nonzero()
        z = np.random.choice(self._n_components, len(documn_))

        for i,j,k in zip(documn_, word_, z):
            self._n_k[k] += 1
            self._n_k_w[k, j] += 1
            self._n_d_k[i, k] += 1
        
        for i in tqdm(range(self._max_iter)):
            for j in range(len(documn_)):
                current_word = word_[j]
                current_dc = documn_[j]
                current_tag = z[j]
                self._n_d_k[current_dc, current_tag] -= 1
                self._n_k_w[current_tag, current_word] -= 1
                self._n_k[current_tag] -= 1
                p = (self._n_d_k[current_dc, :] + self._alpha) * (self._n_k_w[:, current_word] + self._beta[current_word]) / (self._n_k + self._beta.sum())
                z[j] = np.random.choice(self._n_components, p = p / p.sum())
                self._n_d_k[current_dc, z[j]] += 1
                self._n_k_w[z[j], current_word] += 1
                self._n_k[z[j]] += 1
        
        self._fit_Is = True
        return self
    
    def get_table_tags_and_word(self):
        if self._fit_Is:
            return self._n_k_w

        
        

In [19]:
lda = customLDA(n_components, max_iter=50)
lda.fit(X_train)

100%|██████████| 50/50 [25:19<00:00, 30.38s/it]


<__main__.customLDA at 0x7ff167b12b20>

In [24]:
result = np.argsort(lda.get_table_tags_and_word(), axis=1)[:, -n_top_words:]

for i in range(n_components):
    matrix = np.zeros((1, X_train.shape[1]))
    for j in result[i]:
        matrix[0, j] = 1
    print('Tag {} \t{}'.format(i + 1, '\t'.join(vectorizer.inverse_transform(matrix)[0])))

Tag 1 	case	government	gun	guns	law	laws	people	right	rights	state
Tag 2 	bike	buy	car	cars	condition	good	new	power	sell	used
Tag 3 	don	just	know	let	like	people	right	think	time	want
Tag 4 	code	file	files	ftp	program	use	using	version	window	windows
Tag 5 	bad	better	don	good	just	like	make	right	think	ve
Tag 6 	does	don	just	like	problem	problems	try	use	using	work
Tag 7 	country	government	history	israel	israeli	jews	killed	people	war	world
Tag 8 	1993	ca	com	contact	date	edu	information	internet	send	university
Tag 9 	card	computer	disk	drive	memory	pc	software	use	video	windows
Tag 10 	believe	did	doesn	going	isn	just	let	people	say	things
Tag 11 	cost	data	high	large	long	low	new	program	project	space
Tag 12 	ago	good	just	like	new	sure	think	time	ve	years
Tag 13 	came	day	days	did	left	night	saw	second	took	went
Tag 14 	better	doing	don	idea	little	probably	reason	things	think	way
Tag 15 	believe	bible	christian	does	god	jesus	life	people	say	true
Tag 16 	cause	course	don	edu

Сопопставим полученные топ слова для каждого тега:  
1. 'talk.politics.mideast'
4. 'comp.os.ms-windows.misc'  
7. 'talk.politics.guns'  
9. 'comp.windows.x'  
15. 'soc.religion.christian'  