In [2]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [86]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [123]:
newsgroups_train.target[:100]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4,  8, 19,  4, 14,  6,  0,  1,
        7, 12,  5,  0, 10,  6,  2,  4,  1, 12,  9, 15,  7,  6, 13, 12, 17,
       18, 10,  8, 11,  8, 16,  9,  4,  3,  9,  9,  4,  4,  8, 12, 14,  5,
       15,  2, 13, 17, 11,  7, 10,  2, 14, 12,  5,  4,  6,  7,  0, 11, 16,
        0,  6, 17,  7, 12,  7,  3, 12, 11,  7,  2,  2,  0, 16,  1,  2,  7,
        3,  2,  1, 10, 12, 12, 17, 12,  2,  8,  8, 18,  5,  0,  1])

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True, min_df=10, max_df=.04)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(binary=True, max_df=0.04, min_df=10,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [101]:
text = 'I was wondering if anyone out there could enlighten me on this car I saw'
x = vectorizer.transform([text])

In [102]:
type(x)

scipy.sparse.csr.csr_matrix

In [103]:
x.data

array([1, 1, 1, 1], dtype=int64)

In [104]:
x.nonzero()

(array([0, 0, 0, 0]), array([ 1905,  3576,  8221, 10138]))

In [105]:
x.toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [106]:
vectorizer.inverse_transform(x)

[array(['car', 'enlighten', 'saw', 'wondering'], dtype='<U79')]

In [107]:
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_train.shape

(11314, 10299)

Задача: запустить модель LDA и Gibbs Sampling с числов тегов 20. Вывести топ-10 слов по каждому тегу. Соотнести полученные теги с тегами из датасета, сделать выводы.

In [109]:
print(X_train)

  (0, 10138)	1
  (0, 3576)	1
  (0, 1905)	1
  (0, 8221)	1
  (0, 3288)	1
  (0, 8787)	1
  (0, 5726)	1
  (0, 5506)	1
  (0, 542)	1
  (0, 3399)	1
  (0, 593)	1
  (0, 3289)	1
  (0, 8633)	1
  (0, 824)	1
  (0, 1791)	1
  (0, 8381)	1
  (0, 7936)	1
  (0, 1628)	1
  (0, 6155)	1
  (0, 3560)	1
  (0, 8750)	1
  (0, 7337)	1
  (0, 4668)	1
  (0, 4995)	1
  (1, 3856)	1
  :	:
  (11313, 3560)	1
  (11313, 4461)	1
  (11313, 9590)	1
  (11313, 1858)	1
  (11313, 4708)	1
  (11313, 8405)	1
  (11313, 8393)	1
  (11313, 10065)	1
  (11313, 9464)	1
  (11313, 1610)	1
  (11313, 8530)	1
  (11313, 8907)	1
  (11313, 6844)	1
  (11313, 1387)	1
  (11313, 6118)	1
  (11313, 7091)	1
  (11313, 9192)	1
  (11313, 8797)	1
  (11313, 6111)	1
  (11313, 1963)	1
  (11313, 7062)	1
  (11313, 8010)	1
  (11313, 9414)	1
  (11313, 5647)	1
  (11313, 5392)	1


In [110]:
X_train.data

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [117]:
from tqdm import tqdm_notebook as tqdm
def LDA(X, niter = 100, alpha = np.ones(20)):
    K = 20
    N = X.shape[1]
    Betta = np.ones(N)
    Docs = X.nonzero()[0]#номера документов для слов из словаря 
    Words = X.nonzero()[1]#место в словаре
    n_KW =  np.zeros((K, N))#число вхождений слова в тег из словаря
    n_DK = np.zeros((len(Docs), K))#число слов тега K в документе D
    nK = np.zeros(K)
    tegs = np.random.choice(K, len(Docs))
    for Doc, Word, teg in zip(Docs, Words, tegs):#одновременно проходимся по трем массивам
        n_KW[teg, Word] += 1
        n_DK[Doc, teg] += 1
        nK[teg] += 1
    for i in tqdm(range(niter)):
        for j in range(len(Docs)):
            teg = tegs[j]
            Doc = Docs[j]
            Word = Words[j]
            n_KW[teg, Word] -= 1
            n_DK[Doc, teg] -= 1
            nK[teg] -= 1
            
            P = (n_DK[Doc, :] + alpha)*(n_KW[:, Word] + Betta[Word])/(nK + Betta.sum())
            P = np.array(P)
            P /= P.sum()
            tegs[j] = np.random.choice(np.arange(K), p = P)
            teg = tegs[j]
            n_KW[teg, Word] += 1
            n_DK[Doc, teg] += 1
            nK[teg] += 1
            
    T = [np.argsort(n_KW[k, :]) for k in range(K)]
    T = np.array(T)
    for k in range(K):
        a = np.zeros(N)
        for j in T[k, -11: -1]:
            a[j] = 1
        print('Tag {}: {}'.format(k, '\\t'.join(str(x) for x in vectorizer.inverse_transform(a))))
LDA(X_train)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(niter)):


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Tag 0: ['algorithm' 'chip' 'clipper' 'encryption' 'keys' 'phone' 'public'
 'secure' 'security' 'soon']
Tag 1: ['1993' '93' 'ca' 'computer' 'contact' 'date' 'ftp' 'net' 'posted'
 'university']
Tag 2: ['btw' 'couple' 'guy' 'mark' 'mentioned' 'mike' 'net' 'posting' 'sorry'
 'thank']
Tag 3: ['agree' 'argument' 'certainly' 'discussion' 'evidence' 'making' 'saying'
 'sense' 'simply' 'understand']
Tag 4: ['00' '100' 'asking' 'condition' 'includes' 'offer' 'original' 'price'
 'sell' 'shipping']
Tag 5: ['27' '34' 'ah' 'hi' 'hp' 'id' 'ma' 'mi' 'mr' 'ms']
Tag 6: ['center' 'cost' 'development' 'earth' 'launch' 'low' 'nasa' 'research'
 'systems' 'technology']
Tag 7: ['bible' 'christ' 'christian' 'christians' 'church' 'faith' 'love' 'man'
 'religion' 'says']
Tag 8: ['american' 'country' 'crime' 'federal' 'guns' 'law' 'laws' 'national'
 'rights' 'states']
Tag 9: ['address' 'deleted' 'guess' 'haven' 'hey' 'news' 'sorry' 'stuff' 'wonder'
 'yeah']
Tag 10: ['disease' 'doctor' 'food' 'interesting' 'medic

В итоге можно заметить, что слова довольно хорошо разбились по своим темам, которые легко выделить.
Например, слова из Tag 0 относятся к криптографии, из Tag 7 - к религии (христианству), из Tag 12 - к гонкам, из Tag 19 - к составляющим компьютера. 