<div style="font-size:18pt; padding-top:20px; text-align:center">СЕМИНАР 18. <b>Кластеризация текстовых документов и </b> <span style="font-weight:bold; color:green">Sklearn/NLTK/Gensim</span></div><hr>
<div style="text-align:right;">Папулин С.Ю. <span style="font-style: italic;font-weight: bold;">(papulin_hse@mail.ru)</span></div>

<a name="0"></a>
<div><span style="font-size:14pt; font-weight:bold">Содержание</span>
    <ol>
        <li><a href="#1">Загрузка исходных данных</a></li>
        <li><a href="#2">Лемматизация, стемминг и POS</a></li>
        <li><a href="#3">Latent Semantic Analisys (LSA)</a>
        <li><a href="#4">Визуализация с использованием WordCloud</a>
        <li><a href="#5">Источники</a>
        </li>
    </ol>
</div>

In [1]:
import numpy as np

In [2]:
from gensim import corpora, matutils, models
from sklearn.datasets import fetch_20newsgroups



In [3]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

<a name="1"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">1. Загрузка исходных данных</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

<p><b>Исходные данные</b></p>

In [5]:
topics = ["sci.space", "soc.religion.christian", "rec.sport.baseball", "comp.sys.mac.hardware"]

In [6]:
newsgroups_train = fetch_20newsgroups(subset="train", data_home="data", #в папке data данные будут; тк fetch  с сервера, так мы их сохраняем
                                      categories=topics, remove=["headers", "footers", "quotes"])

In [7]:
newsgroups_test = fetch_20newsgroups(subset="test", data_home="data", 
                                      categories=topics, remove=["headers", "footers", "quotes"])

In [8]:
newsgroups_train.target_names

['comp.sys.mac.hardware',
 'rec.sport.baseball',
 'sci.space',
 'soc.religion.christian']

In [9]:
documents = newsgroups_train.data
topics = newsgroups_train.target
documents[0]

'\nI have no doubt that God hears everybody\'s prayers.\nHowever, He does things His way, i.e. things will happen\nonly if it is His will.\n\nNow if the question really is "Does God grant everybody\'s wishes ?"\nthen you\'ll get a brutal shot of reality similar to when you didn\'t\nget that toy you wanted for Christmas. You just cannot expect\nto get everything you want in this world.\n\n-- '

In [10]:
topic_indx = dict(list(zip(newsgroups_train.target_names, range(len(newsgroups_train.target_names)))))

In [11]:
list(zip(newsgroups_train.target_names, np.bincount(topics)))

[('comp.sys.mac.hardware', 578),
 ('rec.sport.baseball', 597),
 ('sci.space', 593),
 ('soc.religion.christian', 599)]

<a name="2"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">2. Лемматизация, стемминг и POS</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

In [12]:
import nltk

In [88]:
nltk.download(download_dir = "C/:Documents")
#много моделей, в том числе обученные

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

<p><b>Разделение текста на токены</b></p>

In [68]:
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem.porter import PorterStemmer

In [69]:
tokenizer = RegexpTokenizer(r"\w+") #регулярные выражения

In [70]:
doc_tokens = tokenizer.tokenize(documents[0])
doc_tokens[:20]

['I',
 'have',
 'no',
 'doubt',
 'that',
 'God',
 'hears',
 'everybody',
 's',
 'prayers',
 'However',
 'He',
 'does',
 'things',
 'His',
 'way',
 'i',
 'e',
 'things',
 'will']

In [71]:
doc_tokens = word_tokenize(documents[0])
doc_tokens[:20]

['I',
 'have',
 'no',
 'doubt',
 'that',
 'God',
 'hears',
 'everybody',
 "'s",
 'prayers',
 '.',
 'However',
 ',',
 'He',
 'does',
 'things',
 'His',
 'way',
 ',',
 'i.e']

In [72]:
#в менюшке nltk выбрать то, на что ругается

<p><b>Cтемминг</b></p>

In [73]:
from nltk.corpus import stopwords

In [74]:
stop_words = stopwords.words("english") 
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [75]:
stemmer = PorterStemmer()

In [76]:
stemmer.stem("everybody") #stemmer переводит все форрмы слов в одну

'everybodi'

In [77]:
stemmer.stem("trying")

'tri'

In [78]:
stemmer.stem("tried")

'tri'

In [79]:
stemmer.stem("try")

'tri'

<p>Пример</p>

In [80]:
chars = "-[],.<>()={}'\"?!`~/|+*&^%$#@;: \\_"

In [81]:
docs_tokens = list()

for doc in documents:
    doc = doc.lower()
    stemmed_tokens = [stemmer.stem(el) for el in word_tokenize(doc) 
                      if len(el) > 1 and not any(ch.isdigit() or ch in chars for ch in el) 
                      and el not in stop_words]
    docs_tokens.append(stemmed_tokens)
    
docs_tokens[:2]

[['doubt',
  'god',
  'hear',
  'everybodi',
  'prayer',
  'howev',
  'thing',
  'way',
  'thing',
  'happen',
  'question',
  'realli',
  'god',
  'grant',
  'everybodi',
  'wish',
  'get',
  'brutal',
  'shot',
  'realiti',
  'similar',
  'get',
  'toy',
  'want',
  'christma',
  'expect',
  'get',
  'everyth',
  'want',
  'world'],
 ['somebodi',
  'mention',
  'hst',
  'mission',
  'mean',
  'weight',
  'tight',
  'margin',
  'mission',
  'said',
  'done',
  'grappl',
  'hst',
  'stow',
  'cargo',
  'bay',
  'om',
  'burn',
  'high',
  'altitud',
  'unstow',
  'hst',
  'repair',
  'gyro',
  'costar',
  'instal',
  'fix',
  'solar',
  'array',
  'return',
  'earth',
  'guess',
  'bother',
  'usingth',
  'shuttl',
  'reboost',
  'grappl',
  'said',
  'fix',
  'bolt',
  'small',
  'liquid',
  'fuel',
  'thruster',
  'modul',
  'hst',
  'let',
  'make',
  'cheaper',
  'mass',
  'usingth',
  'shuttl',
  'tug',
  'way',
  'go',
  'need',
  'least',
  'spacewalk',
  'carri',
  'edo',
  'pa

<p><b>Определение части речи (POS)</b></p>

In [82]:
nltk.pos_tag(["Me"])

[('Me', 'NN')]

<a href="http://www.winwaed.com/blog/2011/11/08/part-of-speech-tags/">Part of Speech Tags</a>

In [83]:
nltk.pos_tag(documents[0].split())

[('I', 'PRP'),
 ('have', 'VBP'),
 ('no', 'DT'),
 ('doubt', 'NN'),
 ('that', 'IN'),
 ('God', 'NNP'),
 ('hears', 'VBZ'),
 ("everybody's", 'JJ'),
 ('prayers.', 'NN'),
 ('However,', 'NNP'),
 ('He', 'PRP'),
 ('does', 'VBZ'),
 ('things', 'NNS'),
 ('His', 'PRP$'),
 ('way,', 'JJ'),
 ('i.e.', 'JJ'),
 ('things', 'NNS'),
 ('will', 'MD'),
 ('happen', 'VB'),
 ('only', 'RB'),
 ('if', 'IN'),
 ('it', 'PRP'),
 ('is', 'VBZ'),
 ('His', 'PRP$'),
 ('will.', 'NN'),
 ('Now', 'RB'),
 ('if', 'IN'),
 ('the', 'DT'),
 ('question', 'NN'),
 ('really', 'RB'),
 ('is', 'VBZ'),
 ('"Does', 'JJ'),
 ('God', 'NNP'),
 ('grant', 'NN'),
 ("everybody's", 'NN'),
 ('wishes', 'NNS'),
 ('?"', 'VBP'),
 ('then', 'RB'),
 ("you'll", 'RB'),
 ('get', 'VB'),
 ('a', 'DT'),
 ('brutal', 'JJ'),
 ('shot', 'NN'),
 ('of', 'IN'),
 ('reality', 'NN'),
 ('similar', 'JJ'),
 ('to', 'TO'),
 ('when', 'WRB'),
 ('you', 'PRP'),
 ("didn't", 'VBP'),
 ('get', 'VB'),
 ('that', 'IN'),
 ('toy', 'NN'),
 ('you', 'PRP'),
 ('wanted', 'VBD'),
 ('for', 'IN'),
 ('Chri

<a name="3"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">3. Latent Dirichlet Allocation (LDA)</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

<p>Векторизация TF-IDF</p>

In [84]:
def mytokenizer(x):
    for el in x.split():
        if len(el) > 1 and not any(ch.isdigit() or ch in chars for ch in el) \
        and el not in stop_words and nltk.pos_tag([el])[0][1]=="NN":
            yield el #stemmer.stem(el)

In [85]:
vectorizer = TfidfVectorizer(min_df=20,
                             max_df=0.1, tokenizer=mytokenizer,
                             lowercase=True)#"english")#, max_features=1000)

In [90]:
X = vectorizer.fit_transform(documents)

In [91]:
vectorizer.vocabulary_

{'ability': 0,
 'absolute': 1,
 'accept': 2,
 'access': 3,
 'account': 4,
 'act': 5,
 'action': 6,
 'address': 7,
 'admit': 8,
 'advance': 9,
 'advantage': 10,
 'age': 11,
 'agency': 12,
 'agree': 13,
 'air': 14,
 'al': 15,
 'allen': 16,
 'alomar': 17,
 'amount': 18,
 'analysis': 19,
 'answer': 20,
 'anybody': 21,
 'anything': 22,
 'apollo': 23,
 'apple': 24,
 'application': 25,
 'appreciate': 26,
 'approach': 27,
 'appropriate': 28,
 'april': 29,
 'area': 30,
 'argue': 31,
 'argument': 32,
 'article': 33,
 'ask': 34,
 'astronomy': 35,
 'attempt': 36,
 'attention': 37,
 'attitude': 38,
 'authority': 39,
 'avoid': 40,
 'ball': 41,
 'base': 42,
 'baseball': 43,
 'basis': 44,
 'batting': 45,
 'become': 46,
 'begin': 47,
 'belief': 48,
 'bill': 49,
 'bit': 50,
 'blue': 51,
 'board': 52,
 'bob': 53,
 'body': 54,
 'book': 55,
 'born': 56,
 'boston': 57,
 'bought': 58,
 'box': 59,
 'bring': 60,
 'brought': 61,
 'build': 62,
 'building': 63,
 'built': 64,
 'business': 65,
 'buying': 66,
 'cabl

In [92]:
tokens = vectorizer.get_feature_names()
tokens

['ability',
 'absolute',
 'accept',
 'access',
 'account',
 'act',
 'action',
 'address',
 'admit',
 'advance',
 'advantage',
 'age',
 'agency',
 'agree',
 'air',
 'al',
 'allen',
 'alomar',
 'amount',
 'analysis',
 'answer',
 'anybody',
 'anything',
 'apollo',
 'apple',
 'application',
 'appreciate',
 'approach',
 'appropriate',
 'april',
 'area',
 'argue',
 'argument',
 'article',
 'ask',
 'astronomy',
 'attempt',
 'attention',
 'attitude',
 'authority',
 'avoid',
 'ball',
 'base',
 'baseball',
 'basis',
 'batting',
 'become',
 'begin',
 'belief',
 'bill',
 'bit',
 'blue',
 'board',
 'bob',
 'body',
 'book',
 'born',
 'boston',
 'bought',
 'box',
 'bring',
 'brought',
 'build',
 'building',
 'built',
 'business',
 'buying',
 'cable',
 'california',
 'call',
 'cannot',
 'card',
 'care',
 'career',
 'careful',
 'carry',
 'case',
 'catholic',
 'cause',
 'center',
 'centris',
 'chance',
 'change',
 'chapter',
 'charge',
 'cheaper',
 'check',
 'chip',
 'choose',
 'christ',
 'christian',
 

In [93]:
id2token = dict()
for tid in range(len(tokens)):
    id2token[tid] = tokens[tid]

In [94]:
corpus_new = list()
for el in X:
    corpus_new.append(list(zip(el.indices, el.data)))

In [95]:
corpus_new[1][:5]

[(500, 0.24675995479523452),
 (312, 0.24424189709064797),
 (203, 0.26551213163162829),
 (499, 0.48139432168768415),
 (453, 0.23739709034908171)]

<p>Кластеризация</p>

In [96]:
lda = models.ldamodel.LdaModel(corpus=corpus_new, num_topics=4, id2word=id2token, passes=5) #passes - 5 раз пройдемся для обучения

In [97]:
lda.print_topics(20)

[(0,
  '0.017*"year" + 0.013*"baseball" + 0.012*"team" + 0.011*"sox" + 0.011*"lot" + 0.011*"league" + 0.010*"please" + 0.009*"win" + 0.009*"play" + 0.008*"game"'),
 (1,
  '0.030*"space" + 0.014*"nasa" + 0.012*"launch" + 0.011*"orbit" + 0.011*"lunar" + 0.010*"shuttle" + 0.009*"heard" + 0.009*"program" + 0.009*"home" + 0.009*"figure"'),
 (2,
  '0.027*"god" + 0.019*"jesus" + 0.010*"part" + 0.010*"day" + 0.010*"man" + 0.009*"church" + 0.009*"christianity" + 0.008*"christ" + 0.008*"way" + 0.008*"hell"'),
 (3,
  '0.021*"mac" + 0.013*"book" + 0.012*"apple" + 0.012*"scsi" + 0.011*"system" + 0.011*"problem" + 0.011*"disk" + 0.011*"want" + 0.010*"drive" + 0.010*"color"')]

In [98]:
tops = lda.get_topic_terms(2, topn=100)
tops[:4]

[(219, 0.027449347235968955),
 (260, 0.018702710614444729),
 (364, 0.0098985503446653023),
 (128, 0.0096877207219859037)]

<p>Простой тест</p>

In [99]:
x_test = newsgroups_test.data[1]
x_test

'So what is your definition of "interfering with the fielder taking the throw"?\n\nThe rule book certainly doesn\'t have a definiton or clarification, so it\'s\npossible to interpret the rule as saying that if the catcher has to alter\nhis throw to avoid hitting the batter-runner, then again we have interference.\nYou know, it seems that there is no way to apply this rule justly--if the\ncatcher (or the pitcher, say Rob Dibble, for example) throws toward first\nand hits the runner running inside the baseline, the fielder takes the chance\nof being ejected.  Therefore he probably would throw around the runner or\n(your scenario) above him.\n\nYou should note that in our American Legion League, (which uses MLB rules) we\ninterpret the rule to say in this very circumstance there IS interference\nper rule 7.09.'

In [100]:
corpus_test = list()
for el in mytokenizer(x_test):
    
    if el in vectorizer.vocabulary_:
        corpus_test.append((vectorizer.vocabulary_[el], 1.0))

corpus_test

[(136, 1.0),
 (55, 1.0),
 (564, 1.0),
 (40, 1.0),
 (595, 1.0),
 (81, 1.0),
 (564, 1.0),
 (349, 1.0)]

In [101]:
newsgroups_test.target

array([3, 1, 3, ..., 3, 2, 1], dtype=int64)

In [102]:
lda.get_document_topics(corpus_test)

[(0, 0.40979828990102418),
 (1, 0.029146764800081531),
 (2, 0.52868915339238276),
 (3, 0.032365791906511635)]

<a name="4"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">4. Визуализация с использованием WordCloud</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

In [1]:
from wordcloud import WordCloud

ModuleNotFoundError: No module named 'wordcloud'

In [107]:
dict_top = {id2token[id]:pr for id, pr in tops}
dict_top

{'accept': 0.0042006939473176004,
 'answer': 0.0057221349129499286,
 'ask': 0.0052369813648890694,
 'bill': 0.0031230515335756012,
 'body': 0.0073567663350291865,
 'cannot': 0.0040565110819015303,
 'cause': 0.0042517525287223048,
 'centris': 0.0039903741332256324,
 'check': 0.004203106288183588,
 'choose': 0.0045641224658503647,
 'christ': 0.0084791612066368247,
 'christian': 0.0041253753293957482,
 'christianity': 0.0087333396885486986,
 'church': 0.0094201428976230413,
 'claim': 0.0043485718620933948,
 'claimed': 0.0032330748536481637,
 'concept': 0.0031762138190376999,
 'day': 0.0096877207219859037,
 'death': 0.0051422983089297464,
 'defensive': 0.0036158841426596573,
 'existence': 0.0032252710754860476,
 'experience': 0.0038184282165291726,
 'explain': 0.0043013168960917801,
 'fact': 0.0046143808241997071,
 'faith': 0.0051061641377886484,
 'father': 0.0047808214055777825,
 'fear': 0.0035163153563122036,
 'felt': 0.0037651222529385834,
 'found': 0.0036000603867523805,
 'friend': 0.0

In [108]:
wc = WordCloud(background_color="white", 
               max_words=100, 
               width=1600,
               height=1200).generate_from_frequencies(dict_top)

NameError: name 'WordCloud' is not defined

In [None]:
plt.figure(figsize=[10,10])
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

<a name="5"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">5. Источники</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

In [None]:
http://qwone.com/~jason/20Newsgroups/
http://scikit-learn.org/stable/datasets/twenty_newsgroups.html

In [None]:
https://radimrehurek.com/gensim/models/ldamodel.html
https://amueller.github.io/word_cloud/auto_examples/simple.html
http://www.peculiarparity.com/using-gensim-with-andreas-muellers-word-cloud/