In [1]:
import gensim
import pyLDAvis
from pyLDAvis import gensim as gensimvis
import spacy

import logging
from tqdm import tqdm
from pprint import pprint


from sklearn import datasets
import pandas as pd

In [2]:
# The six newsgroups kept for this analysis.
categories = [
    'alt.atheism',
    'comp.graphics',
    'rec.sport.baseball',
    'rec.motorcycles',
    'sci.space',
    'talk.politics.mideast',
]

# Fetch the training split of the 20 Newsgroups dataset, restricted to
# `categories`, with headers, footers and quoted replies removed.
ng_train = datasets.fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [3]:
# Display the first raw post as a sanity check on what was loaded.
ng_train.data[0]

'Well, the Red Sox have apparenly resigned Herm Winningham to a AAA contract.\nTed "Larry" Simmons signed him to a AAA contract then released him from\nBuffalo, allowing Lou "Curly" Gorman to circumvent the rule about not\nresigning free agents until May 1. Clearly, neither of these guys is bright\nenough to be Moe.\n\n Mike Jones | AIX High-End Development | mjones@donald.aix.kingston.ibm.com'

In [4]:
# Route INFO-level (and above) log records to lda_model.log, one timestamped
# line per record.
logging.basicConfig(
    filename='lda_model.log',
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [5]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is coerced with ``str`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One result is
    yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
        

# Materialise the tokenizer output: one token list per raw post, in the same
# order as ng_train.data.
clean_sents = list(sent_to_words(ng_train.data))

In [6]:
# Build the bigram and trigram models
# NOTE: only `min_count` is set here (minimum number of occurrences for a
# token/pair to be considered). The "higher value -> fewer phrases" score
# cutoff is the separate `threshold` parameter of Phrases, which is left at
# its default.
bigram = gensim.models.Phrases(clean_sents, min_count=20)

# Second pass, trained on the bigram-merged corpus, so that already-merged
# pairs can be joined with a neighbouring token into longer phrases.
trigram = gensim.models.Phrases(bigram[clean_sents], min_count=10)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)



In [7]:
# Preview the first five documents with the bigram and then the trigram model
# applied; clean_sents itself is not modified here.
for sentence in clean_sents[0:5]:
    phrased = trigram_model[bigram_model[sentence]]
    print(f'{" ".join(phrased)} \n')

well the red_sox have apparenly resigned herm winningham to aaa contract ted larry simmons signed him to aaa contract then released him from buffalo allowing lou curly gorman to circumvent the rule about not resigning free agents until may clearly neither of these_guys is bright enough to_be moe mike jones aix high end development mjones donald aix kingston ibm_com 

was_wondering_if anyone knows where_can_get more information_about the graphics in the wingcommander series and the realspace system they use think it really awesome and wouldn mind being_able_to use similar features in programs thanks_in_advance 

do is still concept the software is what sells and what will determine its success apparantly you dont keep up on the news do was shown at ces to developers and others at private showings over software licensees currently developing software for it 

this posts contains too_many fallacies to respond too the abolishment of divinity requires the elimination of freewill you have no

In [8]:
# Replace every document with its phrase-merged version (bigram model first,
# then trigram model).
# NOTE: this overwrites clean_sents with a transform of itself, so re-running
# the cell feeds already-merged tokens through the phrase models again.
clean_sents = [trigram_model[bigram_model[t]] for t in clean_sents]

In [9]:
# Show the first five documents as space-joined token strings.
for tokens in clean_sents[:5]:
    joined = " ".join(tokens)
    print(f'{joined} \n')

well the red_sox have apparenly resigned herm winningham to aaa contract ted larry simmons signed him to aaa contract then released him from buffalo allowing lou curly gorman to circumvent the rule about not resigning free agents until may clearly neither of these_guys is bright enough to_be moe mike jones aix high end development mjones donald aix kingston ibm_com 

was_wondering_if anyone knows where_can_get more information_about the graphics in the wingcommander series and the realspace system they use think it really awesome and wouldn mind being_able_to use similar features in programs thanks_in_advance 

do is still concept the software is what sells and what will determine its success apparantly you dont keep up on the news do was shown at ces to developers and others at private showings over software licensees currently developing software for it 

this posts contains too_many fallacies to respond too the abolishment of divinity requires the elimination of freewill you have no

In [10]:
# Load spaCy's small English pipeline by its package name: the 'en' shortcut
# link was removed in spaCy v3 and no longer loads there, while the package
# name works on both v2 and v3 installs.
# Parser and NER are disabled because downstream code only reads each token's
# pos_, lemma_ and is_stop.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Coarse POS tags that are sensible choices here: NOUN, ADJ, VERB, ADV
def lemmatization(texts, allowed_postags=('NOUN',), keep_empty=False):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document is re-joined with single spaces, run through the
    module-level ``nlp`` pipeline, and reduced to the ``lemma_`` of every
    token whose ``pos_`` is in ``allowed_postags`` and whose ``is_stop`` flag
    is false.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Collection of ``token.pos_`` values to keep.
            Defaults to nouns only.
        keep_empty: When False (the default), a document with no surviving
            token is dropped, so positions in the result no longer line up
            with positions in ``texts``. Pass True to keep such documents as
            empty lists and preserve index alignment with the input.

    Returns:
        A list of lemma lists, one per retained document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and not token.is_stop
        ]
        if lemmas or keep_empty:
            texts_out.append(lemmas)
    return texts_out

In [11]:
# Reduce every document to its lemmas (default POS filter of `lemmatization`).
# NOTE: this overwrites clean_sents, and `lemmatization` drops documents that
# end up with no tokens, so positions in clean_sents may no longer match
# positions in ng_train.data after this cell.
clean_sents = lemmatization(clean_sents)

In [12]:
# Show the first five lemmatized documents as space-joined strings.
for lemmas in clean_sents[0:5]:
    joined = " ".join(lemmas)
    print(f'{joined} \n')

red_sox winningham aaa contract larry simmon aaa contract buffalo rule agent these_guy to_be moe mike jone end development mjone kingston ibm_com 

where_can_get graphic wingcommander series realspace system wouldn mind feature program thanks_in_advance 

software success news ce developer showing software licensee software 

post fallacy abolishment divinity elimination freewill existance omniscience freewill mortal reason angel do_not freewill 

motorcycle enthusiast motorcycle advice bike one_thing bike time question racing bike cbr gsx ve_never bike learner love advice help_me search place bike you_want thank 



![The Office](theoffice.gif)

# Now for the LDA part

In [13]:
# Assign an integer id to every distinct token in the lemmatized documents.
id2word = gensim.corpora.Dictionary(clean_sents)

# Encode each document as a bag of words using that dictionary.
corpus = list(map(id2word.doc2bow, clean_sents))

In [14]:
# Human-readable view of the first document's bag of words: (token, count).
[(id2word[token_id], count) for token_id, count in corpus[0]]

[('aaa', 2),
 ('agent', 1),
 ('buffalo', 1),
 ('contract', 2),
 ('development', 1),
 ('end', 1),
 ('ibm_com', 1),
 ('jone', 1),
 ('kingston', 1),
 ('larry', 1),
 ('mike', 1),
 ('mjone', 1),
 ('moe', 1),
 ('red_sox', 1),
 ('rule', 1),
 ('simmon', 1),
 ('these_guy', 1),
 ('to_be', 1),
 ('winningham', 1)]

In [15]:
# Fit the LDA topic model on the bag-of-words corpus.
lda_model = gensim.models.ldamulticore.LdaMulticore(
    corpus=corpus,
    id2word=id2word,        # lets topics be reported as words rather than ids
    num_topics=10,
    random_state=100,       # fixed seed
    chunksize=100,          # documents per training chunk
    passes=128,             # passes over the corpus during training
    per_word_topics=True,
)

In [16]:
# Pretty-print the fitted topics as (topic id, weighted top-words string).
pprint(lda_model.print_topics())

[(0,
  '0.014*"armenian" + 0.012*"people" + 0.010*"turkey" + 0.008*"turk" + '
  '0.008*"village" + 0.006*"jew" + 0.006*"armenia" + 0.006*"genocide" + '
  '0.005*"azerbaijan" + 0.005*"population"'),
 (1,
  '0.015*"edu" + 0.013*"graphic" + 0.011*"line" + 0.009*"mail" + '
  '0.008*"driver" + 0.007*"mode" + 0.006*"thank" + 0.006*"com" + 0.005*"card" '
  '+ 0.005*"program"'),
 (2,
  '0.020*"people" + 0.011*"woman" + 0.009*"time" + 0.008*"armenian" + '
  '0.007*"apartment" + 0.007*"car" + 0.007*"neighbor" + 0.006*"child" + '
  '0.006*"way" + 0.006*"thing"'),
 (3,
  '0.018*"space" + 0.008*"program" + 0.008*"system" + 0.007*"year" + '
  '0.007*"time" + 0.006*"technology" + 0.006*"rocket" + 0.005*"nasa" + '
  '0.005*"satellite" + 0.005*"flight"'),
 (4,
  '0.024*"game" + 0.020*"team" + 0.015*"player" + 0.013*"year" + 0.009*"fan" + '
  '0.009*"baseball" + 0.009*"season" + 0.008*"run" + 0.007*"time" + '
  '0.006*"last_year"'),
 (5,
  '0.030*"bike" + 0.012*"dod" + 0.010*"motorcycle" + 0.008*"time" 

In [20]:
# Re-print the first raw post so it can be read next to its topic mixture.
print(ng_train.data[0])

Well, the Red Sox have apparenly resigned Herm Winningham to a AAA contract.
Ted "Larry" Simmons signed him to a AAA contract then released him from
Buffalo, allowing Lou "Curly" Gorman to circumvent the rule about not
resigning free agents until May 1. Clearly, neither of these guys is bright
enough to be Moe.

 Mike Jones | AIX High-End Development | mjones@donald.aix.kingston.ibm.com


In [21]:
# Topic mixture of the first corpus document as (topic id, probability)
# pairs, ordered from most to least probable.
top_topics = sorted(
    lda_model.get_document_topics(corpus[0]),
    key=lambda pair: pair[1],
    reverse=True,
)

print(top_topics)

[(4, 0.5354464), (2, 0.3018573), (3, 0.13087486)]


In [22]:
# Switch on pyLDAvis' notebook integration.
pyLDAvis.enable_notebook()
# Build the visualisation data from the fitted model, the bag-of-words corpus
# and the dictionary.
vis = gensimvis.prepare(lda_model, corpus, id2word)
# Write a standalone copy to lda.html. Note that the cell does not end with
# `vis`, so nothing is displayed inline by this cell itself.
pyLDAvis.save_html(vis, 'lda.html')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
