In [43]:
# A __future__ import has to be the first statement of a module, so it goes
# first here; that keeps the cell valid when the notebook is exported to a
# plain .py script as well as inside IPython.
from __future__ import unicode_literals

import codecs

import spacy
from spacy.lang.en import English

# NOTE(review): the pipeline returned by spacy.load('en') is discarded; only
# the English() instance below is used (by tokenize). The call is kept in case
# it is relied on to check that the model is installed -- confirm, and drop it
# if not.
spacy.load('en')

# Module-level parser used by tokenize().
parser = English()

def tokenize(text):
    """Split ``text`` into normalised tokens using the module-level ``parser``.

    Tokens whose ``orth_`` is pure whitespace are dropped. A token flagged
    ``like_url`` is replaced by the placeholder ``'URL'``, a token whose
    ``orth_`` starts with ``'@'`` by ``'SCREEN_NAME'``, and any other token
    contributes its ``lower_`` attribute.

    Returns the list of resulting strings, in the order the parser yields them.
    """
    def _normalise(tok):
        # The URL check comes first, so a URL-like token that also starts
        # with '@' is reported as 'URL'.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [44]:
import nltk
# Download the NLTK 'wordnet' package; the wordnet corpus reader is imported
# below as ``wn``.
nltk.download('wordnet')

from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return ``wn.morphy(word)``, falling back to ``word`` when that is None."""
    morphed = wn.morphy(word)
    # Only a None result triggers the fallback; any other value is returned.
    return word if morphed is None else morphed
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a newly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to /home/abhishek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [45]:
# Download the NLTK 'stopwords' package, then keep the English list as a set;
# prepare_text_for_lda tests tokens for membership in en_stop.
nltk.download('stopwords')
english_stopwords = nltk.corpus.stopwords.words('english')
en_stop = set(english_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abhishek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
def prepare_text_for_lda(text, min_length=5):
    """Turn raw ``text`` into the list of tokens used for LDA.

    The text is split with ``tokenize``; tokens shorter than ``min_length``
    characters or contained in ``en_stop`` are dropped, and every remaining
    token is passed through ``get_lemma``.

    Parameters
    ----------
    text
        The raw document to process.
    min_length
        Minimum number of characters a token must have to be kept. The
        default of 5 reproduces the previous hard-coded ``len(token) > 4``
        rule.

    Returns
    -------
    list
        The lemmatized tokens, in their original order.
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        # Filtering happens on the token as produced by tokenize(), before
        # lemmatization, exactly as in the original three-pass version.
        if len(token) >= min_length and token not in en_stop
    ]

In [47]:
import random

# Token lists collected from the dataset; one entry per kept line.
text_data = []

# NOTE(review): a line's tokens are appended only when the random check
# passes, so roughly 1% of the lines end up in text_data and the rest are
# tokenized and discarded. Confirm that this sampling is intentional. The
# random module is not seeded here, so the sample changes between runs.
#
# The with-block closes the file once the loop finishes (the handle was
# previously left open and was bound to a name shadowing the builtin `file`).
with codecs.open("/home/abhishek/projects/TopicmodellingSample/dataset3.csv", "r", "utf-8") as dataset_file:
    for line in dataset_file:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

[u'language', u'shading', u'lighting', u'calculation']
[u'localization', u'surface', u'sensor', u'network', u'challenge', u'solution']
[u'highly', u'programmable', u'infrastructure', u'prototyping', u'developing', u'deploy', u'genomics', u'centric', u'application']
[u'experimental', u'study', u'leakage', u'delay', u'trade', u'germanium', u'pmosfets', u'logic', u'circuit']
[u'security', u'using', u'shannon', u'elia', u'code']
[u'synchronization', u'kuramoto', u'model', u'multi', u'scale', u'property']
[u'bureo', u'census']
[u'approach', u'construct', u'interface']
[u'online', u'modeling', u'proactive', u'moderation', u'system', u'auction', u'fraud', u'detection']
[u'streaming', u'code', u'channels', u'burst', u'isolate', u'erasure']
[u'spectrum', u'cloud', u'session', u'base', u'spectrum', u'trading', u'system', u'multi', u'cognitive', u'radio', u'network']
[u'distribute', u'secure', u'balancing', u'heterogeneity', u'churn']
[u'optimize', u'asynchronous', u'multi', u'channel', u'neighbo

In [48]:
from gensim import corpora
# Build the gensim Dictionary from the token lists collected in text_data; it
# is used below for doc2bow() and passed to the LDA models as id2word.
dictionary = corpora.Dictionary(text_data)

In [49]:
import pickle

# One doc2bow() result per token list in text_data.
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist the corpus and dictionary; a later cell reloads both from disk.
# The with-block ensures the pickle file is flushed and closed (the handle
# passed inline to pickle.dump was never closed explicitly).
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [50]:
import gensim

NUM_TOPICS = 5

# Fit an LDA model with NUM_TOPICS topics on the bag-of-words corpus and save
# it as 'model5.gensim'.
# NOTE(review): no random_state is passed -- presumably the topics can differ
# between runs; confirm whether a fixed seed is wanted.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
)
ldamodel.save('model5.gensim')

# Print every topic, described by four words each.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, u'0.024*"code" + 0.024*"detection" + 0.024*"efficiency" + 0.024*"isolate"')
(1, u'0.021*"noise" + 0.021*"transistor" + 0.021*"readout" + 0.021*"current"')
(2, u'0.043*"using" + 0.030*"system" + 0.016*"developing" + 0.016*"genomics"')
(3, u'0.049*"multi" + 0.027*"neighbor" + 0.027*"asynchronous" + 0.027*"model"')
(4, u'0.052*"base" + 0.052*"spectrum" + 0.028*"network" + 0.028*"semantic"')


In [51]:
# Run an unseen title through the same preprocessing and query the model
# currently bound to ``ldamodel`` for its topic distribution.
# NOTE(review): the recorded run printed an empty bag-of-words ([]) and equal
# weights for all five topics -- presumably none of the title's tokens were in
# the dictionary; confirm.
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)  # the name now holds the token list
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
new_doc_topics = ldamodel.get_document_topics(new_doc_bow)
print(new_doc_topics)

[]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]


In [52]:
# Fit an LDA model with 3 topics on the same corpus and save it as
# 'model3.gensim'. This rebinds the module-level name ``ldamodel``.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
)
ldamodel.save('model3.gensim')

# Print every topic, described by four words each.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, u'0.033*"multi" + 0.023*"system" + 0.023*"spectrum" + 0.013*"efficiency"')
(1, u'0.023*"network" + 0.023*"using" + 0.023*"code" + 0.023*"detection"')
(2, u'0.020*"semantic" + 0.020*"using" + 0.020*"document" + 0.020*"underapproximation"')


In [53]:
# Fit an LDA model with 10 topics on the same corpus and save it as
# 'model10.gensim'. This rebinds the module-level name ``ldamodel``.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
)
ldamodel.save('model10.gensim')

# Print every topic, described by four words each.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, u'0.026*"moderation" + 0.026*"projection" + 0.026*"language" + 0.026*"projectile"')
(1, u'0.040*"detection" + 0.040*"base" + 0.040*"cache" + 0.040*"duplicate"')
(2, u'0.063*"multi" + 0.063*"semantic" + 0.062*"efficient" + 0.062*"maintenance"')
(3, u'0.054*"system" + 0.054*"spectrum" + 0.028*"multi" + 0.028*"session"')
(4, u'0.053*"network" + 0.053*"challenge" + 0.053*"solution" + 0.053*"sensor"')
(5, u'0.062*"tracing" + 0.062*"homepage" + 0.062*"block" + 0.062*"automatic"')
(6, u'0.036*"logic" + 0.036*"leakage" + 0.036*"pmosfets" + 0.036*"trade"')
(7, u'0.056*"using" + 0.056*"factorization" + 0.056*"nonnegative" + 0.056*"document"')
(8, u'0.051*"application" + 0.051*"highly" + 0.051*"infrastructure" + 0.051*"centric"')
(9, u'0.035*"using" + 0.035*"velocity" + 0.035*"readout" + 0.035*"pattern"')


In [55]:
# Reload the dictionary, corpus and 5-topic model from the files written by
# the earlier cells.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# corpus.pkl is produced by this notebook. pickle.load can execute arbitrary
# code, so never point it at a file from an untrusted source.
# The with-block closes the file handle once the corpus is loaded.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
# Keep this as the last expression so the notebook renders its result.
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [56]:
# Load the model saved as 'model3.gensim' and render it with pyLDAvis, using
# the corpus and dictionary already in scope.
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(
    lda3,
    corpus,
    dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display3)

In [57]:
# Load the model saved as 'model10.gensim' and render it with pyLDAvis, using
# the corpus and dictionary already in scope.
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(
    lda10,
    corpus,
    dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display10)