In [1]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\T.Aathman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [4]:
for w in ['dogs', 'ran', 'discouraged']:
    print(w, get_lemma(w), get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [5]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\T.Aathman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [7]:
import random
text_data = []
with open('dataset.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['socialsensor', 'sensing', 'generate', 'input', 'improve', 'medium', 'discovery', 'experience']
['potter', 'wheel', 'interactive', 'cleaning', 'system']
['towards', 'highly', 'scalable', 'effective', 'metasearch', 'engine']
['detection', 'temperature', 'gradient', 'using', 'power', 'temperature', 'sensor']
['personalize', 'recommendation', 'dynamic', 'content', 'using', 'predictive', 'bilinear', 'model']
['step', 'hybrid', 'simulation', 'large', 'scale', 'network']
['orient', 'query', 'robot', 'natural', 'language', 'query', 'system']
['model', 'parameter', 'estimation', 'h.264/avc', 'control']
['learning', 'classify', 'human', 'object', 'sketch']
['efficient', 'raytracing', 'dynamic', 'metaballs']
['cosmicai', 'generate', 'background', 'content', 'base', 'search', 'flexible', 'composition']
['symmetry', 'aware', 'analog', 'layout', 'placement', 'design', 'handling', 'substrate', 'sharing', 'constraint']
['energy', 'efficient', 'reconfigurable', 'viterbi', 'decoder', 'programmable', '

In [8]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [9]:
corpus = [dictionary.doc2bow(text) for text in text_data]

In [10]:
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

### Try 5 topics

In [11]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [12]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.038*"query" + 0.020*"drive" + 0.020*"scale" + 0.020*"large"')
(1, '0.034*"generate" + 0.034*"model" + 0.034*"content" + 0.019*"dynamic"')
(2, '0.033*"image" + 0.033*"efficient" + 0.018*"substrate" + 0.018*"sharing"')
(3, '0.041*"temperature" + 0.022*"sensor" + 0.022*"using" + 0.022*"fluorescent"')
(4, '0.027*"system" + 0.027*"base" + 0.027*"generate" + 0.027*"design"')


In [13]:
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(49, 1), (121, 1)]
[(0, 0.06669849), (1, 0.39907223), (2, 0.06669479), (3, 0.40082628), (4, 0.06670825)]


In [14]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.027*"efficient" + 0.016*"design" + 0.016*"network" + 0.016*"base"')
(1, '0.023*"system" + 0.023*"query" + 0.023*"drive" + 0.013*"large"')
(2, '0.033*"model" + 0.023*"generate" + 0.023*"using" + 0.023*"temperature"')


In [15]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.043*"efficient" + 0.043*"energy" + 0.043*"programmable" + 0.043*"decoder"')
(1, '0.039*"sensing" + 0.039*"improve" + 0.039*"medium" + 0.039*"socialsensor"')
(2, '0.064*"garden" + 0.064*"cultivation" + 0.064*"henry" + 0.064*"grow"')
(3, '0.044*"design" + 0.044*"system" + 0.044*"query" + 0.023*"aware"')
(4, '0.072*"image" + 0.038*"using" + 0.038*"0.18-&micro;m" + 0.038*"transformer"')
(5, '0.060*"learning" + 0.060*"object" + 0.060*"classify" + 0.060*"sketch"')
(6, '0.060*"control" + 0.060*"parameter" + 0.060*"estimation" + 0.060*"h.264/avc"')
(7, '0.026*"scale" + 0.026*"large" + 0.026*"drive" + 0.026*"network"')
(8, '0.083*"temperature" + 0.043*"using" + 0.043*"sensor" + 0.043*"gradient"')
(9, '0.058*"generate" + 0.058*"model" + 0.058*"content" + 0.058*"base"')


### pyLDAvis

In [16]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [17]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary,sort_topics=False)
pyLDAvis.save_html(lda_display, 'lda.html')
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [18]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.save_html(lda_display3, 'lda3.html')
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [19]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.save_html(lda_display10, 'lda10.html')
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
