In [1]:
import spacy
from spacy.lang.en import English

# A blank English pipeline is all tokenize() needs: it only uses the
# rule-based tokenizer and lexical attributes (orth_, lower_, like_url).
# The former `spacy.load('en')` call loaded a full model and threw the
# result away; the 'en' shortcut name is also rejected by spaCy 3+.
parser = English()

def tokenize(text):
    """Split ``text`` into normalized tokens for LDA.

    Whitespace-only tokens are dropped, URL-like tokens are replaced by
    ``'URL'``, tokens starting with ``@`` are replaced by ``'SCREEN_NAME'``,
    and every other token is lower-cased. Uses the module-level ``parser``.

    Returns:
        list of str, in document order.
    """
    def normalize(token):
        # Placeholder substitution takes precedence over lower-casing.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [normalize(token) for token in parser(text) if not token.orth_.isspace()]

In [3]:
# Download the NLTK WordNet corpus; it is used further down for lemma lookup
# (wn.morphy and WordNetLemmatizer).
import nltk
nltk.download('wordnet')

#Wordnet lemmatizer to get root word
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    Falls back to ``word`` unchanged when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sivard2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# English stopwords, held in a set for fast membership tests during filtering.
nltk.download('stopwords')
en_stop = {stopword for stopword in nltk.corpus.stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sivard2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of lemmas fed to LDA.

    Tokens are produced by ``tokenize``; those of four characters or fewer
    and those found in ``en_stop`` are discarded, and the survivors are
    passed through ``get_lemma``.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [6]:
import random

# Dedicated, seeded generator: the random sample (and every model trained on
# it) is the same on each Restart & Run All, without touching the global
# `random` state.
SAMPLE_SEED = 42
# A line is kept when its draw exceeds this threshold, i.e. roughly 1% of lines.
SAMPLE_THRESHOLD = .99

sampler = random.Random(SAMPLE_SEED)
text_data = []
with open('dataset.csv') as f:
    for line in f:
        # Draw first so the ~99% of lines that are discarded are never
        # tokenized or lemmatized.
        if sampler.random() <= SAMPLE_THRESHOLD:
            continue
        tokens = prepare_text_for_lda(line)
        print(tokens)
        # NOTE(review): only the sampled lines end up in text_data, so the
        # corpus is ~1% of the file — confirm that subsampling is intended.
        text_data.append(tokens)

['gesture', 'world', 'technology', 'estimation', 'system', 'unspecified', 'user', 'using', 'compact', 'speed', 'camera']
['integrating', 'fluid', 'model', 'packet', 'simulation']
['opinion', 'receive', 'online', 'community', 'study', 'helpfulness', 'vote']
['textual', 'contextual', 'pattern', 'sentiment', 'analysis', 'microblogs']
['hardware', 'organization', 'nonnumeric', 'processing']
['apply', 'database', 'technology', 'storage', 'system']
['mix', 'domain', 'receiver', 'architecture', 'white', 'space', 'software', 'define', 'radio', 'scenario']
['automate', 'design', 'multi', 'dimensional', 'clustering', 'table', 'relational', 'database']
['voltage', 'single', 'end', 'fully', 'differential', 'amplifier', 'programmable']
['tile', 'interleave', 'multi', 'level', 'discrete', 'wavelet', 'transform']
['dipzoom', 'internet', 'measurement', 'marketplace']
['3.5-d', 'integration', 'study']
['architecture', 'manage', 'application', 'services', 'global', 'network']
['intelligent', 'remotely',

In [7]:
from gensim import corpora
# Map each distinct token to an integer id, then encode every document as a
# bag-of-words list of (token_id, count) pairs.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
# Persist both artifacts so later cells can reload them. The context manager
# guarantees the pickle file is flushed and closed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')



# 5 Topics

In [19]:
import gensim
NUM_TOPICS = 5
# LDA training is stochastic; a fixed random_state makes the learned topics
# (and the saved model) reproducible across runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model5.gensim')
# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.017*"hardware" + 0.017*"array" + 0.017*"field" + 0.017*"digitally"')
(1, '0.024*"system" + 0.024*"single" + 0.024*"amplifier" + 0.024*"technology"')
(2, '0.041*"database" + 0.022*"architecture" + 0.022*"storage" + 0.022*"apply"')
(3, '0.030*"application" + 0.030*"internet" + 0.017*"study" + 0.017*"merge"')
(4, '0.022*"network" + 0.022*"system" + 0.022*"phase" + 0.022*"processing"')


In [20]:
# Sanity check: infer the topic mixture of an unseen title with the current model.
new_doc = prepare_text_for_lda('Practical Bayesian Optimization of Machine Learning Algorithms')
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
new_doc_topics = ldamodel.get_document_topics(new_doc_bow)
print(new_doc_topics)

[(87, 1)]
[(0, 0.10001788), (1, 0.10002507), (2, 0.10002322), (3, 0.10304928), (4, 0.5968846)]


In [10]:
# 3 topics
# LDA training is stochastic; a fixed random_state makes the learned topics
# (and the saved model) reproducible across runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model3.gensim')
# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.031*"system" + 0.021*"technology" + 0.021*"database" + 0.012*"study"')
(1, '0.028*"network" + 0.028*"application" + 0.016*"processing" + 0.016*"algorithm"')
(2, '0.022*"programmable" + 0.022*"single" + 0.022*"amplifier" + 0.022*"hardware"')


In [11]:
# 10 topics
# LDA training is stochastic; a fixed random_state makes the learned topics
# (and the saved model) reproducible across runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model10.gensim')
# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.027*"technology" + 0.027*"compact" + 0.027*"user" + 0.027*"unspecified"')
(1, '0.041*"processing" + 0.041*"application" + 0.041*"parallel" + 0.041*"discrete"')
(2, '0.048*"network" + 0.048*"service" + 0.048*"support" + 0.048*"wireless"')
(3, '0.034*"study" + 0.034*"vote" + 0.034*"community" + 0.034*"packet"')
(4, '0.031*"architecture" + 0.031*"software" + 0.031*"scenario" + 0.031*"radio"')
(5, '0.088*"system" + 0.031*"aware" + 0.031*"move" + 0.031*"recommender"')
(6, '0.052*"programmable" + 0.052*"amplifier" + 0.052*"single" + 0.052*"differential"')
(7, '0.029*"hardware" + 0.029*"contextual" + 0.029*"analysis" + 0.029*"switching"')
(8, '0.044*"database" + 0.044*"multi" + 0.044*"relational" + 0.044*"design"')
(9, '0.044*"internet" + 0.044*"algorithm" + 0.044*"phase" + 0.044*"multilevel"')


# pyLDAvis

In [15]:
# Reload the persisted artifacts so this section does not depend on whichever
# model happens to be bound to `ldamodel` in the kernel.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# NOTE: unpickling can execute arbitrary code; only load files this notebook
# produced itself. The context manager closes the handle after reading.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model10.gensim')

import pyLDAvis.gensim
# sort_topics=False keeps pyLDAvis topic numbering aligned with the model's ids.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
