In [1]:
import os
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

# from utils.helpers import trigram_bow_generator, explore_topic

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle

# All intermediate artifacts of this notebook live in one directory.
intermediate_directory = '../data/intermediate'

# Paths of the trigram essay file and of the dictionary file, both inside
# the intermediate directory.
trigram_essays_all_filepath, trigram_dictionary_filepath = (
    os.path.join(intermediate_directory, artifact_name)
    for artifact_name in ('trigram_essays_all6.txt', 'trigram_dict_all6.dict')
)

In [2]:
# Learning the dictionary is a bit time consuming. The guard below is
# always true, so the dictionary is rebuilt and re-saved on every run;
# change the condition to a false one to skip the rebuild and reuse the
# copy that is already on disk.
if 1 == 1:

    trigram_essays = LineSentence(trigram_essays_all_filepath)

    # learn the dictionary by iterating over all of the essays
    trigram_dictionary = Dictionary(trigram_essays)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    # write the dictionary to disk; the load below reads this same file
    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

# path of the bag-of-words corpus file, inside the intermediate directory
trigram_bow_filepath = os.path.join(intermediate_directory, 'trigram_bow_corpus_all6.mm')

In [3]:
def trigram_bow_generator(filepath):
    """
    Lazily turn the essays stored in ``filepath`` into bag-of-words form.

    Every essay produced by ``LineSentence(filepath)`` is passed to
    ``trigram_dictionary.doc2bow`` and the result is yielded, one essay
    at a time, so the whole corpus never has to be held in memory.
    """

    yield from (trigram_dictionary.doc2bow(tokens)
                for tokens in LineSentence(filepath))

# Building the bag-of-words corpus is a bit time consuming. The guard
# below is always true, so the corpus is re-serialized on every run;
# change the condition to a false one to reuse the file already on disk.
if 1 == 1:

    # generate bag-of-words representations for
    # all essays and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath, trigram_bow_generator(trigram_essays_all_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

# path of the LDA model file, inside the intermediate directory
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all6')

In [71]:
# Training the LDA model is a bit time consuming. The guard below is
# always true, so the model is retrained and re-saved on every run;
# change the condition to a false one to reuse the model already on disk.
if 1 == 1:

    # every warning raised while the model is being trained is suppressed
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        # random_state is pinned to a constant, presumably so that
        # repeated runs give the same topics
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=5,
                           id2word=trigram_dictionary,
                           workers=3,
                           random_state=5)
    
    # write the model to disk; the load below reads this same file
    lda.save(lda_model_filepath)
    
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

In [72]:
def explore_topic(topic_number, topn=10):
    """
    Print a formatted table of the top terms of one topic of ``lda``.

    Parameters
    ----------
    topic_number : int
        Index of the topic, forwarded to ``lda.show_topic``.
    topn : int, optional
        Number of terms to print. Defaults to 10.

    Returns
    -------
    None
        The table is written to standard output: a header line, a blank
        line, then one row per term with its weight shown to three
        decimal places.
    """

    print('{:20} {}'.format('term', 'frequency') + '\n')

    # forward the caller's topn so the argument actually controls the
    # number of rows, instead of always asking for a fixed 10
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print('{:20} {:.3f}'.format(term, round(frequency, 3)))

In [73]:
# print the top terms of topic 0, using the default number of terms
explore_topic(topic_number=0)

term                 frequency

mooring_mast         0.022
frame                0.019
problem              0.013
mast                 0.013
moor                 0.011
stress               0.010
land                 0.010
nature               0.010
add_stress           0.010
safety               0.009


In [23]:
def words_in_a_topic(topic_number, topn=10):
    """
    Return the top terms of one topic of ``lda`` together with their weights.

    Parameters
    ----------
    topic_number : int
        Index of the topic, forwarded to ``lda.show_topic``.
    topn : int, optional
        Number of terms to return. Defaults to 10, the same default
        gensim uses for ``show_topic``, so existing one-argument calls
        behave as before.

    Returns
    -------
    list of tuple
        ``(term, frequency)`` pairs in the order ``lda.show_topic``
        yields them. Nothing is printed.
    """
    return [(term, frequency)
            for term, frequency in lda.show_topic(topic_number, topn=topn)]

In [29]:
# (term, weight) pairs for the topic with index 0 (despite the "1" in the name)
topic1 = words_in_a_topic(0)

In [31]:
# number of (term, weight) pairs returned for topic 0
len(topic1)

10

In [74]:

# For each topic, show the words occurring in that topic and their relative weights.
for idx, topic in lda.print_topics(-1):
    # three trailing newlines reproduce the blank lines that separate topics
    print("Topic: {} \nWords: {}".format(idx, topic), end="\n\n\n")

Topic: 0 
Words: 0.022*"mooring_mast" + 0.019*"frame" + 0.013*"problem" + 0.013*"mast" + 0.011*"moor" + 0.010*"stress" + 0.010*"land" + 0.010*"nature" + 0.010*"add_stress" + 0.009*"safety"


Topic: 1 
Words: 0.022*"build" + 0.015*"problem" + 0.015*"mooring_mast" + 0.014*"Empire_State" + 0.014*"frame" + 0.010*"nature" + 0.008*"stress" + 0.008*"exist_law_airship_fly" + 0.008*"idea" + 0.008*"mast"


Topic: 2 
Words: 0.027*"problem" + 0.020*"mast" + 0.018*"mooring_mast" + 0.014*"frame" + 0.012*"blimp" + 0.012*"architect" + 0.012*"paragraph" + 0.012*"area" + 0.011*"great" + 0.011*"Empire_State"


Topic: 3 
Words: 0.029*"problem" + 0.018*"mooring_mast" + 0.016*"safety" + 0.013*"mast" + 0.013*"idea" + 0.012*"Empire_State" + 0.012*"current" + 0.011*"nature" + 0.010*"cause" + 0.010*"build"


Topic: 4 
Words: 0.021*"mooring_mast" + 0.020*"blimp" + 0.017*"nature" + 0.015*"mast" + 0.013*"law" + 0.012*"problem" + 0.011*"allow" + 0.011*"safety" + 0.011*"cause" + 0.010*"dangerous"


