In [1]:
import pandas as pd
import csv
import re
import warnings
import time 
warnings.filterwarnings('ignore')

In [2]:
file = "big_df.csv"  #large file for production
#file = "small_R_df.csv"       #small file for development
df = pd.read_csv('private/'+file, delimiter='|', encoding='latin-1')

In [3]:
#### stripping HTML tags ####
p = '<.*?>'
for col in df.columns:
    df[col] = df[col].replace(p, '', regex=True)

In [4]:
# splitting categories into main and sub, dropping non-Book main categories #
regex = r"\/(.*?)\/.*"

df['sub_cat'] = df['categoryPath'].str.extract(regex) # sub categories, one level below Books.
df['main_cat'] = df['categoryPath'].astype(str).str[:5] 
df = df[df.main_cat == 'Books'] #dropping non-Book categories

# drop rows missing critical data #
df = df.dropna(axis=0, how='any', subset=['longDn', 'sub_cat','name'])
df.isnull().sum().sort_values(ascending=False)

model           241307
parentItemId      3826
itemId            3375
shortDn            202
main_cat             0
sub_cat              0
catNode              0
longDn               0
categoryPath         0
name                 0
dtype: int64

In [5]:
print("There are " + str(len(df.sub_cat.value_counts())) + " categories of books.")
counts = df['sub_cat'].value_counts()
threshold = 100
print(str(len(counts[counts>threshold])) + " categories have more than " + str(threshold) +" books.")

There are 74 categories of books.
56 categories have more than 100 books.


In [6]:
Top_40_cats = df.sub_cat.value_counts().iloc[:39].index.tolist()

#df.sub_cat.value_counts().iloc[30:40]
print(df.sub_cat.value_counts().iloc[:39])


Business & Money Books                      20751
Children's & Kids' Books                    19802
Medical Books                               19238
History Books                               16723
Education Books                             14108
Computers & Technology Books                12785
Biographies & Memoirs                       10257
Sports & Outdoor Books                      10054
Law Books                                    9635
Political Books                              9273
Self-Help Books                              7097
Health & Wellness Books                      6852
Crafts & Hobbies Books                       6473
Mathematics Books                            6387
Philosophy Books                             6110
Travel Books                                 5810
Language Arts & Disciplines Books            5767
Reference Books                              5670
Foreign Language Study & Reference Books     5547
Teen & Young Adult Books                     5358


In [7]:
# Only keep records for top n populated categories (drop sparsely populated categories)
df = df.loc[df['sub_cat'].isin(Top_40_cats)]

In [77]:
df.sub_cat.value_counts().tail(n=30)

Political Books                             9273
Self-Help Books                             7097
Health & Wellness Books                     6852
Crafts & Hobbies Books                      6473
Mathematics Books                           6387
Philosophy Books                            6110
Travel Books                                5810
Language Arts & Disciplines Books           5767
Reference Books                             5670
Foreign Language Study & Reference Books    5547
Teen & Young Adult Books                    5358
Cookbooks, Food & Wine                      5031
Study Aids & Test Prep Books                3846
Humor Books                                 3491
Comic Books & Graphic Novels                3206
Literature & Fiction Books                  3163
Arts & Entertainment Books                  2570
Religion & Spirituality Books               2515
Psychology & Social Science Books           2336
Science & Nature Books                      1930
Libros en Espanol   

In [8]:
df.sub_cat.value_counts().iloc[:39].index.tolist()

#df.sub_cat.value_counts().iloc[30:40]
print(df.sub_cat.value_counts().iloc[:39])


Business & Money Books                      20751
Children's & Kids' Books                    19802
Medical Books                               19238
History Books                               16723
Education Books                             14108
Computers & Technology Books                12785
Biographies & Memoirs                       10257
Sports & Outdoor Books                      10054
Law Books                                    9635
Political Books                              9273
Self-Help Books                              7097
Health & Wellness Books                      6852
Crafts & Hobbies Books                       6473
Mathematics Books                            6387
Philosophy Books                             6110
Travel Books                                 5810
Language Arts & Disciplines Books            5767
Reference Books                              5670
Foreign Language Study & Reference Books     5547
Teen & Young Adult Books                     5358


In [9]:
#R_data_for_LDA = df
#R_data_for_LDA.to_csv('private/py_export_'+file, sep='|')

# Preparing Corpus and BOW for LDA Topic Modeling with Gensim
#### reference used: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [10]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import nltk; nltk.download('stopwords')
#!python3 -m spacy download en

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [11]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

  """
  """
  """


In [35]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['book'])

In [13]:
print(df.sub_cat.unique())

['Arts & Entertainment Books' "Children's & Kids' Books"
 'Business & Money Books' 'Cookbooks, Food & Wine'
 'Crafts & Hobbies Books' 'Literature & Fiction Books'
 'Religion & Spirituality Books' 'Dieting & Fitness Books'
 'Study Aids & Test Prep Books' 'Health & Wellness Books'
 'Biographies & Memoirs' 'Reference Books' 'Humor Books'
 'Libros en Espanol' 'Computers & Technology Books'
 'Comic Books & Graphic Novels' 'Teen & Young Adult Books' 'Medical Books'
 'Self-Help Books' 'History Books' 'True Crime Books'
 'Psychology & Social Science Books' 'House, Home & Gardening Books'
 'Travel Books' 'Education Books' 'Engineering & Transportation Books'
 'Science & Nature Books' 'Language Arts & Disciplines Books' 'Law Books'
 'Foreign Language Study & Reference Books' 'Mathematics Books'
 'Political Books' 'Philosophy Books' 'Sports & Outdoor Books'
 'Language Arts & Disciplines' 'Business & Money' 'Nonfiction' 'History'
 'Law']


In [32]:
data = df.longDn.values.tolist()

In [33]:
# converting contractions
data = [re.sub("won't","will not", sent) for sent in data]
data = [re.sub("can't","can not", sent) for sent in data]
data = [re.sub("n't","not", sent) for sent in data]
data = [re.sub("\'ll"," will", sent) for sent in data]
data = [re.sub("\'re"," are", sent) for sent in data]
data = [re.sub("\'ve"," have", sent) for sent in data]

# removing any single quotes
data = [re.sub("\'", " ", sent) for sent in data] 
# removing pipes to help with saving csv as pipe-delimited
data = [re.sub("\|", " ", sent) for sent in data]
# leave only letters and numbers (makes above lines redundant, but that's ok)
data = [re.sub("[^a-zA-Z0-9 ]", " ", sent) for sent in data]
# removing any extraneous spaces
data = [re.sub("\s+", " ", sent) for sent in data] #strip extra spaces

  data = [re.sub("\|", " ", sent) for sent in data]
  data = [re.sub("\s+", " ", sent) for sent in data] #strip extra spaces


In [34]:
data[300]

'Robust DC Comics Roster Engage in epic match ups with a massive roster of DC Comics heroes and villains including Batman Cyborg The Flash Harley Quinn Nightwing Solomon Grundy Superman and Wonder Woman Master God Like Powers Unleash each character s unique super powers with individual move sets and environmental interactions Uncontainable Battles Fight through multi tiered fighting arenas by hurling opponents through buildings and launching them off cliffs Original DC Comics Saga Set in Iconic Environments Experience an authentic DC Comics storyline created in collaboration between NetherRealm Studios DC Entertainment and writers Justin Gray and Jimmy Palmiotti Players will discover and do battle in numerous iconic locales pulled from DC Comics lore Destructive Fighter in which the World is a Weapon Experience epic battles on a massive scale as players control the most powerful beings in the universe in destructible interactive environments Destroy elements of the world and use super 

In [36]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [37]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=50) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
#print(trigram_mod[bigram_mod[data_words[4]]])

['dr_seuss', 'wonderfully', 'wise', 'oh', 'the', 'places', 'you', 'll', 'go', 'is', 'the', 'perfect', 'send', 'off', 'for', 'grads', 'from', 'nursery', 'school', 'high', 'school', 'college', 'and', 'beyond', 'from', 'soaring', 'to', 'high', 'heights', 'and', 'seeing', 'great', 'sights', 'to', 'being', 'left', 'in', 'lurch', 'on', 'prickle', 'ly', 'perch', 'dr_seuss', 'addresses', 'life', 'ups', 'and', 'downs', 'with', 'his', 'trademark', 'humorous', 'verse', 'and', 'illustrations', 'while', 'encouraging', 'readers', 'to', 'find', 'the', 'success', 'that', 'lies', 'within', 'in', 'starred_review', 'booklist', 'notes', 'seuss', 'message', 'is', 'simple', 'but', 'never', 'sappy', 'life', 'may', 'be', 'great', 'balancing_act', 'but', 'through', 'it', 'all', 'there', 'fun', 'to', 'be', 'done', 'perennial_favorite', 'and', 'perfect', 'gift', 'for', 'anyone', 'starting', 'new', 'phase', 'in', 'their', 'life']


In [38]:
start = time.time()
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

end = time.time()
print(end-start)

In [39]:
start = time.time()
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#print(data_lemmatized[:4])
end = time.time()
print(end-start)

  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=

  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])


[['garth_brook', 'anthology'], ['perfect', 'gift', 'collector', 'new', 'reader', 'alike', 'present', 'breathtaking', 'special', 'edition', 'boxed_set', 'jharry', 'potter', 'sorcerer_stone', 'harry_potter', 'chamber', 'secret', 'harry_potter', 'prisoner', 'azkaban', 'harry_potter', 'goblet', 'fire', 'harry_potter', 'order', 'phoenix', 'harry_potter', 'half', 'blood', 'prince', 'harry_potter', 'deathly_hallow', 'isbn', 'series', 'harry', 'potterprimary', 'category', 'juvenile', 'fiction', 'fantasy', 'magic'], ['strengthsfinder', 'feature', 'new', 'upgrade', 'version', 'strengthsfinder', 'program', 'main', 'selling', 'point', 'mega', 'bestseller', 'discover', 'strength', 'million_copie', 'sell', 'access', 'new', 'upgrade', 'strengthsfinder', 'program', 'available', 'exclusively', 'opportunity', 'good', 'day', 'chance', 'often', 'natural', 'talent', 'go', 'untapped', 'cradle', 'cubicle', 'devote', 'time', 'fix', 'shortcoming', 'develop', 'strength', 'help', 'people', 'uncover', 'talent', '

In [40]:
start = time.time()

id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
end = time.time()
print(end-start)

In [41]:
#reviewing term frequency
#[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:2]]

# Begin LDA Multicore Model

In [42]:
start = time.time()

lda_multicore_model = gensim.models.ldamulticore.LdaMulticore(~corpus=corpus,
                                           id2word=id2word,
                                           workers=3,
                                           num_topics=40, 
                                           random_state=100,
                                           eval_every=1,   # update_every = how often the model should be updated 
                                           chunksize=5000,  # chunksize = number of docs to be used in each training chunk
                                           passes=2,        # passes = total number of training passes
                                           #alpha='auto',
                                           per_word_topics=True)
end = time.time()
print(end-start)

297.54716086387634


In [45]:
pprint(lda_multicore_model.print_topics())
doc_lda = lda_multicore_model[corpus]

[(10,
  '0.034*"food" + 0.027*"recipe" + 0.013*"make" + 0.012*"eat" + 0.010*"cook" + '
  '0.009*"cookbook" + 0.008*"include" + 0.007*"meal" + 0.007*"easy" + '
  '0.007*"delicious"'),
 (39,
  '0.011*"book" + 0.007*"image" + 0.007*"edition" + 0.007*"classic" + '
  '0.006*"imperfection" + 0.006*"zen" + 0.005*"text" + 0.005*"reproduction" + '
  '0.005*"nail" + 0.005*"quality"'),
 (24,
  '0.007*"jewish" + 0.006*"addiction" + 0.006*"world" + 0.006*"people" + '
  '0.006*"right" + 0.005*"time" + 0.004*"make" + 0.004*"country" + 0.004*"jew" '
  '+ 0.004*"first"'),
 (21,
  '0.023*"game" + 0.008*"play" + 0.004*"make" + 0.004*"young" + 0.004*"world" '
  '+ 0.004*"find" + 0.004*"new" + 0.003*"princess" + 0.003*"royal" + '
  '0.003*"learn"'),
 (15,
  '0.008*"provide" + 0.007*"research" + 0.006*"theory" + 0.006*"use" + '
  '0.005*"system" + 0.005*"law" + 0.005*"health" + 0.005*"issue" + '
  '0.005*"approach" + 0.005*"development"'),
 (38,
  '0.018*"plant" + 0.014*"garden" + 0.011*"collection" + '
  '

In [46]:
start = time.time()
# Compute Perplexity
print('\nPerplexity: ', lda_multicore_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_multicore_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
end = time.time()
print(end-start)


Perplexity:  -8.653923361102978

Coherence Score:  0.5232047439787118


In [29]:
pprint(lda_multicore_model.print_topics())
doc_lda = lda_multicore_model[corpus]

[(15,
  '0.009*"new" + 0.006*"history" + 0.005*"year" + 0.005*"business" + '
  '0.005*"life" + 0.005*"war" + 0.004*"american" + 0.004*"become" + '
  '0.003*"man" + 0.003*"state"'),
 (27,
  '0.020*"la" + 0.010*"para" + 0.008*"que" + 0.007*"con" + 0.006*"como" + '
  '0.005*"por" + 0.005*"los" + 0.004*"una" + 0.003*"sobre" + 0.003*"new"'),
 (34,
  '0.006*"horse" + 0.005*"map" + 0.005*"time" + 0.005*"new" + 0.004*"volume" + '
  '0.004*"book" + 0.004*"write" + 0.004*"great" + 0.004*"city" + '
  '0.003*"world"'),
 (39,
  '0.008*"work" + 0.007*"new" + 0.007*"include" + 0.006*"guide" + '
  '0.006*"imperfection" + 0.006*"publish" + 0.005*"part" + 0.005*"use" + '
  '0.005*"may" + 0.005*"print"'),
 (22,
  '0.006*"know" + 0.006*"new" + 0.005*"get" + 0.005*"game" + 0.004*"time" + '
  '0.004*"life" + 0.004*"world" + 0.004*"include" + 0.003*"team" + '
  '0.003*"adventure"'),
 (13,
  '0.015*"health" + 0.007*"game" + 0.006*"author" + 0.005*"include" + '
  '0.004*"help" + 0.004*"care" + 0.004*"first" + 

In [48]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_multicore_model, corpus, id2word)
vis

  nbits = re.compile('(\d+)bit').search(abits).group(1)
  "\s+stepping\s+(?P<STP>\d+)", re.IGNORECASE)


In [None]:
- intentional fail to prevent subsequent code to run -
fail.

# Begin LDA Model

In [43]:

start = time.time()
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=40, 
                                           random_state=100,
                                           #update_every=1,   # update_every = how often the model should be updated 
                                           chunksize=5000,    # chunksize = number of docs to be used in each training chunk
                                           passes=2,
                                           #alpha='auto',
                                           per_word_topics=True)
end = time.time()
print(end-start)


1530414258.3731391
1530415127.724409
869.3512699604034


In [44]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(16,
  '0.054*"die" + 0.053*"und" + 0.050*"der" + 0.010*"den" + 0.010*"eine" + '
  '0.010*"das" + 0.010*"fur" + 0.009*"ist" + 0.008*"auf" + 0.008*"be"'),
 (13,
  '0.016*"student" + 0.015*"include" + 0.014*"research" + 0.014*"chapter" + '
  '0.014*"study" + 0.012*"theory" + 0.011*"science" + 0.011*"text" + '
  '0.011*"provide" + 0.010*"analysis"'),
 (0,
  '0.026*"book" + 0.023*"imperfection" + 0.020*"may" + 0.019*"work" + '
  '0.017*"original" + 0.017*"historical" + 0.016*"classic" + 0.016*"find" + '
  '0.016*"rare" + 0.016*"remain"'),
 (30,
  '0.025*"life" + 0.010*"make" + 0.009*"time" + 0.009*"people" + 0.008*"know" '
  '+ 0.008*"live" + 0.008*"get" + 0.007*"take" + 0.007*"way" + 0.007*"year"'),
 (5,
  '0.016*"french" + 0.013*"part" + 0.012*"africa" + 0.012*"travel" + '
  '0.012*"europe" + 0.011*"france" + 0.010*"museum" + 0.010*"publish" + '
  '0.010*"japanese" + 0.010*"country"'),
 (34,
  '0.085*"work" + 0.020*"edition" + 0.019*"important" + 0.018*"part" + '
  '0.017*"english" + 0.

In [47]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.030667364663111

Coherence Score:  0.5307975222715445


In [49]:
start = time.time()
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

end = time.time()
print(end-start)
vis

# LDA Multicore model approach #2, fewer categories

In [69]:
start = time.time()
lda_model_2 = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30, 
                                           random_state=100,
                                           eval_every=1,      # update_every = how often the model should be updated 
                                           chunksize=10000,    # chunksize = number of docs to be used in each training chunk
                                           passes=2,
                                           per_word_topics=True)
end = time.time()
print(end-start)

204.76681232452393


In [70]:
# Compute Perplexity
print('\nPerplexity: ', lda_model_2.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model_2, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.883864614819519

Coherence Score:  0.4452825313034552


In [71]:
start = time.time()
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_2, corpus, id2word)

end = time.time()
print(end-start)
vis

464.9169855117798


# Begin LDA Mallet

In [None]:
from gensim.models.wrappers import LdaMallet

In [64]:
start = time.time()
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = "mallet-2.0.8/bin/" # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word, workers=3)
end = time.time()
print(end-start)

CalledProcessError: Command 'mallet-2.0.8/bin import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input C:\Users\BS\AppData\Local\Temp\421d0c_corpus.txt --output C:\Users\BS\AppData\Local\Temp\421d0c_corpus.mallet' returned non-zero exit status 1.

In [None]:
start = time.time()
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
end = time.time()
print(end-start)

In [73]:
#redo of LDA model

start = time.time()
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=40, 
                                           random_state=100,
                                           #update_every=1,   # update_every = how often the model should be updated 
                                           chunksize=5000,    # chunksize = number of docs to be used in each training chunk
                                           passes=2,
                                           #alpha='auto',
                                           per_word_topics=True)


pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

end = time.time()
print(end-start)
vis

789.8060405254364


In [74]:
#redo of LDA multicore model. Should be same as above

start = time.time()
lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=40, 
                                           random_state=100,
                                           #update_every=1,   # update_every = how often the model should be updated 
                                           chunksize=5000,    # chunksize = number of docs to be used in each training chunk
                                           passes=2,
                                           #alpha='auto',
                                           per_word_topics=True)


pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

end = time.time()
print(end-start)
vis

794.335693359375


In [75]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=20, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

start_time = time.time()
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=30, limit=45, step=3)

end = time.time()
print(end-start_time)

# Show graph
limit=45; start=30; step=3;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

end = time.time()
print(end-start)

CalledProcessError: Command 'mallet-2.0.8/bin import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input C:\Users\BS\AppData\Local\Temp\5555ca_corpus.txt --output C:\Users\BS\AppData\Local\Temp\5555ca_corpus.mallet' returned non-zero exit status 1.