In [None]:
import re
from ast import literal_eval

import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import common_corpus, common_dictionary
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Load the tab-separated book summaries file (it has no header row) and replace the
# positional column labels 0-6 with descriptive names.
df_cmu = pd.read_csv('booksummaries.txt', sep='\t', header=None).rename(
    columns=dict(enumerate(
        ["id_wikipedia", 'id_freebase', 'title', 'author', 'pub_date', 'genre', 'summary']
    ))
)

In [None]:
# Converting genre into comma-separated string
def extract_genre(d):
    """Convert a raw genre cell into a comma-separated string of genre names.

    Parameters
    ----------
    d : str or NaN
        Text of a dict literal whose values are the genre names, or a missing value.

    Returns
    -------
    str or None
        The dict's values joined with ', ' in dict order ('' for an empty dict),
        or None when `d` is missing.

    Raises
    ------
    ValueError, SyntaxError
        If `d` is not a plain Python literal (raised by `ast.literal_eval`).
    """
    if pd.isna(d):
        return None
    # literal_eval only accepts literals, so text coming from the data file can
    # never execute code the way eval() would.
    genres = literal_eval(d)
    return ', '.join(genres.values())

# Replace each raw genre cell with its comma-separated genre names (None when missing).
df_cmu['genre'] = df_cmu['genre'].apply(extract_genre)

# Drop books missing genre list.
# .copy() makes the filtered result an independent frame, so the column assignments
# in later cells do not trigger pandas' SettingWithCopyWarning.
df_cmu = df_cmu[df_cmu['genre'].notna()].copy()
df_cmu.shape

In [None]:
# Tokenize summary text: each summary string becomes a list of tokens.
# No `df_cmu['summary'][0]` preview here: its result was never displayed, and the label
# lookup `[0]` raises KeyError once the genre filter has removed row 0.
df_cmu['summary_token'] = df_cmu['summary'].apply(word_tokenize)


In [None]:
# Remove stopwords included in the NLTK stopwords dictionary.
# The English list is unpacked into a set so membership checks are hash lookups.
stop_words = {*stopwords.words('english')}

def remove_stopwords(tokens, stopword_set=None):
    """Lowercase tokens, keeping only alphabetic ones that are not stopwords.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    stopword_set : collection of str, optional
        Lowercase words to drop. Defaults to the module-level `stop_words`, which
        is looked up at call time.

    Returns
    -------
    list of str
        Lowercased tokens, in their original order, for which `str.isalpha()` is
        true and whose lowercase form is not in `stopword_set`.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Lowercase each surviving token once, then filter on the lowered form.
    lowered = (token.lower() for token in tokens if token.isalpha())
    return [token for token in lowered if token not in stopword_set]

# Filter every document's token list in place of the unfiltered tokens.
df_cmu['summary_token'] = df_cmu['summary_token'].map(remove_stopwords)


In [None]:
# Lemmatizing tokens: ~1.5 min. runtime for Andrew
def lemmatize(tokens):
    """Lemmatize each token by chaining WordNet lookups over several POS tags.

    Every token is passed through the lemmatizer once per tag, in the order
    'n', 'v', 'a', 'r', 's', with each result feeding the next lookup.

    Returns a new list containing one lemma per input token, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    pos_order = ('n', 'v', 'a', 'r', 's')
    lemmas = []
    for token in tokens:
        lemma = token
        for pos in pos_order:
            lemma = lemmatizer.lemmatize(lemma, pos)
        lemmas.append(lemma)
    return lemmas

# Replace each document's tokens with their lemmas.
df_cmu['summary_token'] = df_cmu['summary_token'].map(lemmatize)

In [None]:
# Export csv: write the cleaned frame without its index column.
# List-valued cells (summary_token) are written as their string form.
df_cmu.to_csv(path_or_buf='cmu_cleaned.csv', index=False)

In [None]:
# NLTK English stopwords as a plain list.
# NOTE(review): this rebinds the `stop_words` set built during preprocessing, and no
# cell below reads it — confirm it is still needed.
stop_words = list(stopwords.words('english'))

In [None]:
def _parse_token_list(cell):
    """Parse a stringified token list such as "['a', 'b']" back into ['a', 'b'].

    An empty list ("[]") or an empty cell yields [] rather than [''], so a document
    without tokens does not contribute a spurious empty-string token.
    """
    inner = cell.strip("[]")
    if not inner:
        return []
    return inner.replace("'", "").split(", ")


# Reload the exported frame, turning the stringified token lists back into lists.
# NOTE(review): no cell above writes a 'filtered_genre' column — confirm the CSV on
# disk has one, otherwise this converter is unused.
df = pd.read_csv(
    'cmu_cleaned.csv',
    converters={'filtered_genre': literal_eval, 'summary_token': _parse_token_list},
)

In [None]:
# Show the first five rows of the reloaded frame.
df.head(n=5)

In [None]:
# One token list per document, as a plain Python list.
summary_tok = df['summary_token'].tolist()

In [None]:
# Build the gensim token <-> id dictionary from all tokenized documents.
corp_dict = corpora.Dictionary(documents=summary_tok)

In [None]:
# Convert every document to its bag-of-words representation via the dictionary.
bow = list(map(corp_dict.doc2bow, summary_tok))

In [None]:
# Coherence scores for the sweep: 10 rows (one per random seed) by 83 columns
# (one per topic count in range(3, 86)), initialised to zero.
coherence_matrix = np.zeros(shape=(10, 83), dtype=float)

In [None]:
# Coherence sweep: fit one LDA model per (topic count, seed) pair and store its c_v
# coherence. Row index is the seed (0-9); column index is the topic count minus 3
# (topic counts 3-85).
for i in range(3, 86):
    for seed in range(10):
        LDA = LdaModel(
            corpus=bow,
            num_topics=i,
            id2word=corp_dict,
            random_state=seed,
            per_word_topics=True,
        )
        coherence = CoherenceModel(
            model=LDA, texts=summary_tok, dictionary=corp_dict, coherence='c_v'
        )
        coherence_matrix[seed, i - 3] = coherence.get_coherence()

In [None]:
# Average over the seed axis (rows), leaving one mean coherence per topic count.
coherence_agg = coherence_matrix.mean(axis=0)

In [None]:
# Report the seed-averaged coherence next to the topic count it belongs to
# (column 0 of the sweep corresponds to 3 topics).
for i, avg in zip(range(3, 86), coherence_agg):
    print('Mean Coherence for', i, 'Topics:', avg)

In [None]:
# Extended single-seed sweep over 46-100 topics, printing the c_v coherence of each model.
# The seed is pinned explicitly. This loop used to read `seed` left over from the
# multi-seed sweep's inner loop (final value 9), so its results depended on which cells
# had been run, and it raised NameError on a kernel where that sweep was skipped.
sweep_seed = 9
for i in range(46, 101):
    LDA = gensim.models.ldamodel.LdaModel(
        bow, i, corp_dict, random_state=sweep_seed, per_word_topics=True
    )
    coherence = CoherenceModel(model=LDA, texts=summary_tok, dictionary=corp_dict, coherence='c_v')
    print('Coherence for', i, 'Topics:', coherence.get_coherence())

In [None]:
# Final model: 56 topics, random seed 27.
# NOTE(review): presumably picked from the coherence sweeps above — confirm and record why.
LDA = LdaModel(
    corpus=bow,
    num_topics=56,
    id2word=corp_dict,
    random_state=27,
    per_word_topics=True,
)

In [None]:
# List topics with their top words. These are gensim's default limits spelled out:
# at most 20 topics and 10 words per topic, so not all 56 topics are shown.
LDA.print_topics(num_topics=20, num_words=10)

In [None]:
# Render the interactive pyLDAvis view of the final model inline in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=LDA, corpus=bow, dictionary=corp_dict)
vis