In [None]:
import json
import re
import numpy as np
import pandas as pd
from pprint import pprint
from nltk.corpus import stopwords

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import corpora, models, similarities
# spacy for lemmatization
import spacy
! python -m spacy download en

# Removing Stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.test.utils import datapath

In [None]:
# Turn the name "lda_1_percent" into a path with gensim's `datapath` test helper,
# then load the LdaModel that was previously saved at that path.
# NOTE(review): `datapath` points into gensim's own test-data directory, so the
# saved model has to live there — confirm this is the intended location.
temp_file = datapath("lda_1_percent")
model =  models.LdaModel.load(temp_file)

In [None]:
# Show the repr of the model loaded in the previous cell.
model

In [None]:
# Read 'review_extracted.csv' (path relative to the working directory) into a
# DataFrame and preview its first rows.
df = pd.read_csv('review_extracted.csv')
df.head()

In [None]:
# Draw a 5% random sample of the rows; the fixed seed keeps the sample
# identical across runs. Then report how many rows were drawn.
df_sampled = df.sample(frac=0.05, random_state=1)
print(df_sampled.shape[0])

In [None]:
# Start from NLTK's English stop-word list and append a few extra words
# in place; extend the literal below to filter out additional words.
stop_words = stopwords.words('english')
stop_words += ['from', 'subject', 'use']

# Extract the sampled 'text' column as a plain Python list and report its length.
df_text_list = df_sampled['text'].tolist()
print(len(df_text_list))

In [None]:
# Collapse every run of whitespace (newlines, tabs, repeated spaces) into a
# single space. Each entry is cast to str first so non-string values (e.g. NaN
# read from the CSV) do not break `re.sub`.
# The pattern is a raw string: '\s' inside a plain literal is an invalid escape
# sequence, which Python reports as a Deprecation/SyntaxWarning.
data = [re.sub(r'\s+', ' ', str(sent)) for sent in df_text_list]

# Drop single-quote characters so contractions stay one token ("don't" -> "dont").
# A literal character removal needs no regex.
data = [sent.replace("'", "") for sent in data]

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize an iterable of documents.

    Each item is cast to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent removal
    enabled). One token list is yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(doc), deacc=True)
        for doc in sentences
    )

In [None]:
# Tokenize every cleaned document; `sent_to_words` is a generator, so wrap it
# in list() to materialize one token list per document.
data_words = list(sent_to_words(data))

In [None]:
# Sanity check: the first document's tokens, followed by its original text.
print(data_words[0], df_text_list[0], sep='\n')

In [None]:
# Fit the phrase detectors. The trigram detector is trained on the corpus after
# the bigram detector has been applied to it. A higher `threshold` yields fewer
# detected phrases.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)

# Wrap each fitted detector in a Phraser, the faster form used to transform documents.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return each document with stop words removed.

    Every document is converted with ``str(doc)`` and re-tokenized through
    ``simple_preprocess`` before filtering, so the output tokens are whatever
    ``simple_preprocess`` produces from that string.

    Args:
        texts: iterable of documents (token lists in this notebook).

    Returns:
        A list with one filtered token list per input document.
    """
    # `stop_words` is a list; build a set once so each membership test is O(1)
    # instead of a linear scan per token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the fitted bigram phraser (`bigram_mod`) to each document.

    Returns a list holding one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply the bigram phraser, then the trigram phraser, to each document.

    Returns a list holding one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with single spaces and run through the
    notebook-level spaCy pipeline `nlp`; the lemma of every token whose
    ``pos_`` is in `allowed_postags` is kept.
    POS tag reference: https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists.
        allowed_postags: POS tags to retain.

    Returns:
        A list with one list of lemmas per input document.
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]

In [None]:
# Step 1: filter stop words out of the tokenized documents.
data_words_nostops = remove_stopwords(data_words)

# Step 2: merge detected bigrams into single tokens.
data_words_bigrams = make_bigrams(data_words_nostops)

# Step 3: load the spaCy 'en' pipeline with the parser and NER components
# disabled (they are not needed here, so skipping them saves time).
# Install the model with: python3 -m spacy download en
# NOTE(review): the 'en' shortcut name depends on the installed spaCy version — confirm.
nlp = spacy.load(
    'en',
    disable=['parser', 'ner'],
)

# Step 4: lemmatize, retaining only nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

In [None]:
# Preview the lemmatized tokens of the first document.
data_lemmatized[0]

In [None]:
# Build the token <-> id mapping from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

# `texts` is the name the coherence/perplexity cells use for the tokenized corpus.
texts = data_lemmatized

# Bag-of-words corpus: one `doc2bow` result per document.
corpus = list(map(id2word.doc2bow, texts))

In [None]:
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Path to the Mallet launcher, passed to the LdaMallet wrapper in `compute_coherence_values`.
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# NOTE(review): this is a machine-specific absolute path; it must be edited on
# every other machine before the coherence sweep can run.
mallet_path = '/home/ubuntu/Desktop/Mallet/bin/mallet' # update this path

In [None]:
def compute_coherence_values(dictionary, corpus, texts, start,stop,step):
    """Train one LDA-Mallet model per topic count and score each with c_v coherence.

    Topic counts are taken from ``range(start, stop, step)``. The Mallet
    launcher location is read from the notebook-level `mallet_path`.

    Args:
        dictionary: gensim id-to-word mapping used for both training and scoring.
        corpus: bag-of-words corpus to train on.
        texts: tokenized documents, passed to ``CoherenceModel``.
        start, stop, step: bounds of the topic-count sweep (``stop`` exclusive).

    Returns:
        ``(model_list, coherence_values)``: the trained models and their
        coherence scores, both in sweep order.

    NOTE(review): ``gensim.models.wrappers`` is not available in gensim 4.x —
    confirm the gensim version this notebook is pinned to.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        print('Calculating {}-topic model'.format(num_topics))
        # Train with the `dictionary` argument, not the notebook-global `id2word`,
        # so training and coherence scoring are guaranteed to share one vocabulary.
        mallet_model = gensim.models.wrappers.LdaMallet(
            mallet_path,
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
        )
        model_list.append(mallet_model)
        coherencemodel = CoherenceModel(
            model=mallet_model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
def compute_perplexity_values(dictionary, corpus, texts, start,stop,step):
    """Train one gensim LdaModel per topic count and record its log-perplexity.

    Topic counts are taken from ``range(start, stop, step)``. Each model is
    evaluated on the same `corpus` it was trained on.

    Args:
        dictionary: gensim id-to-word mapping used for training.
        corpus: bag-of-words corpus used for training and evaluation.
        texts: unused; kept so the signature mirrors `compute_coherence_values`.
        start, stop, step: bounds of the topic-count sweep (``stop`` exclusive).

    Returns:
        ``(model_list, perplexity_values)``: the trained models and the values of
        ``LdaModel.log_perplexity(corpus)``, both in sweep order. gensim documents
        that value as a per-word likelihood bound, not the perplexity itself.
    """
    perplexity_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        print('Calculating {}-topic model'.format(num_topics))
        # Train with the `dictionary` argument rather than the notebook-global `id2word`.
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=dictionary,
                                                    num_topics=num_topics,
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=10,
                                                    alpha='auto',
                                                    per_word_topics=True)
        # Store the model trained in this iteration (previously the unrelated
        # notebook-global `model` was appended on every pass).
        model_list.append(lda_model)
        perplexity_values.append(lda_model.log_perplexity(corpus))

    return model_list, perplexity_values

In [None]:
# Sweep topic counts 9, 15, ..., 57 and collect one Mallet model and one
# coherence score per count.
start, stop, step = 9, 58, 6
model_list, coherence_values = compute_coherence_values(
    dictionary=id2word,
    corpus=corpus,
    texts=texts,
    start=start,
    stop=stop,
    step=step,
)

In [None]:
# Plot the coherence score against the number of topics.
# The x-axis is derived from the same start/stop/step that drove the sweep, so
# the plot cannot drift out of sync if those values are edited.
x = range(start, stop, step)
# Sanity check: both lengths must match or plt.plot raises.
print(len(x))
print(len(coherence_values))
plt.figure(figsize=(10, 8))
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.show()

In [None]:
# Repeat the sweep over topic counts 9, 15, ..., 57 with gensim's LdaModel,
# collecting one log-perplexity value per count.
# Note: this rebinds `model_list`, replacing the list from the coherence sweep.
start, stop, step = 9, 58, 6
model_list, perplexity_values = compute_perplexity_values(
    dictionary=id2word,
    corpus=corpus,
    texts=texts,
    start=start,
    stop=stop,
    step=step,
)

In [None]:
# Plot the recorded log-perplexity values against the number of topics.
# The x-axis is derived from the same start/stop/step that drove the sweep, so
# the plot cannot drift out of sync if those values are edited.
x = range(start, stop, step)
# Sanity check: both lengths must match or plt.plot raises.
print(len(x))
print(len(perplexity_values))
plt.figure(figsize=(10, 8))
plt.plot(x, perplexity_values)
plt.xlabel("Num Topics")
plt.ylabel("Perplexity score")
plt.show()