# Topic Modeling using LDA

In [51]:
# # To be run only once
# if 0 == 1:
#     !pip install gensim
#     !pip install PyLDAvis
#     !pip install spacy
#     !python -m spacy download en_core_web_sm

In [52]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt

import re
import spacy
import tqdm

import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import gensim.corpora as corpora

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import en_core_web_sm

from pprint import pprint

import pyLDAvis.gensim
import pickle 
import pyLDAvis

import time
from collections import Counter
import random

In [53]:
random.seed(3)

In [54]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

In [55]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asaid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocessing data

In [56]:
start = time.time()

In [57]:
# one_drive_path = "C:/Users/cfowle/The Estée Lauder Companies Inc/TeamAnis - General/"
one_drive_path = "C:/Users/asaid/The Estée Lauder Companies Inc/TeamAnis - General/"

In [58]:
# reviews = pd.read_pickle('reviews.pickle')

# cols = ['type', 'onlinepost_id', 'source_product_identifier', 'onlinestatement_id',
#        'date', 'title', 'description', 'geography', 'channel', 'product_id',
#        'rating', 'sentiment']
# reviews = reviews[cols]

# reviews.loc[reviews['type']=='Cosmetics','type']='temps'
# reviews.loc[reviews['type']=='Skincare','type']='Cosmetics'
# reviews.loc[reviews['type']=='temps','type']='Skincare'

# reviews = reviews[reviews['geography']=='USA']
# reviews.drop(columns=['geography'], inplace=True)

# reviews.to_pickle('reviews_filtered.pickle')

In [59]:
# reviews = pd.read_pickle('reviews_filtered.pickle')

In [60]:
# # Creating date columns in the right dtype and dropping the day of the month: 2019-02-24 => 2019-02-01
# reviews.loc[:,'clean_date'] = pd.to_datetime(reviews['date'], errors='coerce')
# if reviews['clean_date'].isna().sum() > 0:
#     print('{} rows have been dropped because the date format is wrong.'.format(reviews['clean_date'].isna().sum()))
#     display(reviews.loc[reviews['clean_date'].isna(), 'date'])
#     reviews = reviews.dropna(subset='date')
# reviews['date'] = reviews['clean_date']  
# reviews = reviews.drop('clean_date', axis=1)
# reviews['date'] = reviews['date'].dt.to_period('m')

# # Checking for missing data (NA => -1)
# if reviews['rating'].isna().sum()>0:
#     print('{} rows are missing ratings'.format(reviews['rating'].isna().sum()))
#     reviews.loc[:,'rating'] = reviews['rating'].fillna(-1).astype(int)

# if reviews['sentiment'].isna().sum()>0:
#     print('{} rows are missing sentiments'.format(reviews['sentiment'].isna().sum()))
#     reviews.loc[:,'sentiment'] = reviews['sentiment'].fillna(-1).astype(int)
    
# # Transforming rating and sentiment to dummy variables (one-hot encoding)
# reviews.loc[:,'sentiment'] = reviews['sentiment'].str.lower()
# reviews.loc[:,'rating'] = reviews['rating'].astype(int)
# reviews = pd.concat([reviews, pd.get_dummies(data=reviews[['rating','sentiment']], columns=['rating','sentiment'], dtype=int)], axis=1)

# # Readding NAs data to ratings
# reviews.loc[reviews['rating']==-1,'rating'] = np.nan
# reviews.loc[reviews['sentiment']==-1,'sentiment'] = np.nan

# # Transforming sentiment to integer data (positive: 1, neutral: 0, negative: -1)
# reviews.loc[:,'sentiment'] = reviews['sentiment_positive'] - reviews['sentiment_negative']

# # Aggregating RR data by OnlinePost_ID

# # Creating a column to count the number of statements per review once aggregation happens
# reviews['nb_statements'] = reviews['sentiment']

# reviews = reviews.groupby(['type',
#                            'channel',
#                            'source_product_identifier',  
#                            'date',
#                            'onlinepost_id']).agg({
#     'description': lambda x:'. '.join(list(x)),
#     'nb_statements':'count',
#     'rating':'first',
#     'rating_1':'first',
#     'rating_2':'first',
#     'rating_3':'first',
#     'rating_4':'first',
#     'rating_5':'first',
#     'sentiment_negative':'sum',
#     'sentiment_neutral':'sum',
#     'sentiment_positive':'sum',
#     'sentiment':'mean'
# }).reset_index()

# # Normalize the one hot sentiment encoding counts (sentiment_negative, sentiment_neutral, sentiment_positive) by the nb_statement.
# reviews[['sentiment_negative', 'sentiment_neutral', 'sentiment_positive']] = reviews[['sentiment_negative', 'sentiment_neutral', 'sentiment_positive']].div(reviews['nb_statements'], axis=0)

# reviews.to_pickle('reviews_wrangled.pickle')

In [61]:
reviews = pd.read_pickle('reviews_wrangled.pickle')

In [62]:
# Build the stop-word list: NLTK English stop words + 'from' + every word that
# appears in a brand name or catalogue category (product vocabulary would
# otherwise dominate the topics).
stop_words = stopwords.words('english')
stop_words.extend(['from'])

brands = pd.read_csv('elc_brands.csv', encoding='ISO-8859-1')
catalogue = pd.read_csv('elc_catalogue.csv', encoding='ISO-8859-1')

product_terms = brands['ELC_Brand'].unique().tolist()
for catalogue_column in ['Major_Category', 'Application', 'Category', 'SubCategory']:
    product_terms.extend(catalogue[catalogue_column].unique().tolist())

# Convert to str and drop missing values *before* joining: str.join raises
# TypeError on NaN / numeric cells, which the CSVs may contain.
product_words = ' '.join(
    str(term) for term in product_terms if pd.notna(term)
).replace('/', ' ').split()
stop_words.extend({word.lower() for word in product_words})

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Run the stop words through the same normalisation + lemmatisation as the
# documents so that they can be compared lemma-to-lemma in preprocess().
stop_words_nlp = nlp(' '.join(
    ' '.join(gensim.utils.simple_preprocess(str(word), deacc=True)) for word in stop_words
))
stop_words = [token.lemma_ for token in stop_words_nlp]


# Hash-based container (lemma -> occurrence count) for O(1) membership tests
stop_words = Counter(stop_words)


def preprocess(sentences, stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lazily convert raw texts into lists of lemmas.

    Each text is tokenised with ``gensim.utils.simple_preprocess`` (``deacc=True``),
    re-joined with spaces and passed through the notebook-level spaCy pipeline
    ``nlp``. A token is kept when its coarse POS tag is in ``allowed_postags``
    and its lemma is not in ``stop_words``.

    Args:
        sentences: iterable of raw texts; every item is passed through ``str()``.
        stop_words: container of lemma strings to drop (only ``in`` is used).
        allowed_postags: POS tags to keep. The default list is only read,
            never mutated.

    Yields:
        One list of lemma strings per input text, in input order.
    """
    for sentence in sentences:
        normalised = ' '.join(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        doc = nlp(normalised)
        # Test the lemma *string*: a spaCy Token object is not a str, so
        # `token in stop_words` never matched and no stop word was removed.
        yield [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and token.lemma_ not in stop_words
        ]

In [None]:
reviews['tokens'] = list(tqdm.tqdm(preprocess(reviews['description'].values.tolist(), stop_words), position=0, leave=True, total=len(reviews)))

 14%|█████████▌                                                            | 631921/4636608 [25:38<2:59:26, 371.96it/s]

In [None]:
reviews.to_pickle('reviews_wrangled_w_tokens.pickle')

In [None]:
reviews = pd.read_pickle('reviews_wrangled_w_tokens.pickle')

In [None]:
rp = pd.read_csv('reviews_products.csv', low_memory=False)
products = rp[['type', 'channel', 'source_product_identifier', 'product', 'brand_abbrev', 'elc_brand',
       'brand_score', 'item_description', 'product_score', 'itemid_4',
       'major_category_id', 'major_category', 'application_id', 'application',
       'category_id', 'category', 'sub_category_id', 'sub_category']].drop_duplicates()
products = products.dropna(subset=['elc_brand'], axis=0)

In [None]:
reviews  = reviews.merge(products, left_on=['type', 'channel', 'source_product_identifier'], right_on=['type', 'channel', 'source_product_identifier'], how='inner')

In [None]:
reviews.to_pickle('reviews_products.pickle')

In [None]:
# reviews = pd.read_pickle('reviews_products.pickle')

In [None]:
reviews

## Using class

In [None]:
class TopicModeller():
    """Grid-search, train and apply a gensim LDA topic model on tokenised documents.

    Workflow: ``train_valid_lda()`` selects hyperparameters by c_v coherence on
    a random sample and trains the final model; ``get_docs_topics()`` scores
    every document; ``view()`` re-renders the pyLDAvis panel.
    """

    def __init__(self, df, column):
        """Store the frame and extract the documents to model.

        Args:
            df: DataFrame holding the documents; kept with a fresh positional index.
            column: name of the column whose values are the token lists.
        """
        # A clean 0..n-1 index lets get_docs_topics() concat topic columns row by row.
        self.df = df.reset_index(drop=True)
        self.column = column
        self.docs = df[column].values.tolist()

    def compute_coherence_values(self, corpus, id2word, text, k, a, b):
        """Train one LDA model and return its c_v coherence.

        Args:
            corpus: bag-of-words corpus to train on.
            id2word: gensim dictionary matching ``corpus``.
            text: tokenised documents used by the coherence measure.
            k: number of topics.
            a: value passed as ``alpha``.
            b: value passed as ``eta``.

        Returns:
            The value of ``CoherenceModel.get_coherence()`` for the trained model.
        """
        lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=k,
                                               random_state=3,
                                               chunksize=100,
                                               passes=10,
                                               alpha=a,
                                               eta=b)

        coherence_model_lda = CoherenceModel(model=lda_model, texts=text, dictionary=id2word, coherence='c_v')

        return coherence_model_lda.get_coherence()

    def train_valid_lda(self,
                        nb_samples: int = 100000,
                        topics_range: list = [3,5,7,9],
                        alpha: list = [0.1, 0.01, 'symmetric','asymmetric'],
                        beta: list = [0.1, 'symmetric']
                        ):
        """Grid-search LDA hyperparameters, then train and persist the best model.

        A random sample of at most ``nb_samples`` documents (drawn with the
        global ``random`` state) is used both for the grid search and for the
        final model. Results are sorted by coherence and written to
        ``lda_tuning_results.csv``. The best model, its dictionary and its
        pyLDAvis page are saved under names starting with ``cv_best_``.

        Args:
            nb_samples: maximum number of documents to sample.
            topics_range: candidate topic counts.
            alpha: candidate ``alpha`` values.
            beta: candidate ``eta`` values.
            (The default lists are only read, never mutated.)

        Returns:
            Tuple ``(lda_model, id2word)``.

        Raises:
            ValueError: if any of the three grids is empty.
        """
        # Fail early with a clear message instead of an IndexError on
        # cv_results.iloc[0] after the sampling work has been done.
        if min(len(topics_range), len(alpha), len(beta)) == 0:
            raise ValueError('topics_range, alpha and beta must each contain at least one value.')

        nb_samples = min(nb_samples, len(self.docs))

        # Training set
        train = random.sample(self.docs, nb_samples)

        # Create Dictionary
        self.id2word = corpora.Dictionary(train)

        # Term Document Frequency
        self.corpus = [self.id2word.doc2bow(text) for text in train]

        # Result dictionary
        self.model_results = {'Topics': [],
                              'Alpha': [],
                              'Beta': [],
                              'Coherence': []
                              }

        # Can take a long time to run
        print('Validating hyperparameters...')
        nb_runs = len(beta) * len(alpha) * len(topics_range)
        # Context manager closes the bar even if a training run raises.
        with tqdm.tqdm(total=nb_runs, position=0, leave=True) as pbar:
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        cv = self.compute_coherence_values(corpus=self.corpus, id2word=self.id2word, text=train,
                                                           k=k, a=a, b=b)
                        # Save the model results
                        self.model_results['Topics'].append(k)
                        self.model_results['Alpha'].append(a)
                        self.model_results['Beta'].append(b)
                        self.model_results['Coherence'].append(cv)
                        print('nb topics: {}, alpha: {}, beta: {}, coherence: {}'.format(k,a,b,cv))
                        pbar.update(1)
        self.cv_results = pd.DataFrame(self.model_results).sort_values("Coherence", ascending=False)
        self.cv_results.to_csv('lda_tuning_results.csv', index=False)

        self.best_param = self.cv_results.iloc[0]
        # The row may carry the count as a float / NumPy scalar; cast to a plain int.
        self.best_num_topics = int(self.best_param['Topics'])
        self.best_alpha = self.best_param['Alpha']
        self.best_beta = self.best_param['Beta']
        # Common stem of every artefact written below.
        tag = '_'.join(['cv_best', str(self.best_alpha), str(self.best_beta), str(self.best_num_topics)])

        # Build LDA model
        print('Training best model...')
        self.lda_model = gensim.models.LdaMulticore(corpus=self.corpus,
                                                    id2word=self.id2word,
                                                    num_topics=self.best_num_topics,
                                                    alpha=self.best_alpha,
                                                    eta=self.best_beta,
                                                    random_state=3,
                                                    chunksize=100,
                                                    passes=10,
                                                    per_word_topics=True)
        print('Done.')
        pyLDAvis.enable_notebook()
        self.LDAvis_prepared = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.id2word)
        display(self.LDAvis_prepared)
        pyLDAvis.save_html(self.LDAvis_prepared, tag + '.html')
        self.lda_model.save(tag)
        # `with` guarantees the pickle is flushed and the handle closed.
        with open("id2word_" + tag + ".pickle", "wb") as id2word_file:
            pickle.dump(self.id2word, id2word_file)
        return self.lda_model, self.id2word

    def get_docs_topics(self):
        """Score every document and return ``self.df`` with one column per topic.

        Requires ``train_valid_lda()`` to have been run first (uses
        ``self.id2word``, ``self.lda_model`` and ``self.best_num_topics``).

        Returns:
            DataFrame: ``self.df`` plus columns ``topic_1`` .. ``topic_<k>``;
            also stored in ``self.output``.
        """
        # Term Document Frequency
        corpus = [self.id2word.doc2bow(text) for text in self.docs]
        doc_topics = self.lda_model.get_document_topics(corpus)
        # Fix the matrix width explicitly: without num_terms, corpus2csc infers
        # it from the highest topic id seen, which can be smaller than the
        # number of topics and would not match the column labels below.
        topic_matrix = gensim.matutils.corpus2csc(doc_topics, num_terms=self.best_num_topics).T.toarray()
        topic_columns = ['topic_' + str(i) for i in range(1, self.best_num_topics + 1)]
        self.output = pd.concat([self.df, pd.DataFrame(topic_matrix, columns=topic_columns)], axis=1)
        return self.output

    def view(self):
        """Rebuild and display the pyLDAvis panel for the trained model."""
        pyLDAvis.enable_notebook()
        self.LDAvis_prepared = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.id2word)
        # Display the attribute just computed; the bare name `LDAvis_prepared`
        # is not defined in this method.
        display(self.LDAvis_prepared)

In [None]:
all_lda = TopicModeller(reviews, 'tokens')
lda, id2word = all_lda.train_valid_lda()

In [None]:
def train_display_save(tokens, nb_samples, num_topics, alpha, eta, name):
    """Train one LDA model on a sample of ``tokens``, display it and save it.

    Files written (``<tag>`` is ``name_alpha_eta_num_topics``): ``<tag>.html``
    (pyLDAvis page), ``<tag>`` (gensim model) and ``id2word_<tag>.p``
    (pickled dictionary). The training time in seconds is printed.

    Args:
        tokens: pandas Series of token lists.
        nb_samples: maximum number of documents to sample (``random_state=3``).
        num_topics: number of LDA topics.
        alpha: value passed as ``alpha`` to ``LdaMulticore``.
        eta: value passed as ``eta`` to ``LdaMulticore``.
        name: prefix of the output file names.

    Returns:
        Tuple ``(lda_model, id2word)``.
    """
    nb_samples = min(nb_samples, len(tokens))

    # Training set
    train = tokens.sample(nb_samples, random_state=3).values.tolist()

    # Create Dictionary
    id2word = corpora.Dictionary(train)

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in train]

    training_start = time.time()
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           alpha=alpha,
                                           eta=eta,
                                           random_state=3,
                                           chunksize=100,
                                           passes=10,
                                           per_word_topics=True)
    pyLDAvis.enable_notebook()
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    display(LDAvis_prepared)

    # Common stem of the three artefacts.
    tag = name + '_' + str(alpha) + '_' + str(eta) + '_' + str(num_topics)
    pyLDAvis.save_html(LDAvis_prepared, tag + '.html')
    lda_model.save(tag)
    # `with` guarantees the pickle is flushed and the handle closed.
    with open("id2word_" + tag + ".p", "wb") as id2word_file:
        pickle.dump(id2word, id2word_file)
    print(time.time() - training_start)
    return lda_model, id2word

In [None]:
# lda, id2word = train_display_save(reviews['tokens'], 500000, 8, alpha='asymmetric', eta='symmetric', name='full')

In [None]:
docs = reviews['tokens'].values.tolist()

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in docs]

In [None]:
# Pass num_terms so the matrix always has one column per topic; otherwise
# corpus2csc infers the width from the highest topic id that appears.
topics = gensim.matutils.corpus2csc(lda.get_document_topics(corpus), num_terms=lda.num_topics).T.toarray()

In [None]:
topics = pd.DataFrame(topics, columns=['topic_'+str(i) for i in range(1,topics.shape[1]+1)])

In [None]:
output = pd.concat([reviews, topics], axis=1)

In [None]:
output.to_pickle('reviews_w_topics.pkl')

In [None]:
topics = lda.print_topics(num_words=6)
for topic in topics:
    print('Topic #{}'.format(topic[0]+1))
    print('Terms: ', topic[1])

In [None]:
train_display_save(reviews['tokens'], 100000, 10, alpha='asymmetric', eta='symmetric', name='full')

In [None]:
train_display_save(reviews['tokens'], 100000, 9, alpha='asymmetric', eta='symmetric', name='full')

In [None]:
train_display_save(reviews['tokens'], 100000, 8, alpha='asymmetric', eta='symmetric', name='full')

In [None]:
train_display_save(reviews['tokens'], 100000, 7, alpha='asymmetric', eta='symmetric', name='full')

In [None]:
train_display_save(reviews['tokens'], 100000, 6, alpha='asymmetric', eta='symmetric', name='full')

In [None]:
train_display_save(reviews['tokens'], 100000, 5, alpha='asymmetric', eta='symmetric', name='full')

In [None]:
train_display_save(reviews['tokens'], 100000, 4, alpha='asymmetric', eta=0.1, name='full')

In [None]:
train_display_save(reviews['tokens'], 100000, 3, alpha='asymmetric', eta=0.1, name='full')

In [None]:
# NOTE(review): the original referenced `ldamodel`, which is never defined in
# this notebook. `lda` (returned by all_lda.train_valid_lda()) is assumed to be
# the intended model; confirm.
topics = lda.print_topics(num_words=4)
for topic in topics:
    print(topic)

## Splitting by review

In [None]:
bad_reviews['tokens'].sample(100000)

In [None]:
good_reviews_6 = train_display_save(tokens=good_reviews['tokens'], nb_samples=100000, num_topics=6, alpha='asymmetric', eta=0.1, name='good_reviews')

In [None]:
bad_reviews_6 = train_display_save(tokens=bad_reviews['tokens'], nb_samples=100000, num_topics=6, alpha='asymmetric', eta=0.1, name='bad_reviews')

In [None]:
neutral_reviews_6 = train_display_save(tokens=neutral_reviews['tokens'], nb_samples=100000, num_topics=6, alpha='asymmetric', eta=0.1, name='neutral_reviews')

In [None]:
def save_display(self, title):
    """Write the pyLDAvis page of a fitted modeller to ``<title>.html``.

    Args:
        self: object exposing ``lda_model``, ``corpus`` and ``id2word``
            (this is a plain function, called as ``save_display(obj, title)``).
        title: output file name without the ``.html`` extension.
    """
    prepared_view = pyLDAvis.gensim.prepare(
        self.lda_model,
        self.corpus,
        self.id2word,
    )
    html_path = title+'.html'
    pyLDAvis.save_html(prepared_view, html_path)

In [None]:
save_display(good_lda, 'good_reviews_8_topics')

In [None]:
save_display(bad_lda, 'bad_reviews_8_topics')

In [None]:
save_display(neutral_lda, 'neutral_reviews_8_topics')

In [None]:
good_reviews = reviews[reviews['rating']>=4]
neutral_reviews = reviews[(reviews['rating']>2) & (reviews['rating']<4)]
bad_reviews = reviews[reviews['rating']<=2]

In [None]:
print(len(good_reviews))
good_lda = TopicModeller(good_reviews, 'tokens')
good_lda.train_valid_lda()

In [None]:
display(good_lda.cv_results)

In [None]:
good_lda.view()

In [None]:
print(len(bad_reviews))
bad_lda = TopicModeller(bad_reviews, 'tokens')
bad_lda.train_valid_lda()
bad_lda.view()

In [None]:
display(bad_lda.cv_results)

In [None]:
bad_lda.view()

In [None]:
print(len(neutral_reviews))
neutral_lda = TopicModeller(neutral_reviews, 'tokens')
neutral_lda.train_valid_lda()
neutral_lda.view()

In [None]:
display(neutral_lda.cv_results)

In [None]:
neutral_lda.view()

In [None]:
good_reviews = good_lda.get_docs_topics()

In [None]:
neutral_reviews = neutral_lda.get_docs_topics()

In [None]:
bad_reviews = bad_lda.get_docs_topics()

In [None]:
good_reviews.to_pickle('good_reviews.pickle')
bad_reviews.to_pickle('bad_reviews.pickle')
neutral_reviews.to_pickle('neutral_reviews.pickle')

In [None]:
good_lda.cv_results

In [None]:
good_reviews_3 = train_display_save(tokens=good_reviews['tokens'], nb_samples=100000, num_topics=3, alpha='asymmetric', eta=0.1, name='good_reviews')
bad_reviews_3 = train_display_save(tokens=bad_reviews['tokens'], nb_samples=100000, num_topics=3, alpha='asymmetric', eta=0.1, name='bad_reviews')
neutral_reviews_3 = train_display_save(tokens=neutral_reviews['tokens'], nb_samples=100000, num_topics=3, alpha='asymmetric', eta='symmetric', name='neutral_reviews')

In [None]:
int_start=time.time()
# Inputs: sample at most 100000 documents. random.sample raises ValueError
# when asked for more items than the population holds, so clamp the size.
docs_good = good_reviews['tokens'].values.tolist()
tokens_good = random.sample(docs_good, min(100000, len(docs_good)))
# Create Dictionary
id2word_good = corpora.Dictionary(tokens_good)
# Term Document Frequency
corpus_good = [id2word_good.doc2bow(text) for text in tokens_good]
print(time.time()-int_start)

# Build LDA model
int_start=time.time()
lda_model_good = gensim.models.LdaMulticore(corpus=corpus_good,
                                       id2word=id2word_good,
                                       num_topics=3,
                                       alpha = 'asymmetric',
                                       eta = 0.1,
                                       random_state=3,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
print(time.time()-int_start)

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_good, corpus_good, id2word_good)
LDAvis_prepared

In [None]:
pyLDAvis.gensim.prepare(lda_model_good, corpus_good, id2word_good)

In [None]:
bad_lda.cv_results

In [None]:
int_start=time.time()
# Inputs: sample at most 100000 documents. random.sample raises ValueError
# when asked for more items than the population holds, so clamp the size.
docs_bad = bad_reviews['tokens'].values.tolist()
tokens_bad = random.sample(docs_bad, min(100000, len(docs_bad)))
# Create Dictionary
id2word_bad = corpora.Dictionary(tokens_bad)
# Term Document Frequency
corpus_bad = [id2word_bad.doc2bow(text) for text in tokens_bad]
print(time.time()-int_start)

# Build LDA model
int_start=time.time()
lda_model_bad = gensim.models.LdaMulticore(corpus=corpus_bad,
                                       id2word=id2word_bad,
                                       num_topics=3,
                                       alpha = 'asymmetric',
                                       eta = 0.1,
                                       random_state=3,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
print(time.time()-int_start)

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_bad, corpus_bad, id2word_bad)
LDAvis_prepared

In [None]:
int_start=time.time()
# Inputs: sample at most 100000 documents. random.sample raises ValueError
# when asked for more items than the population holds, so clamp the size.
docs_neutral = neutral_reviews['tokens'].values.tolist()
tokens_neutral = random.sample(docs_neutral, min(100000, len(docs_neutral)))
# Create Dictionary
id2word_neutral = corpora.Dictionary(tokens_neutral)
# Term Document Frequency
corpus_neutral = [id2word_neutral.doc2bow(text) for text in tokens_neutral]
print(time.time()-int_start)

# Build LDA model
int_start=time.time()
lda_model_neutral = gensim.models.LdaMulticore(corpus=corpus_neutral,
                                       id2word=id2word_neutral,
                                       num_topics=3,
                                       alpha = 'asymmetric',
                                       eta = 'symmetric',
                                       random_state=3,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
print(time.time()-int_start)

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_neutral, corpus_neutral, id2word_neutral)
LDAvis_prepared

## NLP Preprocessing

In [None]:
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# `with` closes the file handle once the documents are loaded.
with open("docs.pickle", "rb") as docs_file:
    docs = pickle.load(docs_file)

d = corpora.Dictionary(docs)

# Document frequency of every token (d.dfs maps token id -> number of documents), most frequent first
freq = pd.DataFrame(d.dfs.values(), index=d.dfs.keys(), columns=['freq'])
freq.index.name = 'idx'
freq = freq.reset_index()
freq['token'] = freq['idx'].apply(lambda x:d[x])
freq = freq.sort_values(by='freq', ascending=False)

In [None]:
freq[freq['freq']>500]

## Validating Model

In [None]:
# Inputs: keep at most 100000 documents (random.sample raises ValueError when
# asked for more items than exist). This rebinds `docs`, so re-running the
# cell samples from the previous sample.
docs = random.sample(docs, min(100000, len(docs)))

# Create Dictionary
id2word = corpora.Dictionary(docs)
# `with` guarantees the pickle is flushed and the handle closed.
with open("id2word.p", "wb") as id2word_file:
    pickle.dump(id2word, id2word_file)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in docs]

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, text, k, a, b):
    """Train one LDA model and return its c_v coherence.

    Args:
        corpus: bag-of-words corpus to train on.
        dictionary: gensim dictionary matching ``corpus``.
        text: tokenised documents used by the coherence measure.
        k: number of topics.
        a: value passed as ``alpha``.
        b: value passed as ``eta``.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    # Use the dictionary that was passed in; the original read the notebook
    # global `id2word` here and silently ignored the `dictionary` argument.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=text, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
grid = {}
grid['Validation_Set'] = {}

nb_words = len(id2word)

# Topics range
min_topics = 6
max_topics = 10
step_size = 1
# topics_range = range(min_topics, max_topics, step_size)
topics_range = [8]

# Alpha: fixed candidates; 50/k is added for each topic count k inside the loop
alpha = [
#     0.1,
#     'symmetric',
#     'asymmetric'
]

# Beta parameter
beta = [
    0.1,
#     200/nb_words
]
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [
#     gensim.utils.ClippedCorpus(corpus, num_of_docs*0.05),
#     gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
#     gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75),
    corpus  # was `self.corpus`, which raises NameError outside a class
]

corpus_title = [
#     '25% Corpus',
#     '50% Corpus',
#     '75% Corpus',
    '100% Corpus'
]

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
nb_runs = len(corpus_sets) * len(topics_range) * (len(alpha) + 1) * len(beta)
pbar = tqdm.tqdm(total=nb_runs)
# iterate through validation corpuses
for i in range(len(corpus_sets)):
    # iterate through number of topics
    for k in topics_range:
        # Build a fresh list per k: appending 50/k to `alpha` itself would make
        # every later k (and every later corpus set) also test earlier 50/k values.
        alphas_for_k = alpha + [50/k]
        # iterate through alpha values
        for a in alphas_for_k:
            # iterate through beta values
            for b in beta:
                # get the coherence score for the given parameters
                cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, text=docs,
                                              k=k, a=a, b=b)
                # Save the model results
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                pbar.update(1)
pbar.close()

# Best configuration first
res = pd.DataFrame(model_results).sort_values("Coherence", ascending=False)
res.to_csv('lda_tuning_results.csv', index=False)

In [None]:
res

## Training Best Model

In [None]:
# `res` is sorted by coherence (descending), so the first row is the best configuration
best_param = res.iloc[0]
# Cast to a plain int, as TopicModeller.train_valid_lda does: a value pulled
# out of a DataFrame row may be a float or NumPy scalar.
num_topics = int(best_param['Topics'])
alpha = best_param['Alpha']
eta = best_param['Beta']

In [None]:
# Build LDA model
int_start=time.time()
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       alpha = alpha,
                                       eta = eta,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
print(time.time()-int_start)

In [None]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

In [None]:
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [None]:
lda_model.save('lda_test.model')

In [None]:
print(time.time()-start)

## Predictions

In [None]:
# df = pd.read_pickle('reviews_concat.pkl')

In [None]:
# df = df.loc['2019']

In [None]:
# print(len(df))

In [None]:
# docs = list(tqdm.tqdm(preprocess(df.values.tolist(), stop_words), position=0, leave=True))

In [None]:
# pickle.dump(docs, open( "docs.p", "wb" ) )

In [None]:
# docs = pickle.load(open("docs.p", "rb"))
# id2word = pickle.load(open("id2word.p", "rb"))

# # Term Document Frequency
# corpus = [id2word.doc2bow(text) for text in docs]

# output = pd.concat([df.to_frame('description').reset_index(), pd.DataFrame(gensim.matutils.corpus2csc(lda_model.get_document_topics(corpus)).T.toarray(), columns=['topic_'+str(i) for i in range(1,num_topics+1)])], axis=1, ignore_index=True)

# output.to_pickle('reviews_w_topics_test.pkl')

## Appendix

In [None]:
# # Build the bigram and trigram models
# bigram = gensim.models.Phrases(data_words, min_count=5, threshold=150) # higher threshold fewer phrases.
# # trigram = gensim.models.Phrases(bigram[data_words], threshold=150)

# # Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod = gensim.models.phrases.Phraser(bigram)
# # trigram_mod = gensim.models.phrases.Phraser(trigram)

# def remove_stopwords(texts):
#     return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# def make_bigrams(texts):
#     return [bigram_mod[doc] for doc in texts]

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

# def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent)) 
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return texts_out

# # Remove Stop Words
# data_words = remove_stopwords(data_words)

# # Do lemmatization keeping only noun, adj, vb, adv
# data_words = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# # Form Bigrams
# data_words = make_bigrams(data_words)

In [None]:
# # Build LDA model
# lda_model = gensim.models.LdaMulticore(corpus=corpus,
#                                        id2word=id2word,
#                                        num_topics=8, 
#                                        random_state=100,
#                                        chunksize=100,
#                                        passes=10,
#                                        per_word_topics=True)

In [None]:
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

In [None]:
# coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('Coherence Score: ', coherence_lda)