# Topic Modeling using LDA

In [1]:
# # To be run only once
# if 0 == 1:
#     !pip install gensim
#     !pip install PyLDAvis
#     !pip install spacy
#     !python -m spacy download en_core_web_sm

In [19]:
import pandas as pd
import numpy as np
import pickle


import re
import spacy
import tqdm

import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import gensim.corpora as corpora

from nltk.corpus import stopwords
import nltk
import en_core_web_sm

from pprint import pprint

import pyLDAvis.gensim
import pickle 
import pyLDAvis

import time
from collections import Counter
import random

In [21]:
# Stop words as a hashed set so that `token in stop_words` is a constant-time
# lookup. A frozenset is used instead of a Counter because the visible cells
# only ever test membership; the per-word counts were never read.
stop_words = frozenset(stopwords.words('english')) | {'from'}

# Load the small English spaCy pipeline without the parser and NER components;
# preprocessing only reads each token's POS tag and lemma.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Seed Python's RNG so the random document sample drawn later is reproducible.
random.seed(3)

In [4]:
# nltk.download('stopwords')

## Preprocessing data

In [5]:
start = time.time()

In [6]:
# one_drive_path = "C:/Users/cfowle/The Estée Lauder Companies Inc/TeamAnis - General/"
# NOTE(review): hard-coded, user-specific absolute path. No visible cell reads
# `one_drive_path`; consider deriving it from configuration or removing it.
one_drive_path = "C:/Users/asaid/The Estée Lauder Companies Inc/TeamAnis - General/"

In [7]:
df = pd.read_pickle('reviews_concat.pkl')

In [8]:
# df = df.sample(100, random_state=3)
# del df

In [9]:
# `stop_words` and `nlp` come from the configuration cell near the top of the
# notebook (right after the imports). They are deliberately not rebuilt here,
# so the stop-word hash is built once and the spaCy model is loaded only once.

def preprocess(sentences, stop_words, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lazily turn raw texts into lists of lemmas ready for topic modeling.

    Each text is tokenized with gensim's ``simple_preprocess`` (which lower-cases
    and, with ``deacc=True``, strips accents), stop words are dropped, and the
    surviving tokens are re-joined and run through the module-level spaCy
    pipeline ``nlp``. Only lemmas whose ``pos_`` tag is in ``allowed_postags``
    are kept.

    Args:
        sentences: Iterable of raw documents; each item is passed through ``str()``.
        stop_words: Container supporting ``in``; tokens found in it are removed.
        allowed_postags: Iterable of spaCy ``pos_`` values to keep.

    Yields:
        list[str]: The retained lemmas of one input document, in input order.
    """
    # Hash the tags once so the per-token membership test is constant time.
    allowed = frozenset(allowed_postags)

    # Generator: stop-word filtering stays lazy and feeds spaCy as a stream.
    cleaned_texts = (
        ' '.join(
            token
            for token in gensim.utils.simple_preprocess(str(sentence), deacc=True)
            if token not in stop_words
        )
        for sentence in sentences
    )

    # nlp.pipe processes the stream in batches and yields Doc objects in input
    # order, which avoids the per-call overhead of invoking nlp() once per text.
    for doc in nlp.pipe(cleaned_texts):
        yield [token.lemma_ for token in doc if token.pos_ in allowed]

In [10]:
# Materialize the lazy preprocessing generator into a list of token lists.
# Passing total= lets tqdm show a percentage and an ETA instead of a bare
# iteration counter, which matters for a run that takes hours on the full data.
docs = list(
    tqdm.tqdm(
        preprocess(df.values.tolist(), stop_words),
        total=len(df),
        position=0,
        leave=True,
    )
)

5131380it [3:00:24, 474.04it/s]


In [18]:
len(docs)

5131380

## Validating Model

In [61]:
# Inputs
# Hyper-parameter validation runs on a random subset of the documents.
# NOTE: `docs` is deliberately rebound to the sample because the coherence
# computations in later cells pass `docs` as the texts that match `corpus`.
# Re-running this cell therefore re-samples from the already reduced list.
VALIDATION_SAMPLE_SIZE = 100000
# min() keeps random.sample from raising ValueError on smaller datasets.
docs = random.sample(docs, min(VALIDATION_SAMPLE_SIZE, len(docs)))

# Create Dictionary
id2word = corpora.Dictionary(docs)
# Persist the dictionary; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open("id2word.p", "wb") as dictionary_file:
    pickle.dump(id2word, dictionary_file)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in docs]

In [24]:
# supporting function
def compute_coherence_values(corpus, dictionary, text, k, a, b):
    """Train an LDA model for one hyper-parameter combination and score it.

    Args:
        corpus: Bag-of-words corpus the model is trained on.
        dictionary: gensim dictionary used both as the model's ``id2word``
            mapping and for the coherence computation.
        text: Tokenized documents handed to ``CoherenceModel`` as ``texts``.
        k: Number of topics.
        a: Value passed to the model as ``alpha``.
        b: Value passed to the model as ``eta``.

    Returns:
        The ``c_v`` coherence score of the trained model.
    """
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )

    # Score with the dictionary that was passed in. Reading the global
    # `id2word` here would silently ignore the `dictionary` argument whenever
    # a caller supplies a different one.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=text,
        dictionary=dictionary,
        coherence='c_v',
    )

    return coherence_model_lda.get_coherence()

In [33]:
# Grid search over LDA hyper-parameters, scored by c_v coherence.
# One model is trained per combination, so this can take a long time to run.
RUN_GRID_SEARCH = True

# Not read by any visible cell; kept so existing references keep working.
grid = {}
grid['Validation_Set'] = {}

nb_words = len(id2word)

# Topics range
min_topics = 6
max_topics = 10
step_size = 1
# topics_range = range(min_topics, max_topics, step_size)
topics_range = [8]

# Alpha candidates shared by every topic count.
# The topic-dependent candidate 50/k is added per k inside the loop.
alpha = [
#     0.1,
#     'symmetric',
#     'asymmetric'
]

# Beta parameter
beta = [
    0.1,
#     200/nb_words
    'symmetric',
]

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [
#     gensim.utils.ClippedCorpus(corpus, num_of_docs*0.05),
#     gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
#     gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75),
    corpus
]

corpus_title = [
#     '25% Corpus',
#     '50% Corpus',
#     '75% Corpus',
    '100% Corpus'
]
assert len(corpus_sets) == len(corpus_title), "each validation corpus needs a title"

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

if RUN_GRID_SEARCH:
    total_runs = len(corpus_sets) * len(topics_range) * (len(alpha) + 1) * len(beta)
    # The context manager closes the progress bar even if a training run fails.
    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through validation corpuses
        for corpus_set, title in zip(corpus_sets, corpus_title):
            # iterate through number of topics
            for k in topics_range:
                # Build a fresh candidate list for this k. Appending 50/k to the
                # shared `alpha` list would carry the priors of earlier topic
                # counts (and corpus sets) into later iterations and run more
                # models than the progress total accounts for.
                alphas_for_k = alpha + [50 / k]
                # iterate through alpha values
                for a in alphas_for_k:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                                      text=docs, k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(title)
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)
                        pbar.update(1)

    # Best combination first; later cells read row 0 as the winner.
    res = pd.DataFrame(model_results).sort_values("Coherence", ascending=False)
    res.to_csv('lda_tuning_results.csv', index=False)







  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A[A[A





 50%|█████████████████████████████████████████▌                                         | 1/2 [03:52<03:52, 232.60s/it][A[A[A[A[A[A





100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [07:55<00:00, 237.63s/it][A[A[A[A[A[A


In [34]:
res

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,100% Corpus,8,6.25,0.1,0.533193
1,100% Corpus,8,6.25,symmetric,0.531085


## Training Best Model

In [35]:
# `res` is sorted by coherence in descending order, so its first row holds the
# best-scoring hyper-parameter combination.
best_param = res.iloc[0]
num_topics, alpha, eta = (best_param[field] for field in ('Topics', 'Alpha', 'Beta'))

In [36]:
# Build LDA model
# Build LDA model
# Fit on `corpus` with the selected num_topics / alpha / eta and print the
# elapsed wall-clock seconds.
int_start = time.time()
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    alpha=alpha,
    eta=eta,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)
print(time.time() - int_start)

214.14255166053772


In [37]:
# Score the trained model with the c_v coherence measure, using the tokenized
# `docs` as reference texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=docs,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.522543142845403


In [38]:
# Switch pyLDAvis to notebook mode before preparing the visualization.
pyLDAvis.enable_notebook()
# NOTE(review): pyLDAvis 3.x renamed the `pyLDAvis.gensim` module to
# `pyLDAvis.gensim_models` — confirm which version this environment uses.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare last expression so Jupyter displays the prepared visualization.
LDAvis_prepared

In [39]:
lda_model.save('lda_test.model')

In [40]:
print(time.time()-start)

31247.86803984642


## Predictions

In [None]:
# df = pd.read_pickle('reviews_concat.pkl')

In [None]:
# df = df.loc['2019']

In [None]:
# print(len(df))

In [None]:
# docs = list(tqdm.tqdm(preprocess(df.values.tolist(), stop_words), position=0, leave=True))

In [None]:
# pickle.dump(docs, open( "docs.p", "wb" ) )

In [None]:
# Reload the cached tokenized documents and the gensim dictionary from disk.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# load files that were produced by this notebook or another trusted source.
# The context managers make sure both file handles are closed after reading.
with open("docs.p", "rb") as docs_file:
    docs = pickle.load(docs_file)
with open("id2word.p", "rb") as dictionary_file:
    id2word = pickle.load(dictionary_file)

In [None]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in docs]

In [15]:
# lda_model = gensim.models.LdaModel.load('lda.model')

In [69]:
# Per-document topic probabilities as a dense array with one row per document
# and one column per topic. num_terms pins the number of topic rows to
# num_topics: get_document_topics drops topics below its probability
# threshold, so without it the height would be inferred from the largest topic
# id that happens to appear and could fall short of the column names below.
topic_matrix = gensim.matutils.corpus2csc(
    lda_model.get_document_topics(corpus),
    num_terms=num_topics,
).T.toarray()
topic_columns = ['topic_' + str(i) for i in range(1, num_topics + 1)]

# NOTE(review): assumes `corpus` has one entry per row of `df`, in the same
# order — confirm before trusting the alignment of the joined columns.
# ignore_index=True is intentionally not passed: with axis=1 it relabels the
# columns 0..n-1 and would throw away 'description' and the topic names.
output = pd.concat(
    [
        df.to_frame('description').reset_index(),
        pd.DataFrame(topic_matrix, columns=topic_columns),
    ],
    axis=1,
)

In [None]:
# Preview of the per-document topic probabilities on their own.
# num_terms pins the matrix to num_topics rows so its shape always matches the
# column names, even if some topic never clears the probability threshold.
pd.DataFrame(
    gensim.matutils.corpus2csc(
        lda_model.get_document_topics(corpus),
        num_terms=num_topics,
    ).T.toarray(),
    columns=['topic_' + str(i) for i in range(1, num_topics + 1)],
)

In [71]:
df.to_frame('description').reset_index()

Unnamed: 0,date,onlinepost_id,description
0,2014-12-01,OnlinePost_20191027_181650262,It's lightweight yet hides my imperfections.. ...
1,2014-12-01,OnlinePost_20191027_181666571,"Even with oily skin, I prefer the look of this..."
2,2014-12-01,OnlinePost_20191027_181666940,I wear foundation nearly every day & this is p...
3,2014-12-01,OnlinePost_20191027_181694251,i'm a mascara junky…. i really like this masca...
4,2014-12-01,OnlinePost_20191027_181695799,I received this as a sample in my GWP and love...
...,...,...,...
5131375,2020-04-30,OnlinePost_20200524_ff7e18ee-3f03-4bfe-bf9a-98...,"- expensive cuz its just a moisturizer, could ..."
5131376,2020-04-30,OnlinePost_20200524_ff907024-361c-4bc4-959e-0b...,I appreciate that this cleanser is very moistu...
5131377,2020-04-30,OnlinePost_20200524_ffadec77-5361-4a6b-9134-83...,It is sticky once applied but this doesn't bot...
5131378,2020-04-30,OnlinePost_20200524_ffcde3c4-2f75-4431-ba64-2e...,The pump isnt excessive either so you definite...


In [None]:
output.to_pickle('reviews_w_topics_test.pkl')

## Appendix

In [None]:
# # Build the bigram and trigram models
# bigram = gensim.models.Phrases(data_words, min_count=5, threshold=150) # higher threshold fewer phrases.
# # trigram = gensim.models.Phrases(bigram[data_words], threshold=150)

# # Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod = gensim.models.phrases.Phraser(bigram)
# # trigram_mod = gensim.models.phrases.Phraser(trigram)

# def remove_stopwords(texts):
#     return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# def make_bigrams(texts):
#     return [bigram_mod[doc] for doc in texts]

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

# def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent)) 
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return texts_out

# # Remove Stop Words
# data_words = remove_stopwords(data_words)

# # Do lemmatization keeping only noun, adj, vb, adv
# data_words = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# # Form Bigrams
# data_words = make_bigrams(data_words)

In [None]:
# # Build LDA model
# lda_model = gensim.models.LdaMulticore(corpus=corpus,
#                                        id2word=id2word,
#                                        num_topics=8, 
#                                        random_state=100,
#                                        chunksize=100,
#                                        passes=10,
#                                        per_word_topics=True)

In [None]:
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

In [None]:
# coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('Coherence Score: ', coherence_lda)