In [1]:
import os
import pickle
import re
from pprint import pprint

import gensim
import gensim.corpora as corpora
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
import spacy
import tqdm
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

In [2]:
# Root of the shared OneDrive folder. Set the ONE_DRIVE_PATH environment variable to
# point at the folder on another machine; the fallback is the original local path.
# Keep the trailing slash: later cells build file paths by plain string concatenation.
one_drive_path = os.environ.get(
    "ONE_DRIVE_PATH",
    "C:/Users/cfowle/The Estée Lauder Companies Inc/TeamAnis - General/",
)

In [3]:
# Load the sampled cosmetics ratings-and-reviews extract from the shared folder
rr = pd.read_csv(
    "".join([one_drive_path, "Data/Ratings and Reviews/reviews_cosmetics_sample.csv"])
)

In [7]:
# Preview the loaded reviews table
rr

Unnamed: 0,onlinepost_id,source_product_identifier,onlinestatement_id,date,title,description,geography,channel,product_id,rating,...,skin_tone,gender,i_shop_at_macys.com,make-up_style,purchase_location,cons,pros,describe_yourself,reviewer_skin_type,age
0,OnlinePost_20191027_184110681,pimprod2006667,OnlineStatement_20191027_184110681_5,2019-09-06,Nice!,I am very pleased!,USA,Ulta,Product_20191016_5443258,4.0,...,,,,,,,Light spritz. Makeup stays put,,,
1,OnlinePost_20191027_184748557,pimprod2006667,OnlineStatement_20191027_184748557_5,2019-09-07,. I would recommended if in budget.,However I wish it got rid of the powdery look ...,USA,Ulta,Product_20191016_5443258,4.0,...,,,,,,,,,,
2,OnlinePost_20191027_184112758,pimprod2006667,OnlineStatement_20191027_184112758_1,2019-09-13,Love it!,Love it!,USA,Ulta,Product_20191016_5443258,5.0,...,,,,,,,,,,
3,OnlinePost_20191027_183920306,pimprod2006667,OnlineStatement_20191027_183920306_4,2019-09-27,Perfect Product!,It helps my makeup lady all day!,USA,Ulta,Product_20191016_5443258,5.0,...,,,,,,,Easy application. Lightweight. Smell. Straight...,Female. Late 20s,,
4,OnlinePost_20191027_184386195,pimprod2006667,OnlineStatement_20191027_184386195_1,2019-09-19,AmaIng!,AmaIng!,USA,Ulta,Product_20191016_5443258,5.0,...,,,,,,,Minimize pores,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,OnlinePost_20191121_185487494,P415232,OnlineStatement_20191121_185487494_3,2019-10-10,Endless colors,A must for any beauty lover.,USA,Sephora,Product_20191016_5505315,5.0,...,Fair,,,,,,,,Combination,
9996,OnlinePost_20191121_186408908,3509378,OnlineStatement_20191121_186408908_3,2019-10-16,THE BEST,The large brush makes my eyelashes huge withou...,USA,Bloomingdales,Product_20191016_5167644,5.0,...,,,,,,,,,,
9997,OnlinePost_20191121_185599358,pimprod2007828,OnlineStatement_20191121_185599358_2,2019-10-25,good but others are better,I was not impressed with it.,USA,Ulta,Product_20191016_5443498,3.0,...,,,,,,,,,,
9998,OnlinePost_20191121_192800293,3324554,OnlineStatement_20191121_192800293_1,2019-10-24,Happy with my purchase,Happy with my purchase.,USA,Macy's,Product_20191016_4962760,4.0,...,,,,,,,,,,


In [9]:
# Use the `description` column of the reviews table as the document collection
documents = rr["description"]

In [10]:
# Strip commas, periods, exclamation marks and question marks, then lower-case.
# The pattern is a raw string: in a plain literal '\.' is an invalid escape
# sequence, which makes Python emit a warning when the cell is compiled.
documents = documents.map(lambda x: re.sub(r'[,\.!?]', '', x))
documents = documents.map(lambda x: x.lower())

  documents = documents.map(lambda x: re.sub('[,\.!?]', '', x))


In [11]:
def sent_to_words(sentences):
    """Lazily tokenize each item of `sentences`.

    Every item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess``; one token list is yielded per input item.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_text in sentences:
        # deacc=True strips accent marks from the tokens
        yield tokenize(str(raw_text), deacc=True)

# Materialize the cleaned reviews as a plain list, then tokenize every review
data = documents.values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]

In [13]:
# Phrase detection: learn bigrams from the tokenized reviews, then learn trigrams
# from the bigram-transformed token stream. A higher threshold gives fewer phrases.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)

# Wrap each trained model in a Phraser for faster phrase application
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(phrase_model) for phrase_model in (bigram, trigram)
)

In [14]:
# NLTK's English stop words plus a few extra tokens to drop
stop_words = [*stopwords.words('english'), 'from', 'subject', 're', 'edu', 'use']

In [15]:
def remove_stopwords(texts):
    """Re-tokenize every document and drop tokens found in `stop_words`.

    Each element of `texts` is converted with ``str()`` and tokenized again by
    ``simple_preprocess``; tokens present in the notebook-level `stop_words`
    are removed.

    Returns a list with one token list per input document, in input order.
    """
    # Build the lookup set once per call: `stop_words` is a list, so testing
    # membership against it directly rescans the list for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

In [16]:
def make_bigrams(texts):
    """Apply the notebook-level `bigram_mod` to each document and return the results as a list."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

In [17]:
def make_trigrams(texts):
    """Apply `bigram_mod` and then `trigram_mod` to each document, returning a list."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

In [18]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of token lists
        Each token list is joined with single spaces and processed by the
        notebook-level spaCy pipeline `nlp`.
    allowed_postags : iterable of str
        Values of ``token.pos_`` to keep. The default is an immutable tuple so
        it cannot be modified between calls.

    Returns
    -------
    list of list of str
        One list of lemmas (``token.lemma_``) per input document, in input order.
    """
    allowed = set(allowed_postags)
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a batched stream instead of one
    # pipeline call per document; it yields the docs in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_docs)
    ]

In [19]:
# Load the small English spaCy pipeline with the parser and NER components
# disabled; it is used by lemmatization() below.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Step 1: drop stop words from the tokenized reviews
data_words_nostops = remove_stopwords(data_words)

# Step 2: merge detected bigrams into single tokens
data_words_bigrams = make_bigrams(data_words_nostops)

# Step 3: lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Show up to the first 30 lemmas of the first document
print(data_lemmatized[0][:30])

['pleased']


In [20]:
# Map every lemma to an integer id
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts to vectorize
texts = data_lemmatized

# Bag-of-words representation: one doc2bow result per document
corpus = list(map(id2word.doc2bow, texts))

# Show up to the first 30 entries of the first document's bag of words
print(corpus[0][:30])

[(0, 1)]


In [29]:
# Build LDA model
# Baseline fit: 8 topics over the bag-of-words corpus, with alpha/eta not passed
# explicitly and random_state pinned to 100.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=8, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [30]:
# Pretty-print the weighted top terms of each topic
pprint(lda_model.print_topics())
# Apply the fitted model to the corpus
doc_lda = lda_model[corpus]

[(0,
  '0.107*"good" + 0.082*"powder" + 0.063*"set" + 0.047*"use" + 0.036*"spray" + '
  '0.026*"ever" + 0.018*"try" + 0.018*"well" + 0.016*"primer" + 0.014*"bad"'),
 (1,
  '0.084*"skin" + 0.047*"look" + 0.037*"make" + 0.035*"really" + 0.032*"feel" '
  '+ 0.025*"nice" + 0.023*"smooth" + 0.022*"oily" + 0.021*"smell" + '
  '0.021*"great"'),
 (2,
  '0.062*"apply" + 0.042*"easy" + 0.036*"amazing" + 0.028*"part" + '
  '0.026*"review" + 0.025*"perfect" + 0.023*"promotion" + 0.023*"collect" + '
  '0.021*"natural" + 0.020*"brush"'),
 (3,
  '0.032*"look" + 0.032*"dry" + 0.029*"makeup" + 0.029*"give" + 0.025*"think" '
  '+ 0.022*"skin" + 0.020*"even" + 0.019*"product" + 0.018*"keep" + '
  '0.015*"finish"'),
 (4,
  '0.075*"lip" + 0.042*"great" + 0.025*"use" + 0.016*"dry" + 0.015*"come" + '
  '0.012*"worth" + 0.012*"night" + 0.011*"work" + 0.011*"lipstick" + '
  '0.010*"time"'),
 (5,
  '0.156*"product" + 0.133*"love" + 0.055*"buy" + 0.050*"would" + '
  '0.033*"definitely" + 0.029*"recommend" + 0.02

In [31]:
# Score the baseline model with the c_v coherence measure
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.3266834498182955


In [33]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    """Fit an LDA model for one hyper-parameter setting and return its c_v coherence.

    Parameters
    ----------
    corpus : bag-of-words corpus passed to ``LdaMulticore``.
    dictionary : id-to-word mapping used for both the LDA model and the
        coherence computation.
    k : number of topics.
    a : value forwarded as ``alpha``.
    b : value forwarded as ``eta``.

    Returns
    -------
    The result of ``CoherenceModel.get_coherence()`` for the fitted model.

    Notes
    -----
    The tokenized texts used for the coherence score are read from the
    notebook-level ``data_lemmatized``.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    # Score with the same dictionary the model was trained on. Previously the
    # notebook-global id2word was used here and the `dictionary` argument was ignored.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [34]:
grid = {'Validation_Set': {}}

# Candidate topic counts: min_topics up to (but excluding) max_topics
min_topics, max_topics, step_size = 2, 11, 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha candidates: numeric values from np.arange plus the two named priors
alpha = [*np.arange(0.01, 1, 0.3), 'symmetric', 'asymmetric']

# Beta candidates: the same numeric values plus the symmetric prior
beta = [*np.arange(0.01, 1, 0.3), 'symmetric']

# Validation sets
num_of_docs = len(corpus)
# Only the full corpus is evaluated; the partial-corpus variants are left disabled:
#   gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25)
#   gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5)
#   gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75)
corpus_sets = [corpus]

corpus_title = ['100% Corpus']

# One result column per key; every column starts as its own empty list
model_results = {
    column: []
    for column in ('Validation_Set', 'Topics', 'Alpha', 'Beta', 'Coherence')
}

# Can take a long time to run (the recorded run needed about 1h13m for 270 fits).
# Set RUN_TUNING to False to skip the sweep.
RUN_TUNING = True
if RUN_TUNING:
    total_runs = len(beta) * len(alpha) * len(topics_range) * len(corpus_title)

    # The context manager closes the progress bar even if a fit raises part-way through
    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through validation corpuses
        for i, corpus_set in enumerate(corpus_sets):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                                      k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(corpus_title[i])
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)

    # Persist the whole sweep once every combination has been scored
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

100%|██████████| 270/270 [1:13:28<00:00, 16.33s/it]


In [37]:
# Rank the tuning runs by coherence in ascending order, so the best-scoring
# configurations appear at the bottom of the table
pd.DataFrame(model_results).sort_values("Coherence")

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
15,100% Corpus,2,0.91,0.01,0.160536
21,100% Corpus,2,symmetric,0.31,0.160643
23,100% Corpus,2,symmetric,0.91,0.162406
3,100% Corpus,2,0.01,0.91,0.163633
24,100% Corpus,2,symmetric,symmetric,0.168174
...,...,...,...,...,...
178,100% Corpus,7,asymmetric,0.91,0.413328
267,100% Corpus,10,asymmetric,0.61,0.416818
268,100% Corpus,10,asymmetric,0.91,0.419163
238,100% Corpus,9,asymmetric,0.91,0.449889


In [40]:
# Build LDA model
# Refit with an asymmetric alpha and eta=0.91.
# NOTE(review): the best row of the tuning table is Topics=9 (asymmetric, 0.91),
# while this cell uses num_topics=8 — confirm that 8 is intended.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=8,
                                       alpha = "asymmetric",
                                       eta = 0.91,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [41]:
# Score the refitted model with the c_v coherence measure
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.46000253950949693


In [42]:
# Enable pyLDAvis rendering inside the notebook
pyLDAvis.enable_notebook()
# Prepare the visualization from the current lda_model, the corpus and the dictionary
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Last expression of the cell, so the prepared visualization is displayed
LDAvis_prepared