# Topic Modeling using LDA

In [1]:
# # To be run only once
# if 0 == 1:
#     !pip install gensim
#     !pip install PyLDAvis
#     !pip install spacy
#     !python -m spacy download en_core_web_sm

In [2]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt

import re
import spacy
import tqdm

import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import gensim.corpora as corpora

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import en_core_web_sm

from pprint import pprint

import pyLDAvis.gensim
import pickle 
import pyLDAvis

import time
from collections import Counter
import random

In [3]:
random.seed(3)

In [4]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asaid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocessing data

In [6]:
start = time.time()

In [7]:
# one_drive_path = "C:/Users/cfowle/The Estée Lauder Companies Inc/TeamAnis - General/"
one_drive_path = "C:/Users/asaid/The Estée Lauder Companies Inc/TeamAnis - General/"

In [8]:
# reviews = pd.read_pickle('reviews.pickle')

# cols = ['type', 'onlinepost_id', 'source_product_identifier', 'onlinestatement_id',
#        'date', 'title', 'description', 'geography', 'channel', 'product_id',
#        'rating', 'sentiment']
# reviews = reviews[cols]

# reviews.loc[reviews['type']=='Cosmetics','type']='temps'
# reviews.loc[reviews['type']=='Skincare','type']='Cosmetics'
# reviews.loc[reviews['type']=='temps','type']='Skincare'

# reviews = reviews[reviews['geography']=='USA']
# reviews.drop(columns=['geography'], inplace=True)

# reviews.to_pickle('reviews_filtered.pickle')

In [9]:
# reviews = pd.read_pickle('reviews_filtered.pickle')

In [10]:
# # Creating date columns in the right dtype and dropping the day of the month: 2019-02-24 => 2019-02-01
# reviews.loc[:,'clean_date'] = pd.to_datetime(reviews['date'], errors='coerce')
# if reviews['clean_date'].isna().sum() > 0:
#     print('{} rows have been dropped because the date format is wrong.'.format(reviews['clean_date'].isna().sum()))
#     display(reviews.loc[reviews['clean_date'].isna(), 'date'])
#     reviews = reviews.dropna(subset='date')
# reviews['date'] = reviews['clean_date']  
# reviews = reviews.drop('clean_date', axis=1)
# reviews['date'] = reviews['date'].dt.to_period('m')

# # Checking for missing data (NA => -1)
# if reviews['rating'].isna().sum()>0:
#     print('{} rows are missing ratings'.format(reviews['rating'].isna().sum()))
#     reviews.loc[:,'rating'] = reviews['rating'].fillna(-1).astype(int)

# if reviews['sentiment'].isna().sum()>0:
#     print('{} rows are missing sentiments'.format(reviews['sentiment'].isna().sum()))
#     reviews.loc[:,'sentiment'] = reviews['sentiment'].fillna(-1).astype(int)
    
# # Transforming rating and sentiment to dummy variables (one-hot encoding)
# reviews.loc[:,'sentiment'] = reviews['sentiment'].str.lower()
# reviews.loc[:,'rating'] = reviews['rating'].astype(int)
# reviews = pd.concat([reviews, pd.get_dummies(data=reviews[['rating','sentiment']], columns=['rating','sentiment'], dtype=int)], axis=1)

# # Readding NAs data to ratings
# reviews.loc[reviews['rating']==-1,'rating'] = np.nan
# reviews.loc[reviews['sentiment']==-1,'sentiment'] = np.nan

# # Transforming sentiment to integer data (positive:1; netural:0, negative:-1)
# reviews.loc[:,'sentiment'] = reviews['sentiment_positive'] - reviews['sentiment_negative']

# # Aggregating RR data by OnlinePost_ID

# # Creating a column to count the number of statements by review once aggreagtion happens
# reviews['nb_statements'] = reviews['sentiment']

# reviews = reviews.groupby(['type',
#                            'channel',
#                            'source_product_identifier',  
#                            'date',
#                            'onlinepost_id']).agg({
#     'description': lambda x:'. '.join(list(x)),
#     'nb_statements':'count',
#     'rating':'first',
#     'rating_1':'first',
#     'rating_2':'first',
#     'rating_3':'first',
#     'rating_4':'first',
#     'rating_5':'first',
#     'sentiment_negative':'sum',
#     'sentiment_neutral':'sum',
#     'sentiment_positive':'sum',
#     'sentiment':'mean'
# }).reset_index()

# # Normalize the one hot sentiment encoding counts (sentiment_negative, sentiment_neutral, sentiment_positive) by the nb_statement.
# reviews[['sentiment_negative', 'sentiment_neutral', 'sentiment_positive']] = reviews[['sentiment_negative', 'sentiment_neutral', 'sentiment_positive']].div(reviews['nb_statements'], axis=0)

# reviews.to_pickle('reviews_wrangled.pickle')

In [11]:
reviews = pd.read_pickle('reviews_wrangled.pickle')

In [12]:
# Adding product related words to the stop words
stop_words = stopwords.words('english')
stop_words.extend(['from'])

brands = pd.read_csv('elc_brands.csv', encoding='ISO-8859-1')
catalogue = pd.read_csv('elc_catalogue.csv', encoding='ISO-8859-1')

stop_words.extend(list(set(str(w).lower() for w in ' '.join(brands['ELC_Brand'].unique().tolist() +
                                          catalogue['Major_Category'].unique().tolist() +
                                          catalogue['Application'].unique().tolist() +
                                          catalogue['Category'].unique().tolist() +
                                          catalogue['SubCategory'].unique().tolist() 
                                         ).replace('/', ' ').split())))

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
stop_words_nlp = nlp(' '.join([' '.join(gensim.utils.simple_preprocess(str(word), deacc=True)) for word in stop_words]))
stop_words = [token.lemma_ for token in stop_words_nlp]


# Cache stop_words into hash
stop_words = Counter(stop_words)


def preprocess(sentences, stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    for sentence in sentences:
        doc = nlp(' '.join([token for token in gensim.utils.simple_preprocess(str(sentence), deacc=True)]) )
        yield([token.lemma_ for token in doc if (token.pos_ in allowed_postags) and (not (token.lemma_ in stop_words))])

In [13]:
'lipstick' in stop_words

True

In [14]:
reviews['tokens'] = list(tqdm.tqdm(preprocess(reviews['description'].values.tolist(), stop_words), position=0, leave=True, total=len(reviews)))

100%|█████████████████████████████████████████████████████████████████████| 4636608/4636608 [4:21:43<00:00, 295.26it/s]


In [15]:
reviews.to_pickle('reviews_wrangled_w_tokens.pickle')

In [16]:
reviews = pd.read_pickle('reviews_wrangled_w_tokens.pickle')

In [17]:
rp = pd.read_csv('reviews_products.csv', low_memory=False)
products = rp[['type', 'channel', 'source_product_identifier', 'product', 'brand_abbrev', 'elc_brand',
       'brand_score', 'item_description', 'product_score', 'itemid_4',
       'major_category_id', 'major_category', 'application_id', 'application',
       'category_id', 'category', 'sub_category_id', 'sub_category']].drop_duplicates()
products = products.dropna(subset=['elc_brand'], axis=0)

In [18]:
reviews  = reviews.merge(products, left_on=['type', 'channel', 'source_product_identifier'], right_on=['type', 'channel', 'source_product_identifier'], how='inner')

In [19]:
reviews.to_pickle('reviews_products.pickle')

In [6]:
reviews = pd.read_pickle('reviews_products.pickle')

In [7]:
reviews

Unnamed: 0,type,channel,source_product_identifier,date,onlinepost_id,description,nb_statements,rating,rating_1,rating_2,rating_3,rating_4,rating_5,sentiment_negative,sentiment_neutral,sentiment_positive,sentiment,tokens,product,brand_abbrev,elc_brand,brand_score,item_description,product_score,itemid_4,major_category_id,major_category,application_id,application,category_id,category,sub_category_id,sub_category
0,Skincare,Amazon USA,B00015GYC2,2015-03,OnlinePost_20200301_24654a95-b252-4dee-8dfa-77...,This was very disappointing.. When I received ...,4,1.0,1,0,0,0,0,0.5,0.500000,0.000000,-0.500000,"[disappointing, receive, cleansing, use, night...",La Mer Cleansing Gel 6.7 oz / 200 ml,CM,La Mer,1.0,THE CLEANSING GEL,0.731931,51T0,520.0,Skincare,25,Face,34,Cleansers (Incl Soap,Z5,All Cleansers (Incl
1,Skincare,Amazon USA,B00015GYC2,2017-01,OnlinePost_20200301_84729e90-665d-4108-8ca5-0b...,I have tried many face washes- this is the bes...,2,5.0,0,0,0,0,1,0.0,0.000000,1.000000,1.000000,"[try, many, good, get, dirt, soft]",La Mer Cleansing Gel 6.7 oz / 200 ml,CM,La Mer,1.0,THE CLEANSING GEL,0.731931,51T0,520.0,Skincare,25,Face,34,Cleansers (Incl Soap,Z5,All Cleansers (Incl
2,Skincare,Amazon USA,B00015GYC2,2017-12,OnlinePost_20200301_9c216502-fba9-48a8-95cb-3b...,Fast service.. could find it from my usual ven...,3,5.0,0,0,0,0,1,0.0,0.000000,1.000000,1.000000,"[fast, could, find, usual, vendor, one, fast, ...",La Mer Cleansing Gel 6.7 oz / 200 ml,CM,La Mer,1.0,THE CLEANSING GEL,0.731931,51T0,520.0,Skincare,25,Face,34,Cleansers (Incl Soap,Z5,All Cleansers (Incl
3,Skincare,Amazon USA,B00015GYC2,2018-01,OnlinePost_20200301_01d7a936-32dd-454d-b4ef-ab...,Five Stars.. great great product.,2,5.0,0,0,0,0,1,0.0,0.000000,1.000000,1.000000,"[star, great, great]",La Mer Cleansing Gel 6.7 oz / 200 ml,CM,La Mer,1.0,THE CLEANSING GEL,0.731931,51T0,520.0,Skincare,25,Face,34,Cleansers (Incl Soap,Z5,All Cleansers (Incl
4,Skincare,Amazon USA,B00015GYC2,2018-07,OnlinePost_20200301_53a09274-200c-47dc-9cca-52...,Order from La Mer only.. Can't go wrong with L...,3,4.0,0,0,0,1,0,0.0,0.666667,0.333333,0.333333,"[order, go, wrong, order, site]",La Mer Cleansing Gel 6.7 oz / 200 ml,CM,La Mer,1.0,THE CLEANSING GEL,0.731931,51T0,520.0,Skincare,25,Face,34,Cleansers (Incl Soap,Z5,All Cleansers (Incl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388052,Cosmetics,Ulta,xlsImpprod6490080,2019-06,OnlinePost_20191027_184332048,Maybe I got an old one but this thing was awfu...,6,1.0,1,0,0,0,0,0.5,0.166667,0.333333,-0.166667,"[maybe, get, old, thing, awful, bad, batch, pa...",Photo Finish 24-Hour Shadow Primer,SX,Smashbox,1.0,PHOTO FNSH 24 HR SHADOW PRIMER,0.896071,C1JJ,510.0,Makeup,15,Eyes,2A,Eye Makeup Primer,MB,All Eye Makeup Prime
388053,Cosmetics,Ulta,xlsImpprod6490080,2019-06,OnlinePost_20191027_185330102,Love most Smashbox products and used the origi...,5,1.0,1,0,0,0,0,0.2,0.400000,0.400000,0.200000,"[love, use, original, year, disappoint, disapp...",Photo Finish 24-Hour Shadow Primer,SX,Smashbox,1.0,PHOTO FNSH 24 HR SHADOW PRIMER,0.896071,C1JJ,510.0,Makeup,15,Eyes,2A,Eye Makeup Primer,MB,All Eye Makeup Prime
388054,Cosmetics,Ulta,xlsImpprod6490080,2019-07,OnlinePost_20191027_184010998,The Best Primer.. this is literally the best p...,3,5.0,0,0,0,0,1,0.0,0.000000,1.000000,1.000000,"[good, literally, good, ever, use, use, else, ...",Photo Finish 24-Hour Shadow Primer,SX,Smashbox,1.0,PHOTO FNSH 24 HR SHADOW PRIMER,0.896071,C1JJ,510.0,Makeup,15,Eyes,2A,Eye Makeup Primer,MB,All Eye Makeup Prime
388055,Cosmetics,Ulta,xlsImpprod6490080,2019-07,OnlinePost_20191027_184176048,I didn't wear it for 24 hours but my eyeshadow...,2,5.0,0,0,0,0,1,0.0,0.000000,1.000000,1.000000,"[wear, hour, crease, stay, entire, time, exact...",Photo Finish 24-Hour Shadow Primer,SX,Smashbox,1.0,PHOTO FNSH 24 HR SHADOW PRIMER,0.896071,C1JJ,510.0,Makeup,15,Eyes,2A,Eye Makeup Primer,MB,All Eye Makeup Prime


## Using class

In [1]:
class TopicModeller():
    def __init__(self, df, column, name):
        self.df = df.reset_index(drop=True)
        self.column = column
        self.docs = df[column].values.tolist()
        self.name = name
        
    def compute_coherence_values(self, corpus, id2word, text, k, a, b):

        lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=k, 
                                               random_state=3,
                                               chunksize=100,
                                               passes=10,
                                               alpha=a,
                                               eta=b)

        coherence_model_lda = CoherenceModel(model=lda_model, texts=text, dictionary=id2word, coherence='c_v')

        return coherence_model_lda.get_coherence()
    
    def train_valid_lda(self,
                        nb_samples: int = 100000,
                        topics_range: list = [3,5,7,9],
                        alpha: list = [0.1, 0.01, 'symmetric','asymmetric'],
                        beta: list = [0.1, 'symmetric']                        
                        ):
        
        nb_samples = min(nb_samples, len(self.docs))
        
        # Training set
        train = random.sample(self.docs, nb_samples)
        
        # Create Dictionary
        self.id2word = corpora.Dictionary(train)
        
        # Term Document Frequency
        self.corpus = [self.id2word.doc2bow(text) for text in train]
        
        # Result dictionary
        self.model_results = {'Topics': [],
                         'Alpha': [],
                         'Beta': [],
                         'Coherence': []
                        }

        # Can take a long time to run
#         print('Validating hyperparameters...')
        if 1 == 1:
            pbar = tqdm.tqdm(total=(len(beta)*(len(alpha))*len(topics_range)), position=0, leave=True)
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterare through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        cv = self.compute_coherence_values(corpus=self.corpus, id2word=self.id2word, text=train,
                                                      k=k, a=a, b=b)
                        # Save the model results
                        self.model_results['Topics'].append(k)
                        self.model_results['Alpha'].append(a)
                        self.model_results['Beta'].append(b)
                        self.model_results['Coherence'].append(cv)
#                         print('nb topics: {}, alpha: {}, beta: {}, coherence: {}'.format(k,a,b,cv))
                        pbar.update(1)
        self.cv_results = pd.DataFrame(self.model_results).sort_values("Coherence", ascending=False)
        self.cv_results.to_csv(self.name+'_lda_tuning_results.csv', index=False)
        display(self.cv_results)
        pbar.close()
        
        self.best_param = self.cv_results.iloc[0]
        self.best_num_topics = int(self.best_param['Topics'])
        self.best_alpha = self.best_param['Alpha']
        self.best_beta = self.best_param['Beta']
        name = self.name
        
        # Build LDA model
        print('Training best model...')
        self.lda_model = gensim.models.LdaMulticore(corpus=self.corpus,
                                                   id2word=self.id2word,
                                                   num_topics=self.best_num_topics,
                                                   alpha = self.best_alpha,
                                                   eta = self.best_beta,
                                                   random_state=3,
                                                   chunksize=100,
                                                   passes=10,
                                                   per_word_topics=True)
        print('Done.')
        pyLDAvis.enable_notebook()
        LDAvis_prepared = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.id2word)
        display(LDAvis_prepared)
        pyLDAvis.save_html(LDAvis_prepared, name+'_'+str(self.best_alpha)+'_'+str(self.best_beta)+'_'+str(self.best_num_topics)+'.html')
        self.lda_model.save(name+'_'+str(self.best_alpha)+'_'+str(self.best_beta)+'_'+str(self.best_num_topics))
        pickle.dump(self.id2word, open( "id2word_"+name+'_'+str(self.best_alpha)+'_'+str(self.best_beta)+'_'+str(self.best_num_topics)+".pickle", "wb" ))
#         return self.lda_model, self.id2word
        
    def get_docs_topics(self):
        # Term Document Frequency
        corpus = [self.id2word.doc2bow(text) for text in self.docs]
        self.output = pd.concat([self.df, pd.DataFrame(gensim.matutils.corpus2csc(self.lda_model.get_document_topics(corpus)).T.toarray(), columns=['topic_'+str(i) for i in range(1,self.best_num_topics+1)])], axis=1)
        return self.output

    def view(self):
        pyLDAvis.enable_notebook()
        self.LDAvis_prepared = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.id2word)
        display(LDAvis_prepared)

In [23]:
all_lda = TopicModeller(reviews, 'tokens')
lda, id2word = all_lda.train_valid_lda()

  0%|                                                                                           | 0/32 [00:00<?, ?it/s]

Validating hyperparameters...


  3%|██▌                                                                             | 1/32 [03:21<1:44:19, 201.92s/it]

nb topics: 3, alpha: 0.1, beta: 0.1, coherence: 0.43450483502760867


  6%|█████                                                                           | 2/32 [06:59<1:43:18, 206.61s/it]

nb topics: 3, alpha: 0.1, beta: symmetric, coherence: 0.4173797801446084


  9%|███████▌                                                                        | 3/32 [10:31<1:40:40, 208.29s/it]

nb topics: 3, alpha: 0.01, beta: 0.1, coherence: 0.4321078880093703


 12%|██████████                                                                      | 4/32 [14:03<1:37:40, 209.29s/it]

nb topics: 3, alpha: 0.01, beta: symmetric, coherence: 0.4187972426794901


 16%|████████████▌                                                                   | 5/32 [17:57<1:37:29, 216.66s/it]

nb topics: 3, alpha: symmetric, beta: 0.1, coherence: 0.44785537056494834


 19%|███████████████                                                                 | 6/32 [21:43<1:35:07, 219.53s/it]

nb topics: 3, alpha: symmetric, beta: symmetric, coherence: 0.4529891896705008


 22%|█████████████████▌                                                              | 7/32 [25:00<1:28:39, 212.78s/it]

nb topics: 3, alpha: asymmetric, beta: 0.1, coherence: 0.4898328844678008


 25%|████████████████████                                                            | 8/32 [28:35<1:25:25, 213.54s/it]

nb topics: 3, alpha: asymmetric, beta: symmetric, coherence: 0.46715282540513114


 28%|██████████████████████▌                                                         | 9/32 [32:20<1:23:08, 216.90s/it]

nb topics: 5, alpha: 0.1, beta: 0.1, coherence: 0.4867383389264779


 31%|████████████████████████▋                                                      | 10/32 [36:12<1:21:09, 221.34s/it]

nb topics: 5, alpha: 0.1, beta: symmetric, coherence: 0.48143841730331083


 34%|███████████████████████████▏                                                   | 11/32 [40:05<1:18:42, 224.89s/it]

nb topics: 5, alpha: 0.01, beta: 0.1, coherence: 0.4760170134367533


 38%|█████████████████████████████▋                                                 | 12/32 [43:32<1:13:09, 219.50s/it]

nb topics: 5, alpha: 0.01, beta: symmetric, coherence: 0.4785393959773647


 41%|████████████████████████████████                                               | 13/32 [46:57<1:08:07, 215.12s/it]

nb topics: 5, alpha: symmetric, beta: 0.1, coherence: 0.47545130821491705


 44%|██████████████████████████████████▌                                            | 14/32 [50:20<1:03:26, 211.48s/it]

nb topics: 5, alpha: symmetric, beta: symmetric, coherence: 0.484344326903534


 47%|█████████████████████████████████████▉                                           | 15/32 [53:41<59:02, 208.40s/it]

nb topics: 5, alpha: asymmetric, beta: 0.1, coherence: 0.49257048636004086


 50%|████████████████████████████████████████▌                                        | 16/32 [57:03<55:05, 206.61s/it]

nb topics: 5, alpha: asymmetric, beta: symmetric, coherence: 0.48905763402879343


 53%|█████████████████████████████████████████▉                                     | 17/32 [1:00:34<51:59, 207.96s/it]

nb topics: 7, alpha: 0.1, beta: 0.1, coherence: 0.4990065665293537


 56%|████████████████████████████████████████████▍                                  | 18/32 [1:04:08<48:52, 209.50s/it]

nb topics: 7, alpha: 0.1, beta: symmetric, coherence: 0.49978804339008953


 59%|██████████████████████████████████████████████▉                                | 19/32 [1:07:41<45:38, 210.62s/it]

nb topics: 7, alpha: 0.01, beta: 0.1, coherence: 0.497264938262603


 62%|█████████████████████████████████████████████████▍                             | 20/32 [1:11:16<42:23, 211.99s/it]

nb topics: 7, alpha: 0.01, beta: symmetric, coherence: 0.4955786319671159


 66%|███████████████████████████████████████████████████▊                           | 21/32 [1:14:49<38:54, 212.25s/it]

nb topics: 7, alpha: symmetric, beta: 0.1, coherence: 0.5036494254020598


 69%|██████████████████████████████████████████████████████▎                        | 22/32 [1:18:22<35:26, 212.60s/it]

nb topics: 7, alpha: symmetric, beta: symmetric, coherence: 0.5129538818615804


 72%|████████████████████████████████████████████████████████▊                      | 23/32 [1:21:57<31:59, 213.26s/it]

nb topics: 7, alpha: asymmetric, beta: 0.1, coherence: 0.4812388037440901


 75%|███████████████████████████████████████████████████████████▎                   | 24/32 [1:25:33<28:32, 214.03s/it]

nb topics: 7, alpha: asymmetric, beta: symmetric, coherence: 0.47292250364850386


 78%|█████████████████████████████████████████████████████████████▋                 | 25/32 [1:29:23<25:32, 218.93s/it]

nb topics: 9, alpha: 0.1, beta: 0.1, coherence: 0.5095316372908516


 81%|████████████████████████████████████████████████████████████████▏              | 26/32 [1:33:09<22:06, 221.14s/it]

nb topics: 9, alpha: 0.1, beta: symmetric, coherence: 0.5159173007799454


 84%|██████████████████████████████████████████████████████████████████▋            | 27/32 [1:36:55<18:32, 222.44s/it]

nb topics: 9, alpha: 0.01, beta: 0.1, coherence: 0.5033550531214757


 88%|█████████████████████████████████████████████████████████████████████▏         | 28/32 [1:40:38<14:50, 222.68s/it]

nb topics: 9, alpha: 0.01, beta: symmetric, coherence: 0.513511852668258


 91%|███████████████████████████████████████████████████████████████████████▌       | 29/32 [1:44:20<11:07, 222.50s/it]

nb topics: 9, alpha: symmetric, beta: 0.1, coherence: 0.5102478899331442


 94%|██████████████████████████████████████████████████████████████████████████     | 30/32 [1:48:02<07:24, 222.23s/it]

nb topics: 9, alpha: symmetric, beta: symmetric, coherence: 0.5112260315712213


 97%|████████████████████████████████████████████████████████████████████████████▌  | 31/32 [1:51:44<03:42, 222.08s/it]

nb topics: 9, alpha: asymmetric, beta: 0.1, coherence: 0.5376250455256808


100%|███████████████████████████████████████████████████████████████████████████████| 32/32 [1:55:26<00:00, 216.45s/it]

nb topics: 9, alpha: asymmetric, beta: symmetric, coherence: 0.5188056235047867
Training best model...





Done.


In [8]:
grid = {
    "makeup_face":{},
    "makeup_lips":{},
    "makeup_eyes":{},
    "makeup_face_color":{},
    "makeup_other":{},
    "skincare_all":{}
}

In [9]:
grid['makeup_face']['df'] = reviews[(reviews['major_category']=='Makeup') & (reviews['application']=='Face')]
grid['makeup_lips']['df'] = reviews[(reviews['major_category']=='Makeup') & (reviews['application']=='Lips')]
grid['makeup_eyes']['df'] = reviews[(reviews['major_category']=='Makeup') & (reviews['application']=='Eyes')]
grid['makeup_face_color']['df'] = reviews[(reviews['major_category']=='Makeup') & (reviews['application']=='Face Color')]
grid['makeup_other']['df'] = reviews[(reviews['major_category']=='Makeup') & (~reviews['application'].isin(['Face', 'Lips', 'Eyes', 'Face Color']))]
grid['skincare_all']['df'] = reviews[reviews['major_category']=='Skincare']

In [17]:
grid['makeup_face']['best_param'] = {'Alpha': 'asymmetric',
                                    'Beta': 0.1}
grid['makeup_lips']['best_param'] = {'Alpha': 'asymmetric',
                                    'Beta': 0.1}
grid['makeup_eyes']['best_param'] = {'Alpha': 'asymmetric',
                                    'Beta': 'symmetric'}
grid['makeup_face_color']['best_param'] = {'Alpha': 'asymmetric',
                                    'Beta': 0.1}
grid['makeup_other']['best_param'] = {'Alpha': 0.1,
                                    'Beta': 'symmetric'}
grid['skincare_all']['best_param'] = {'Alpha': 'asymmetric',
                                    'Beta': 0.1}

In [72]:
for cat in grid.keys():
    print('Processing '+cat)
    grid[cat]['LDA'] = TopicModeller(df=grid[cat]['df'], column='tokens', name=cat)
    grid[cat]['LDA'].train_valid_lda(nb_samples = 50000,
                        topics_range = [2,4,6],
                        alpha = [0.1, 'symmetric','asymmetric'],
                        beta = [0.1, 'symmetric'])

Processing makeup_face


100%|█████████████████████████████████████████████████████████████████████████████████| 18/18 [28:22<00:00, 103.52s/it]

Unnamed: 0,Topics,Alpha,Beta,Coherence
17,6,asymmetric,symmetric,0.508606
16,6,asymmetric,0.1,0.50151
14,6,symmetric,0.1,0.499363
15,6,symmetric,symmetric,0.483744
13,6,0.1,symmetric,0.483396
12,6,0.1,0.1,0.48327
10,4,asymmetric,0.1,0.471039
11,4,asymmetric,symmetric,0.468145
6,4,0.1,0.1,0.465047
9,4,symmetric,symmetric,0.459581


100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [28:22<00:00, 94.57s/it]


Training best model...
Done.


Processing makeup_lips


100%|█████████████████████████████████████████████████████████████████████████████████| 18/18 [29:42<00:00, 110.04s/it]

Unnamed: 0,Topics,Alpha,Beta,Coherence
16,6,asymmetric,0.1,0.515026
15,6,symmetric,symmetric,0.493769
14,6,symmetric,0.1,0.49374
10,4,asymmetric,0.1,0.491521
12,6,0.1,0.1,0.490182
17,6,asymmetric,symmetric,0.49005
13,6,0.1,symmetric,0.49002
11,4,asymmetric,symmetric,0.484032
9,4,symmetric,symmetric,0.466809
8,4,symmetric,0.1,0.461962


100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [29:42<00:00, 99.04s/it]


Training best model...
Done.


Processing makeup_eyes


100%|█████████████████████████████████████████████████████████████████████████████████| 18/18 [29:34<00:00, 112.87s/it]

Unnamed: 0,Topics,Alpha,Beta,Coherence
15,6,symmetric,symmetric,0.476357
17,6,asymmetric,symmetric,0.473375
16,6,asymmetric,0.1,0.469845
12,6,0.1,0.1,0.467208
11,4,asymmetric,symmetric,0.463723
14,6,symmetric,0.1,0.461857
13,6,0.1,symmetric,0.46082
6,4,0.1,0.1,0.450078
10,4,asymmetric,0.1,0.444844
8,4,symmetric,0.1,0.444335


100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [29:34<00:00, 98.60s/it]


Training best model...
Done.


Processing makeup_face_color


100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [12:18<00:00, 45.35s/it]

Unnamed: 0,Topics,Alpha,Beta,Coherence
16,6,asymmetric,0.1,0.488243
17,6,asymmetric,symmetric,0.482032
10,4,asymmetric,0.1,0.478342
11,4,asymmetric,symmetric,0.472049
14,6,symmetric,0.1,0.463764
4,2,asymmetric,0.1,0.456661
12,6,0.1,0.1,0.456578
15,6,symmetric,symmetric,0.453946
5,2,asymmetric,symmetric,0.452187
13,6,0.1,symmetric,0.451662


100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [12:18<00:00, 41.02s/it]


Training best model...
Done.


  0%|                                                                                           | 0/18 [00:00<?, ?it/s]

Processing makeup_other


100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [07:22<00:00, 26.32s/it]

Unnamed: 0,Topics,Alpha,Beta,Coherence
7,4,0.1,symmetric,0.462342
8,4,symmetric,0.1,0.462288
9,4,symmetric,symmetric,0.455867
14,6,symmetric,0.1,0.455517
12,6,0.1,0.1,0.448661
15,6,symmetric,symmetric,0.444258
13,6,0.1,symmetric,0.440159
6,4,0.1,0.1,0.438684
2,2,symmetric,0.1,0.433808
10,4,asymmetric,0.1,0.431777


100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [07:22<00:00, 24.58s/it]


Training best model...
Done.


Processing skincare_all


100%|█████████████████████████████████████████████████████████████████████████████████| 18/18 [31:12<00:00, 106.73s/it]

Unnamed: 0,Topics,Alpha,Beta,Coherence
17,6,asymmetric,symmetric,0.592819
16,6,asymmetric,0.1,0.584195
10,4,asymmetric,0.1,0.556307
8,4,symmetric,0.1,0.544013
11,4,asymmetric,symmetric,0.540895
15,6,symmetric,symmetric,0.53878
14,6,symmetric,0.1,0.531931
9,4,symmetric,symmetric,0.530704
13,6,0.1,symmetric,0.528157
12,6,0.1,0.1,0.523789


100%|█████████████████████████████████████████████████████████████████████████████████| 18/18 [31:12<00:00, 104.00s/it]


Training best model...
Done.


In [16]:
pickle.load(open( "grid.pickle", "rb" ))

AttributeError: Can't get attribute 'TopicModeller' on <module '__main__'>

In [21]:
def train_display_save(reviews, nb_samples, num_topics, alpha, eta, name):
    tokens=reviews['tokens']
    nb_samples = min(nb_samples, len(tokens))
        
    # Training set
    train = tokens.sample(nb_samples, random_state=3).values.tolist()

    # Create Dictionary
    id2word = corpora.Dictionary(train)

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in train]
    start=time.time()
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                                   id2word=id2word,
                                                   num_topics=num_topics,
                                                   alpha =alpha ,
                                                   eta =eta ,
                                                   random_state=3,
                                                   chunksize=100,
                                                   passes=10,
                                                   per_word_topics=True)
#     pyLDAvis.enable_notebook()
#     LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
#     display(LDAvis_prepared)
#     pyLDAvis.save_html(LDAvis_prepared, name+'_'+str(alpha)+'_'+str(eta)+'_'+str(num_topics)+'.html')
    lda_model.save(name+'_'+str(alpha).replace('.','')+'_'+str(eta).replace('.','')+'_'+str(num_topics))
#     pickle.dump(id2word, open( "id2word_"+ name+'_'+str(alpha)+'_'+str(eta)+'_'+str(num_topics)+".p", "wb" ))
    print(time.time()-start)
    

#     # Term Document Frequency
#     corpus = [id2word.doc2bow(text) for text in tokens]

#     topics = gensim.matutils.corpus2csc(lda.get_document_topics(corpus)).T.toarray()

#     topics = pd.DataFrame(topics, columns=['topic_'+str(i) for i in range(1,topics.shape[1]+1)])

#     output = pd.concat([reviews.reset_index(drop=True), topics], axis=1)
#     output.to_csv(name+'_'+str(alpha)+'_'+str(eta)+'_'+str(num_topics)+'.csv', index=False)
    return lda_model #, id2word, output, topics, reviews

In [None]:
grid['']

In [None]:
for cat in grid.keys():
    print(cat)
    num_topics = 4
    best_param = grid[cat]['best_param']
    grid[cat]['model'] = train_display_save(grid[cat]['df'], 10000, num_topics, best_param['Alpha'], best_param['Beta'], cat)

makeup_face
26.348291158676147
makeup_lips
19.98966121673584
makeup_eyes
23.80621910095215
makeup_face_color
25.18797731399536
makeup_other


In [138]:
pickle.dump(grid, open( "grid_4_topics.pickle", "wb" ))

In [131]:
pd.concat([rev.reset_index(drop=True), topics], axis=1)

Unnamed: 0,type,channel,source_product_identifier,date,onlinepost_id,description,nb_statements,rating,rating_1,rating_2,rating_3,rating_4,rating_5,sentiment_negative,sentiment_neutral,sentiment_positive,sentiment,tokens,product,brand_abbrev,elc_brand,brand_score,item_description,product_score,itemid_4,major_category_id,major_category,application_id,application,category_id,category,sub_category_id,sub_category,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
0,Skincare,Amazon USA,B0006LNFXW,2015-02,OnlinePost_20191120_140735168,I'm glad I found a container here on Amazon.. ...,2,5.0,0,0,0,0,1,0.0,0.00,1.00,1.00,"[find, container, look, kiss, sparkly]",MAC Iridescent Powder/Loose GOLDEN BRONZE,MC,M.A.C,1.0,IRIDESCENT POWDER/LOOSE,0.879657,SF96,510.0,Makeup,25,Face,26,Face Powder,44,Loose Face Powder,0.243224,0.486226,0.021969,0.018321,0.015684,0.013724,0.012199,0.178673,0.000000
1,Skincare,Amazon USA,B0006LNFXW,2015-02,OnlinePost_20200301_28485b30-1b33-4c9a-b2b6-c3...,it's really pretty and shimmery and is also gr...,4,5.0,0,0,0,0,1,0.0,0.00,1.00,1.00,"[really, pretty, shimmery, also, great, go, lo...",MAC Iridescent Powder/Loose GOLDEN BRONZE,MC,M.A.C,1.0,IRIDESCENT POWDER/LOOSE,0.879657,SF96,510.0,Makeup,25,Face,26,Face Powder,44,Loose Face Powder,0.349953,0.013764,0.010980,0.233745,0.000000,0.162322,0.000000,0.210323,0.000000
2,Skincare,Amazon USA,B0007D02HQ,2015-02,OnlinePost_20191120_142084340,Two Stars.,1,2.0,0,1,0,0,0,0.0,0.00,1.00,1.00,[star],"Clinique Continuous Coverage Makeup SPF 15, 1 ...",CL,Clinique,1.0,CONTINUOUS COVERAGE SPF15,0.716561,647G,510.0,Makeup,25,Face,28,Foundation,26,Liquid Foundation,0.109661,0.082246,0.065797,0.054831,0.546983,0.041123,0.036554,0.032898,0.029908
3,Skincare,Amazon USA,B0007D02HQ,2015-02,OnlinePost_20200301_71e4c00b-73f4-4931-a016-04...,didn't like the coverage.,1,2.0,0,1,0,0,0,0.0,0.00,1.00,1.00,[coverage],"Clinique Continuous Coverage Makeup SPF 15, 1 ...",CL,Clinique,1.0,CONTINUOUS COVERAGE SPF15,0.716561,647G,510.0,Makeup,25,Face,28,Foundation,26,Liquid Foundation,0.110480,0.082702,0.065825,0.054866,0.047000,0.041124,0.535194,0.032899,0.029908
4,Skincare,Amazon USA,B0007D02HQ,2015-03,OnlinePost_20191120_142437265,"I have used this product, and was very satisfi...",4,4.0,0,0,0,1,0,0.0,0.50,0.50,0.50,"[use, satisfied, satisfied, however, big, rip]","Clinique Continuous Coverage Makeup SPF 15, 1 ...",CL,Clinique,1.0,CONTINUOUS COVERAGE SPF15,0.716561,647G,510.0,Makeup,25,Face,28,Foundation,26,Liquid Foundation,0.504181,0.407705,0.018847,0.015674,0.013434,0.011755,0.010449,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134703,Cosmetics,Ulta,xlsImpprod4930101,2018-03,OnlinePost_20191027_185345917,My favorite primer.. Be sure to let lotion/ser...,4,5.0,0,0,0,0,1,0.0,0.25,0.75,0.75,"[favorite, sure, let, thoroughly, applying, be...",Photo Finish Protect SPF 20 Primer,SX,Smashbox,1.0,PHOTO FINISH PRIMER SPF 20 DS,0.872896,C0PL,510.0,Makeup,25,Face,29,Makeup Primers/Finis,31,AllOther Primers/Fin,0.000000,0.439583,0.244418,0.092170,0.066475,0.100740,0.000000,0.000000,0.043911
134704,Cosmetics,Ulta,xlsImpprod4930101,2018-06,OnlinePost_20191027_184229144,I just love love love this…. Yes I love it…. I...,4,5.0,0,0,0,0,1,0.0,0.00,1.00,1.00,"[love, love, love, love, wish, high, wish, high]",Photo Finish Protect SPF 20 Primer,SX,Smashbox,1.0,PHOTO FINISH PRIMER SPF 20 DS,0.872896,C0PL,510.0,Makeup,25,Face,29,Makeup Primers/Finis,31,AllOther Primers/Fin,0.025196,0.018297,0.014643,0.012187,0.010445,0.000000,0.000000,0.895324,0.000000
134705,Cosmetics,Ulta,xlsImpprod4930101,2018-06,OnlinePost_20191027_185282872,I love this product!. I love the way it makes ...,4,5.0,0,0,0,0,1,0.0,0.00,1.00,1.00,"[love, love, way, make, silky, look, great, go...",Photo Finish Protect SPF 20 Primer,SX,Smashbox,1.0,PHOTO FINISH PRIMER SPF 20 DS,0.872896,C0PL,510.0,Makeup,25,Face,29,Makeup Primers/Finis,31,AllOther Primers/Fin,0.233764,0.016816,0.013193,0.308296,0.000000,0.398653,0.000000,0.000000,0.000000
134706,Cosmetics,Ulta,xlsImpprod4930101,2018-07,OnlinePost_20191027_184557695,Great product!. I love this product.. It is a ...,3,5.0,0,0,0,0,1,0.0,0.00,1.00,1.00,"[great, love, super, make, apply]",Photo Finish Protect SPF 20 Primer,SX,Smashbox,1.0,PHOTO FINISH PRIMER SPF 20 DS,0.872896,C0PL,510.0,Makeup,25,Face,29,Makeup Primers/Finis,31,AllOther Primers/Fin,0.037852,0.027524,0.021963,0.369620,0.015667,0.494253,0.012185,0.010966,0.000000


In [25]:
lda, id2word = train_display_save(reviews['tokens'], 500000, 8, alpha='asymmetric', eta='symmetric', name='full')

In [26]:
docs = reviews['tokens'].values.tolist()

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in docs]

topics = gensim.matutils.corpus2csc(lda.get_document_topics(corpus)).T.toarray()

topics = pd.DataFrame(topics, columns=['topic_'+str(i) for i in range(1,topics.shape[1]+1)])

output = pd.concat([reviews, topics], axis=1)

output.to_pickle('reviews_w_topics.pkl')

In [80]:
topics = lda.print_topics(num_words=6)
for topic in topics:
    print('Topic #{}'.format(topic[0]+1))
    print('Terms: ', topic[1])

Topic #1
Terms:  0.033*"buy" + 0.031*"product" + 0.022*"so" + 0.022*"will" + 0.021*"use" + 0.020*"try"
Topic #2
Terms:  0.041*"love" + 0.041*"sample" + 0.034*"receive" + 0.033*"size" + 0.029*"all" + 0.027*"shadow"
Topic #3
Terms:  0.056*"skin" + 0.036*"primer" + 0.028*"product" + 0.027*"make" + 0.026*"feel" + 0.026*"use"
Topic #4
Terms:  0.038*"shade" + 0.034*"color" + 0.029*"too" + 0.025*"skin" + 0.025*"light" + 0.017*"more"
Topic #5
Terms:  0.069*"use" + 0.049*"good" + 0.048*"love" + 0.047*"year" + 0.044*"product" + 0.034*"ever"
Topic #6
Terms:  0.071*"product" + 0.054*"great" + 0.045*"eye" + 0.035*"star" + 0.034*"use" + 0.031*"concealer"
Topic #7
Terms:  0.064*"foundation" + 0.037*"coverage" + 0.029*"skin" + 0.028*"look" + 0.025*"day" + 0.020*"powder"
Topic #8
Terms:  0.066*"mascara" + 0.038*"lash" + 0.035*"eye" + 0.021*"eyeliner" + 0.021*"look" + 0.019*"love"
Topic #9
Terms:  0.097*"color" + 0.051*"lip" + 0.048*"love" + 0.036*"lipstick" + 0.022*"great" + 0.020*"look"


In [41]:
l15_ train_display_save(reviews['tokens'], 100000, 15, alpha='asymmetric', eta='symmetric', name='full')

365.79632544517517


(<gensim.models.ldamulticore.LdaMulticore at 0x17d600c82c8>,
 <gensim.corpora.dictionary.Dictionary at 0x17cc8b54ec8>)

In [37]:
l10 = train_display_save(reviews['tokens'], 100000, 10, alpha='asymmetric', eta='symmetric', name='full')

274.0791862010956


(<gensim.models.ldamulticore.LdaMulticore at 0x17cda4a5f88>,
 <gensim.corpora.dictionary.Dictionary at 0x17cda4c3788>)

In [38]:
l_9 = train_display_save(reviews['tokens'], 100000, 9, alpha='asymmetric', eta='symmetric', name='full')

277.3664469718933


(<gensim.models.ldamulticore.LdaMulticore at 0x17cf0dbd408>,
 <gensim.corpora.dictionary.Dictionary at 0x17cf0dbd0c8>)

In [39]:
train_display_save(reviews['tokens'], 100000, 8, alpha='asymmetric', eta='symmetric', name='full')

294.06596064567566


(<gensim.models.ldamulticore.LdaMulticore at 0x17c2ac09788>,
 <gensim.corpora.dictionary.Dictionary at 0x17a7e2319c8>)

In [31]:
train_display_save(reviews['tokens'], 100000, 7, alpha='symmetric', eta='symmetric', name='full')

254.2011616230011


(<gensim.models.ldamulticore.LdaMulticore at 0x17c2abf5a08>,
 <gensim.corpora.dictionary.Dictionary at 0x17d460b8948>)

In [33]:
train_display_save(reviews['tokens'], 100000, 6, alpha=0.1, eta='symmetric', name='full')

257.9453058242798


(<gensim.models.ldamulticore.LdaMulticore at 0x17cda4cf5c8>,
 <gensim.corpora.dictionary.Dictionary at 0x17cda4cfac8>)

In [34]:
train_display_save(reviews['tokens'], 100000, 5, alpha='asymmetric', eta=0.1, name='full')

239.18256402015686


(<gensim.models.ldamulticore.LdaMulticore at 0x17a7a4f3ac8>,
 <gensim.corpora.dictionary.Dictionary at 0x17c2abf9548>)

In [35]:
train_display_save(reviews['tokens'], 100000, 4, alpha='asymmetric', eta=0.1, name='full')

243.96960830688477


(<gensim.models.ldamulticore.LdaMulticore at 0x17a7a502908>,
 <gensim.corpora.dictionary.Dictionary at 0x17a7a502948>)

In [36]:
train_display_save(reviews['tokens'], 100000, 3, alpha='asymmetric', eta=0.1, name='full')

232.1398787498474


(<gensim.models.ldamulticore.LdaMulticore at 0x17cda4c6d48>,
 <gensim.corpora.dictionary.Dictionary at 0x17cda4c6708>)

In [40]:
train_display_save(reviews['tokens'], 100000, 2, alpha='asymmetric', eta=0.1, name='full')

243.65680503845215


(<gensim.models.ldamulticore.LdaMulticore at 0x17cefb4bb08>,
 <gensim.corpora.dictionary.Dictionary at 0x17cd1260d48>)

In [None]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

## Splitting by review

In [None]:
bad_reviews['tokens'].sample(100000)

In [None]:
good_reviews_6 = train_display_save(tokens=good_reviews['tokens'], nb_samples=100000, num_topics=6, alpha='asymmetric', eta=0.1, name='good_reviews')

In [None]:
bad_reviews_6 = train_display_save(tokens=bad_reviews['tokens'], nb_samples=100000, num_topics=6, alpha='asymmetric', eta=0.1, name='bad_reviews')

In [None]:
neutral_reviews_6 = train_display_save(tokens=neutral_reviews['tokens'], nb_samples=100000, num_topics=6, alpha='asymmetric', eta=0.1, name='neutral_reviews')

In [None]:
def save_display(self, title):
    p = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.id2word)
    pyLDAvis.save_html(p, title+'.html')

In [None]:
save_display(good_lda, 'good_reviews_8_topics')

In [None]:
save_display(bad_lda, 'bad_reviews_8_topics')

In [None]:
save_display(neutral_lda, 'neutral_reviews_8_topics')

In [None]:
good_reviews = reviews[reviews['rating']>=4]
neutral_reviews = reviews[(reviews['rating']>2) & (reviews['rating']<4)]
bad_reviews = reviews[reviews['rating']<=2]

In [None]:
print(len(good_reviews))
good_lda = TopicModeller(good_reviews, 'tokens')
good_lda.train_valid_lda()

In [None]:
display(good_lda.cv_results)

In [None]:
good_lda.view()

In [None]:
print(len(bad_reviews))
bad_lda = TopicModeller(bad_reviews, 'tokens')
bad_lda.train_valid_lda()
bad_lda.view()

In [None]:
display(bad_lda.cv_results)

In [None]:
bad_lda.view()

In [None]:
print(len(neutral_reviews))
neutral_lda = TopicModeller(neutral_reviews, 'tokens')
neutral_lda.train_valid_lda()
neutral_lda.view()

In [None]:
display(neutral_lda.cv_results)

In [None]:
neutral_lda.view()

In [None]:
good_reviews = good_lda.get_docs_topics()

In [None]:
neutral_reviews = neutral_lda.get_docs_topics()

In [None]:
bad_reviews = bad_lda.get_docs_topics()

In [None]:
good_reviews.to_pickle('good_reviews.pickle')
bad_reviews.to_pickle('bad_reviews.pickle')
neutral_reviews.to_pickle('neutral_reviews.pickle')

In [None]:
good_lda.cv_results

In [None]:
good_reviews_3 = train_display_save(tokens=good_reviews['tokens'], nb_samples=100000, num_topics=3, alpha='asymmetric', eta=0.1, name='good_reviews')
bad_reviews_3 = train_display_save(tokens=bad_reviews['tokens'], nb_samples=100000, num_topics=3, alpha='asymmetric', eta=0.1, name='bad_reviews')
neutral_reviews_3 = train_display_save(tokens=neutral_reviews['tokens'], nb_samples=100000, num_topics=3, alpha='asymmetric', eta='symmetric', name='neutral_reviews')

In [None]:
int_start=time.time()
# Inputs
tokens_good = random.sample(good_reviews['tokens'].values.tolist(),100000)
# Create Dictionary
id2word_good = corpora.Dictionary(tokens_good)
# Term Document Frequency
corpus_good = [id2word_good.doc2bow(text) for text in tokens_good]
print(time.time()-int_start)

# Build LDA model
int_start=time.time()
lda_model_good = gensim.models.LdaMulticore(corpus=corpus_good,
                                       id2word=id2word_good,
                                       num_topics=3,
                                       alpha = 'asymmetric',
                                       eta = 0.1,
                                       random_state=3,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
print(time.time()-int_start)

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_good, corpus_good, id2word_good)
LDAvis_prepared

In [None]:
pyLDAvis.gensim.prepare(lda_model_good, corpus_good, id2word_good)

In [None]:
bad_lda.cv_results

In [None]:
int_start=time.time()
# Inputs
tokens_bad = random.sample(bad_reviews['tokens'].values.tolist(),100000)
# Create Dictionary
id2word_bad = corpora.Dictionary(tokens_bad)
# Term Document Frequency
corpus_bad = [id2word_bad.doc2bow(text) for text in tokens_bad]
print(time.time()-int_start)

# Build LDA model
int_start=time.time()
lda_model_bad = gensim.models.LdaMulticore(corpus=corpus_bad,
                                       id2word=id2word_bad,
                                       num_topics=3,
                                       alpha = 'asymmetric',
                                       eta = 0.1,
                                       random_state=3,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
print(time.time()-int_start)

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_bad, corpus_bad, id2word_bad)
LDAvis_prepared

In [None]:
int_start=time.time()
# Inputs
tokens_neutral = random.sample(neutral_reviews['tokens'].values.tolist(),100000)
# Create Dictionary
id2word_neutral = corpora.Dictionary(tokens_neutral)
# Term Document Frequency
corpus_neutral = [id2word_neutral.doc2bow(text) for text in tokens_neutral]
print(time.time()-int_start)

# Build LDA model
int_start=time.time()
lda_model_neutral = gensim.models.LdaMulticore(corpus=corpus_neutral,
                                       id2word=id2word_neutral,
                                       num_topics=3,
                                       alpha = 'asymmetric',
                                       eta = 'symmetric',
                                       random_state=3,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
print(time.time()-int_start)

pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_neutral, corpus_neutral, id2word_neutral)
LDAvis_prepared

## NLP Preprocessing

In [None]:
docs = pickle.load(open("docs.pickle", "rb"))

d = corpora.Dictionary(docs)

freq = pd.DataFrame(d.dfs.values(), index=d.dfs.keys(), columns=['freq'])
freq.index.name = 'idx'
freq = freq.reset_index()
freq['token'] = freq['idx'].apply(lambda x:d[x])
freq = freq.sort_values(by='freq', ascending=False)

In [None]:
freq[freq['freq']>500]

## Validating Model

In [None]:
# Inputs
docs = random.sample(docs, 100000)

# Create Dictionary
id2word = corpora.Dictionary(docs)
pickle.dump(id2word, open( "id2word.p", "wb" ) )

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in docs]

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, text, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=text, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [None]:
grid = {}
grid['Validation_Set'] = {}

nb_words = len(id2word)

# Topics range
min_topics = 6
max_topics = 10
step_size = 1
# topics_range = range(min_topics, max_topics, step_size)
topics_range = [8]
# Alpha parameter
# Added in the loop

# Alpha
alpha = [
#     0.1, 
#     'symmetric',
#     'asymmetric'
]

# Beta parameter
beta = [
    0.1, 
#     200/nb_words
]
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [
#     gensim.utils.ClippedCorpus(corpus, num_of_docs*0.05), 
#     gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
#     gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), 
    self.corpus
]

corpus_title = [
#     '25% Corpus',
#     '50% Corpus',
#     '75% Corpus',
    '100% Corpus'
]

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*(len(alpha)+1)*len(topics_range)))
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            alpha.append(50/k)
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, text=docs,
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    pbar.update(1)
    res = pd.DataFrame(model_results)
    res = pd.DataFrame(model_results).sort_values("Coherence", ascending=False)
    res.to_csv('lda_tuning_results.csv', index=False)
    pbar.close()

In [None]:
res

## Training Best Model

In [None]:
best_param = res.iloc[0]
num_topics = best_param['Topics']
alpha = best_param['Alpha']
eta = best_param['Beta']

In [None]:
# Build LDA model
int_start=time.time()
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       alpha = alpha,
                                       eta = eta,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
print(time.time()-int_start)

In [None]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

In [None]:
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [None]:
lda_model.save('lda_test.model')

In [None]:
print(time.time()-start)

## Predictions

In [None]:
# df = pd.read_pickle('reviews_concat.pkl')

In [None]:
# df = df.loc['2019']

In [None]:
# print(len(df))

In [None]:
# docs = list(tqdm.tqdm(preprocess(df.values.tolist(), stop_words), position=0, leave=True))

In [None]:
# pickle.dump(docs, open( "docs.p", "wb" ) )

In [None]:
# docs = pickle.load(open("docs.p", "rb"))
# id2word = pickle.load(open("id2word.p", "rb"))

# # Term Document Frequency
# corpus = [id2word.doc2bow(text) for text in docs]

# output = pd.concat([df.to_frame('description').reset_index(), pd.DataFrame(gensim.matutils.corpus2csc(lda_model.get_document_topics(corpus)).T.toarray(), columns=['topic_'+str(i) for i in range(1,num_topics+1)])], axis=1, ignore_index=True)

# output.to_pickle('reviews_w_topics_test.pkl')

## Appendix

In [None]:
# # Build the bigram and trigram models
# bigram = gensim.models.Phrases(data_words, min_count=5, threshold=150) # higher threshold fewer phrases.
# # trigram = gensim.models.Phrases(bigram[data_words], threshold=150)

# # Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod = gensim.models.phrases.Phraser(bigram)
# # trigram_mod = gensim.models.phrases.Phraser(trigram)

# def remove_stopwords(texts):
#     return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# def make_bigrams(texts):
#     return [bigram_mod[doc] for doc in texts]

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

# def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent)) 
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return texts_out

# # Remove Stop Words
# data_words = remove_stopwords(data_words)

# # Do lemmatization keeping only noun, adj, vb, adv
# data_words = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# # Form Bigrams
# data_words = make_bigrams(data_words)

In [None]:
# # Build LDA model
# lda_model = gensim.models.LdaMulticore(corpus=corpus,
#                                        id2word=id2word,
#                                        num_topics=8, 
#                                        random_state=100,
#                                        chunksize=100,
#                                        passes=10,
#                                        per_word_topics=True)

In [None]:
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

In [None]:
# coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('Coherence Score: ', coherence_lda)