<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>
<br></br>

# Topic Modeling
## *Data Science Unit 4 Sprint 1 Assignment 4*

Analyze a corpus of Amazon reviews from Unit 4 Sprint 1 Module 1's lecture using topic modeling: 

- Fit a Gensim LDA topic model on Amazon Reviews
- Select appropriate number of topics
- Create some dope visualization of the topics
- Write a few bullets on your findings in markdown at the end
- **Note**: You don't *have* to use generators for this assignment

In [12]:
import warnings
warnings.filterwarnings("ignore")

In [24]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
import pyLDAvis
import pyLDAvis.gensim 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm
tqdm.pandas()

In [14]:
df = pd.read_csv('May19.csv')

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28332 entries, 0 to 28331
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   28332 non-null  object 
 1   dateAdded            28332 non-null  object 
 2   dateUpdated          28332 non-null  object 
 3   name                 28332 non-null  object 
 4   asins                28332 non-null  object 
 5   brand                28332 non-null  object 
 6   categories           28332 non-null  object 
 7   primaryCategories    28332 non-null  object 
 8   imageURLs            28332 non-null  object 
 9   keys                 28332 non-null  object 
 10  manufacturer         28332 non-null  object 
 11  manufacturerNumber   28332 non-null  object 
 12  reviews.date         28332 non-null  object 
 13  reviews.dateSeen     28332 non-null  object 
 14  reviews.didPurchase  9 non-null      object 
 15  reviews.doRecommend  16086 non-null 

In [25]:
pd.set_option('display.max_colwidth', 0)

In [27]:
df['reviews.text']

0        I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.                                                                                                                                                                                  
1        Bulk is always the less expensive way to go for products like these                                                                                                                                                                                                                                                          
2        Well they are not Duracell but for the price i am happy.                                                                                                                                                                                                                                                                     
3        Seem to wo

In [28]:
# 1. Remove new line characters
df['clean_text'] = df['reviews.text'].apply(lambda x: re.sub('\s+', ' ', x))

# 2. Remove Emails
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub('From: \S+@\S+', '', x))

# 3. Remove non-alphanumeric characters
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

# 4. Remove extra whitespace 
df['clean_text'] = df['clean_text'].apply(lambda x: ' '.join(x.split()))

In [19]:
vect = TfidfVectorizer(min_df=5)

In [20]:
df.keys()

Index(['id', 'dateAdded', 'dateUpdated', 'name', 'asins', 'brand',
       'categories', 'primaryCategories', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'reviews.date', 'reviews.dateSeen',
       'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.username', 'sourceURLs'],
      dtype='object')

In [22]:
nlp = spacy.load("en_core_web_lg")

In [29]:
def get_lemmas(x):
    lemmas = []
    for token in nlp(x):
        if (token.is_stop!=True) and (token.is_punct!=True):
            lemmas.append(token.lemma_)
    return lemmas

df['lemmas'] = df['clean_text'].progress_apply(get_lemmas)

100%|███████████████████████████████████████████████| 28332/28332 [10:00<00:00, 47.16it/s]


In [30]:
# Create Dictionary
id2word = corpora.Dictionary(df['lemmas'])

# How many words do we have?
print(f' Before filtering : {len(id2word.keys())} words in the custom dictionary')

# Let's remove extreme values from the dataset
id2word.filter_extremes(no_below=3, no_above=0.75)

# How many words do we have?
print(f' After filtering : {len(id2word.keys())} words in the custom dictionary')

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in df['lemmas']]

 Before filtering : 8677 words in the custom dictionary
 After filtering : 4336 words in the custom dictionary


In [31]:
%%time
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=50, 
                                            chunksize=100,
                                            passes=10,
                                            per_word_topics=True)

Wall time: 5min 11s


In [32]:
lda_model.save('lda_model.model2')

In [33]:
%%time
lda_multicore = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                        id2word=id2word,
                                                        num_topics=20, 
                                                        chunksize=100,
                                                        passes=10,
                                                        per_word_topics=True,
                                                        workers=12)

Wall time: 36.6 s


In [34]:
lda_multicore.save('lda_multicore.model2')

In [35]:
from gensim import models
lda =  models.LdaModel.load('lda_model.model2')

In [36]:
pprint(lda.print_topics())

[(3,
  '0.574*"easy" + 0.085*"take" + 0.079*"control" + 0.069*"camera" + '
  '0.053*"large" + 0.052*"run" + 0.037*"hand" + 0.010*"okay" + 0.010*"house" + '
  '0.006*"important"'),
 (26,
  '0.423*"nice" + 0.235*"light" + 0.127*"know" + 0.074*"thank" + 0.069*"fit" + '
  '0.053*"ok" + 0.000*"Ideal" + 0.000*"shocking" + 0.000*"usability" + '
  '0.000*"briefcase"'),
 (45,
  '0.512*"case" + 0.374*"cheap" + 0.062*"place" + 0.000*"briefcase" + '
  '0.000*"wander" + 0.000*"usability" + 0.000*"Ideal" + 0.000*"Verizon" + '
  '0.000*"ipad" + 0.000*"Drive"'),
 (13,
  '0.581*"need" + 0.283*"little" + 0.110*"fine" + 0.004*"household" + '
  '0.000*"briefcase" + 0.000*"usability" + 0.000*"Ideal" + 0.000*"Verizon" + '
  '0.000*"ipad" + 0.000*"unbox"'),
 (29,
  '0.684*"Amazon" + 0.122*"storage" + 0.033*"everyday" + 0.032*"gaming" + '
  '0.024*"believe" + 0.022*"provide" + 0.018*"kind" + 0.017*"short" + '
  '0.016*"future" + 0.009*"likely"'),
 (4,
  '0.710*"t" + 0.141*"don" + 0.061*"option" + 0.036*"spend

In [38]:
# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, 
                                     texts=df['lemmas'], 
                                     dictionary=id2word, 
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -29.585970179022205

Coherence Score:  0.48001971930444964


In [40]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, id2word, mds='mmds')
pyLDAvis.display(vis)

## Stretch Goals

* Incorporate Named Entity Recognition in your analysis
* Incorporate some custom pre-processing from our previous lessons (like spacy lemmatization)
* Analyze a dataset of interest to you with topic modeling