<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>
<br></br>

# Topic Modeling
## *Data Science Unit 4 Sprint 1 Assignment 4*

Analyze a corpus of Amazon reviews from Unit 4 Sprint 1 Module 1's lecture using topic modeling: 

- Fit a Gensim LDA topic model on Amazon Reviews
- Select appropriate number of topics
- Create some dope visualization of the topics
- Write a few bullets on your findings in markdown at the end
- **Note**: You don't *have* to use generators for this assignment

In [1]:
import numpy as np
import gensim
import os
import re

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora

from gensim.models.ldamulticore import LdaMulticore

import pandas as pd
import spacy
nlp = spacy.load("en_core_web_lg")

In [2]:
%pwd

'/Users/andrewlowe/Lambda_Repositories/unit4/DS-Unit-4-Sprint-1-NLP/module4-topic-modeling'

In [3]:
path = '../module1-text-data/data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv'

df = pd.read_csv(path)

In [8]:
df['reviews.text'].head()

0    I order 3 of them and one of the item is bad q...
1    Bulk is always the less expensive way to go fo...
2    Well they are not Duracell but for the price i...
3    Seem to work as well as name brand batteries a...
4    These batteries are very long lasting the pric...
Name: reviews.text, dtype: object

In [12]:
def tokenize(text):
    "Complete this function"
    
    return [token for token in simple_preprocess(text) if token in STOPWORDS]

In [30]:
df['tokens'] = df['reviews.text'].apply(tokenize)
df.tokens.shape

(28332,)

In [27]:
id2word = corpora.Dictionary(list(df['tokens']))

In [33]:
import sys
sys.getsizeof(id2word)

56

In [34]:
id2word.filter_extremes(no_below=5, no_above=0.95)


In [35]:
len(id2word.keys())

279

In [37]:
corpus = [id2word.doc2bow(text) for text in list(df['tokens'])]

In [39]:
corpus[0]

[(0, 1),
 (1, 1),
 (2, 2),
 (3, 1),
 (4, 3),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 2),
 (9, 1),
 (10, 2)]

In [40]:
lda = LdaMulticore(corpus=corpus,      # LDA multicore is the multiple distributed version of LDA. (like n_jobs on sklearn.)
                   id2word=id2word,     
                   random_state=723812,   # Random state not supre reliable in distributed model
                   num_topics = 15,     
                   passes=10,           # Num of times it tries to fit the distributions. (This has strongest effect of runtime). Maybe try 100 or 200?
                   workers=8     # Also number of distributed cores to multicore.
                  )

In [41]:
lda.print_topics()

[(0,
  '0.083*"the" + 0.081*"one" + 0.052*"my" + 0.050*"for" + 0.050*"so" + 0.049*"and" + 0.045*"this" + 0.040*"had" + 0.038*"fire" + 0.035*"no"'),
 (1,
  '0.259*"as" + 0.061*"the" + 0.058*"last" + 0.049*"just" + 0.044*"well" + 0.040*"name" + 0.038*"and" + 0.033*"these" + 0.028*"they" + 0.028*"other"'),
 (2,
  '0.123*"it" + 0.077*"for" + 0.074*"he" + 0.072*"and" + 0.065*"my" + 0.053*"was" + 0.049*"the" + 0.045*"this" + 0.043*"to" + 0.028*"his"'),
 (3,
  '0.094*"this" + 0.093*"the" + 0.093*"for" + 0.087*"is" + 0.086*"and" + 0.060*"my" + 0.059*"it" + 0.053*"has" + 0.048*"to" + 0.030*"that"'),
 (4,
  '0.176*"it" + 0.118*"for" + 0.089*"she" + 0.083*"my" + 0.059*"and" + 0.051*"this" + 0.042*"to" + 0.041*"her" + 0.025*"as" + 0.025*"so"'),
 (5,
  '0.114*"the" + 0.105*"very" + 0.081*"to" + 0.073*"of" + 0.060*"with" + 0.049*"and" + 0.044*"for" + 0.034*"they" + 0.027*"are" + 0.022*"in"'),
 (6,
  '0.142*"to" + 0.122*"it" + 0.087*"and" + 0.062*"my" + 0.042*"with" + 0.024*"the" + 0.024*"is" + 0.020

In [42]:
words = [re.findall(r'"([^"]*)"',t[1]) for t in lda.print_topics()]


In [43]:
topics = [' '.join(t[0:5]) for t in words]

In [44]:
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
the one my for so

------ Topic 1 ------
as the last just well

------ Topic 2 ------
it for he and my

------ Topic 3 ------
this the for is and

------ Topic 4 ------
it for she my and

------ Topic 5 ------
the very to of with

------ Topic 6 ------
to it and my with

------ Topic 7 ------
the to you it for

------ Topic 8 ------
for to it can and

------ Topic 9 ------
the is and it to

------ Topic 10 ------
the of in them and

------ Topic 11 ------
for we our and the

------ Topic 12 ------
the is can it you

------ Topic 13 ------
and to have it this

------ Topic 14 ------
they these and to are



In [45]:
import pyLDAvis.gensim    # Helps us visulize the topic models.

pyLDAvis.enable_notebook()

In [46]:
# Visualization of distance (similarity) of all these topics. size = size of topics in corpus. 

pyLDAvis.gensim.prepare(lda, corpus, id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [47]:
from gensim.models.coherencemodel import CoherenceModel

def compute_coherence_values(dictionary, corpus, limit, start=2, step=3, passes=5):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : Max num of topics
    passes: the number of times the entire lda model & coherence values are calculated

    Returns:
    -------
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    
    coherence_values = []
    
    for iter_ in range(passes):
        for num_topics in range(start, limit, step):
            model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary, workers=4)
            coherencemodel = CoherenceModel(model=model,dictionary=dictionary,corpus=corpus, coherence='u_mass')
            coherence_values.append({'pass': iter_, 
                                     'num_topics': num_topics, 
                                     'coherence_score': coherencemodel.get_coherence()
                                    })

    return coherence_values

In [None]:
# Can take a long time to run.
coherence_values = compute_coherence_values(dictionary=id2word, 
                                                        corpus=corpus,
                                                        start=2, 
                                                        limit=40, 
                                                        step=6,
                                                        passes=40)

In [None]:
topic_coherence = pd.DataFrame.from_records(coherence_values)

In [None]:
topic_coherence.head()

In [None]:
import seaborn as sns

ax = sns.lineplot(x="num_topics", y="coherence_score", data=topic_coherence)

## Stretch Goals

* Incorporate Named Entity Recognition in your analysis
* Incorporate some custom pre-processing from our previous lessons (like spacy lemmatization)
* Analyze a dataset of interest to you with topic modeling