Guiding Question: What are customers saying about our movies?

## Our approach to the analysis
- Analyze the text of movie reviews
- Clean the review text
- Method: topic modeling
- Label most reviews with the most important topics
- Visualize the results

In [2]:
import pandas as pd
import gensim
import os
import re

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora

from gensim.models.ldamulticore import LdaMulticore

import pandas as pd

In [3]:
gensim.__version__

'3.8.1'

In [4]:
path = './data/austen-brontë-split'

In [5]:
# Extend gensim's stop-word list with a few corpus-specific filler words.
STOPWORDS = set(STOPWORDS) | {'said', 'mr', 'mrs'}

def tokenize(text):
    """Return the `simple_preprocess` tokens of `text` that are not in STOPWORDS."""
    kept = []
    for word in simple_preprocess(text):
        if word not in STOPWORDS:
            kept.append(word)
    return kept

In [6]:
import os

def gather_data(path_to_data, tokenizer=None):
    """
    Read every text file directly inside a directory and tokenize it.

    Parameters:
    ----------
    path_to_data : directory to scan (sub-directories are skipped, not descended into)
    tokenizer : optional callable mapping a string to a list of tokens;
        when None, the notebook's `tokenize` function is used

    Returns:
    -------
    data : list with one token list per file whose name ends in 'txt',
        in the order returned by `os.listdir`
    """
    if tokenizer is None:
        tokenizer = tokenize

    data = []
    # Iterate over the argument, not the notebook-level `path` variable, so the
    # function works for whatever directory the caller passes in.
    # The listing is deliberately left unsorted so it stays in the same order
    # as any other `os.listdir` call on the same directory.
    for f in os.listdir(path_to_data):
        full_path = os.path.join(path_to_data, f)
        # Test the joined path: a bare filename would be resolved against the
        # current working directory instead of path_to_data.
        if os.path.isdir(full_path):
            continue
        if f.endswith('txt'):
            # Explicit encoding so the result does not depend on the
            # platform's locale default.
            with open(full_path, encoding='utf-8') as t:
                data.append(tokenizer(t.read().strip('\n')))
    return data

In [7]:
tokens = gather_data(path)

In [8]:
tokens[0][0:10]

['giving',
 'fair',
 'companion',
 'account',
 'yesterday',
 'party',
 'friend',
 'cole',
 'come',
 'stilton']

In [9]:
"this is a sample string with a \n newline character".replace('\n', '')

'this is a sample string with a  newline character'

In [10]:
# Document titles: file names minus their 4-character '.txt' suffix.
# The filter matches the one used when reading the files (regular files whose
# name ends in 'txt'), and the directory check uses the joined path because a
# bare filename would be resolved against the current working directory.
# Order is whatever os.listdir returns, so titles line up with `tokens` only
# while the directory contents are unchanged between the two listings.
titles = [
    f[:-4]
    for f in os.listdir(path)
    if f.endswith('txt') and not os.path.isdir(os.path.join(path, f))
]

In [11]:
len(titles)

813

In [12]:
len(tokens)

813

In [13]:
df = pd.DataFrame(index=titles, data={'tokens':tokens})

In [14]:
df.head()

Unnamed: 0,tokens
Austen_Emma0026,"[giving, fair, companion, account, yesterday, ..."
Austen_Emma0032,"[feels, like, snow, place, party, try, day, di..."
CBronte_Villette0086,"[pierced, opaque, blackness, stood, bougie, qu..."
CBronte_Jane0099,"[joke, queer, looks, tell, thing, specially, s..."
CBronte_Villette0092,"[second, landing, floor, comprising, abode, kn..."


In [15]:
# Titles look like '<author>_<book><4-digit section>', e.g. 'Austen_Emma0026'.
index_labels = list(df.index)

# Everything before the first underscore is the author.
df['author'] = [label.split('_')[0] for label in index_labels]
# The second underscore-separated piece, minus its trailing 4 digits, is the book.
df['book'] = [label.split('_')[1][:-4] for label in index_labels]
# The last 4 characters are the section number, stored first as text ...
df['section'] = [label[-4:] for label in index_labels]
# ... then converted to an integer column.
df['section'] = df['section'].astype('int')

In [16]:
# Encode the author as a binary label: Austen -> 1, CBronte -> 0.
# NOTE: this overwrites the column in place, so the cell is not idempotent:
# re-running it looks the already-numeric values up in the dict, finds no
# match, and turns the whole column into NaN.
df['author'] = df['author'].map({'Austen':1, 'CBronte':0})

In [17]:
df.author.value_counts()

0    441
1    372
Name: author, dtype: int64

In [18]:
df.head()

Unnamed: 0,tokens,author,book,section
Austen_Emma0026,"[giving, fair, companion, account, yesterday, ...",1,Emma,26
Austen_Emma0032,"[feels, like, snow, place, party, try, day, di...",1,Emma,32
CBronte_Villette0086,"[pierced, opaque, blackness, stood, bougie, qu...",0,Villette,86
CBronte_Jane0099,"[joke, queer, looks, tell, thing, specially, s...",0,Jane,99
CBronte_Villette0092,"[second, landing, floor, comprising, abode, kn...",0,Villette,92


In [26]:
def doc_stream(path, tokenizer=None):
    """
    Lazily yield the tokens of each text file directly inside a directory.

    Parameters:
    ----------
    path : directory to scan (sub-directories are skipped, not descended into)
    tokenizer : optional callable mapping a string to a list of tokens;
        when None, the notebook's `tokenize` function is used

    Yields:
    -------
    tokens : the token list of one file whose name ends in 'txt', in the
        order returned by `os.listdir`; only one file is read at a time
    """
    if tokenizer is None:
        tokenizer = tokenize

    for f in os.listdir(path):
        full_path = os.path.join(path, f)
        # Test the joined path: a bare filename would be resolved against the
        # current working directory instead of `path`.
        if os.path.isdir(full_path):
            continue
        if f.endswith('txt'):
            # Explicit encoding so the result does not depend on the
            # platform's locale default.
            with open(full_path, encoding='utf-8') as t:
                text = t.read().strip('\n')
            # The file is already closed by the time the consumer receives the tokens.
            yield tokenizer(text)

In [27]:
streaming_data = doc_stream(path)

In [28]:
type(streaming_data)

generator

In [29]:
# gather_data => returns a list
# doc_stream => returns a generator

In [30]:
next(streaming_data) # Returns one document at a time from the generator

['giving',
 'fair',
 'companion',
 'account',
 'yesterday',
 'party',
 'friend',
 'cole',
 'come',
 'stilton',
 'cheese',
 'north',
 'wiltshire',
 'butter',
 'cellery',
 'beet',
 'root',
 'dessert',
 'soon',
 'led',
 'better',
 'course',
 'consoling',
 'reflection',
 'thing',
 'interests',
 'love',
 'thing',
 'serve',
 'introduction',
 'near',
 'heart',
 'kept',
 'longer',
 'away',
 'walked',
 'quietly',
 'till',
 'view',
 'vicarage',
 'pales',
 'sudden',
 'resolution',
 'getting',
 'harriet',
 'house',
 'amiss',
 'boot',
 'fall',
 'arrange',
 'broke',
 'lace',
 'short',
 'dexterously',
 'throwing',
 'ditch',
 'presently',
 'obliged',
 'entreat',
 'stop',
 'acknowledged',
 'inability',
 'rights',
 'able',
 'walk',
 'home',
 'tolerable',
 'comfort',
 'lace',
 'gone',
 'know',
 'contrive',
 'troublesome',
 'companion',
 'hope',
 'ill',
 'equipped',
 'elton',
 'beg',
 'leave',
 'stop',
 'house',
 'ask',
 'housekeeper',
 'bit',
 'ribband',
 'string',
 'thing',
 'boot',
 'elton',
 'looked',

In [32]:
# Dictionary Representation of all the words in our corpus
id2word = corpora.Dictionary(doc_stream(path))

In [33]:
id2word.token2id['england']

869

In [34]:
id2word.doc2bow(tokenize("This is a sample message Darcy England England England"))

[(869, 3), (1254, 1), (2485, 1), (16850, 1)]

In [35]:
import sys
# Caveat: sys.getsizeof is shallow. It reports the size of the container
# object itself, not of the objects it references, so neither number is the
# real memory footprint of the dictionary or of the token lists.
print(sys.getsizeof(id2word))
print(sys.getsizeof(tokens))

56
7056


In [36]:
len(id2word.keys())

22095

In [37]:
# Remove extreme values from the dataset
id2word.filter_extremes(no_below=5, no_above=0.95)

In [38]:
len(id2word.keys())

8102

In [39]:
# A bag-of-words (bow) representation of our corpus.
# Note: doc_stream hands over one tokenized document at a time, so the raw text
# is never held all at once. The list comprehension, however, does build the
# complete bow corpus as an in-memory list.
# Although abstracted away - tokenization IS happening in the doc_stream f(x)
corpus = [id2word.doc2bow(text) for text in doc_stream(path)]

In [40]:
corpus[345][:10]

[(4, 1),
 (17, 1),
 (28, 1),
 (30, 1),
 (31, 2),
 (40, 1),
 (46, 1),
 (51, 1),
 (55, 1),
 (71, 1)]

In [41]:
# Fit the topic model on the bow corpus with a fixed random_state seed.
# NOTE: the topic count 15 is also assumed by update() further down, so the
# two must be changed together.
# NOTE(review): workers=12 presumably matches the author's machine - confirm
# it suits the core count of wherever this is re-run.
lda = LdaMulticore(corpus=corpus,
                  id2word=id2word,
                  random_state=723812,
                  num_topics = 15,
                  passes=10,
                  workers=12)

In [42]:
lda.print_topics()

[(0,
  '0.009*"pounds" + 0.007*"mother" + 0.006*"sure" + 0.006*"think" + 0.006*"year" + 0.004*"rochester" + 0.004*"certainly" + 0.004*"thornfield" + 0.004*"fairfax" + 0.004*"live"'),
 (1,
  '0.008*"pelet" + 0.005*"thought" + 0.005*"like" + 0.004*"little" + 0.004*"man" + 0.004*"hunsden" + 0.004*"time" + 0.003*"having" + 0.003*"pupils" + 0.003*"crimsworth"'),
 (2,
  '0.008*"madame" + 0.006*"little" + 0.006*"like" + 0.004*"day" + 0.004*"vous" + 0.004*"thought" + 0.003*"know" + 0.003*"beck" + 0.003*"good" + 0.003*"paul"'),
 (3,
  '0.005*"night" + 0.005*"look" + 0.005*"like" + 0.004*"old" + 0.004*"est" + 0.004*"looked" + 0.003*"sir" + 0.003*"thought" + 0.003*"house" + 0.003*"hunsden"'),
 (4,
  '0.007*"little" + 0.005*"madame" + 0.005*"like" + 0.004*"thought" + 0.004*"know" + 0.004*"hand" + 0.004*"good" + 0.004*"night" + 0.003*"day" + 0.003*"dr"'),
 (5,
  '0.017*"elizabeth" + 0.013*"darcy" + 0.010*"miss" + 0.010*"bingley" + 0.010*"bennet" + 0.009*"jane" + 0.007*"know" + 0.007*"think" + 0.006

In [43]:
# Top 10 words of every topic, read from the model directly instead of being
# regex-parsed out of the formatted strings print_topics() returns. That also
# removes the dependency on print_topics' default limit of 20 topics, which
# would silently drop topics for a larger model.
words = [[word for word, _ in lda.show_topic(topic_id, topn=10)]
         for topic_id in range(lda.num_topics)]

In [44]:
topics = [' '.join(t[0:5]) for t in words]

In [46]:
# Print each topic label under a numbered banner.
# The loop variable is `topic_id` rather than `id`: at notebook top level a
# variable named `id` would shadow the builtin for every later cell.
for topic_id, t in enumerate(topics):
    print(f"----- Topic {topic_id} -----")
    print(t, end="\n\n")

----- Topic 0 -----
pounds mother sure think year

----- Topic 1 -----
pelet thought like little man

----- Topic 2 -----
madame little like day vous

----- Topic 3 -----
night look like old est

----- Topic 4 -----
little madame like thought know

----- Topic 5 -----
elizabeth darcy miss bingley bennet

----- Topic 6 -----
know jane life like long

----- Topic 7 -----
frances monsieur hunsden little english

----- Topic 8 -----
good little like emma elton

----- Topic 9 -----
sir long jane rochester little

----- Topic 10 -----
like little rochester miss room

----- Topic 11 -----
emma miss harriet thing weston

----- Topic 12 -----
jane emma little thing miss

----- Topic 13 -----
elinor marianne sister mother time

----- Topic 14 -----
monsieur henri georgiana mdlle little



In [48]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

In [49]:
pyLDAvis.gensim.prepare(lda, corpus, id2word)

In [50]:
lda[corpus[0]]

[(10, 0.13781828), (11, 0.7753989), (13, 0.08468007)]

In [51]:
distro = [lda[d] for d in corpus]

In [52]:
distro[0]

[(10, 0.13781089), (11, 0.77537), (13, 0.08471637)]

In [54]:
distro = [lda[d] for d in corpus]

def update(doc, num_topics=15):
    """
    Expand a sparse topic distribution into a dense {topic_id: weight} dict.

    Parameters:
    ----------
    doc : iterable of (topic_id, weight) pairs, such as the result of `lda[bow]`
    num_topics : how many topic ids (0 .. num_topics-1) to pre-fill with 0;
        the default of 15 is the topic count used for the model in this notebook

    Returns:
    -------
    d_dist : dict keyed by topic id; topics absent from `doc` keep the weight 0
    """
    # Start with every topic at 0 so all documents end up with the same keys.
    d_dist = dict.fromkeys(range(num_topics), 0)
    for topic_id, weight in doc:
        d_dist[topic_id] = weight
    return d_dist

new_distro = [update(d) for d in distro]

In [55]:
len(new_distro)

813

In [56]:
df.head()

Unnamed: 0,tokens,author,book,section
Austen_Emma0026,"[giving, fair, companion, account, yesterday, ...",1,Emma,26
Austen_Emma0032,"[feels, like, snow, place, party, try, day, di...",1,Emma,32
CBronte_Villette0086,"[pierced, opaque, blackness, stood, bougie, qu...",0,Villette,86
CBronte_Jane0099,"[joke, queer, looks, tell, thing, specially, s...",0,Jane,99
CBronte_Villette0092,"[second, landing, floor, comprising, abode, kn...",0,Villette,92


In [57]:
# One row per document (indexed by title), one column per topic weight.
df = pd.DataFrame.from_records(new_distro, index=titles)
# Replace the numeric topic ids with the human-readable topic labels.
df.columns = topics
# The author is the part of each title before the first underscore.
df['author'] = [title.split('_')[0] for title in df.index]

In [58]:
df.head()

Unnamed: 0,pounds mother sure think year,pelet thought like little man,madame little like day vous,night look like old est,little madame like thought know,elizabeth darcy miss bingley bennet,know jane life like long,frances monsieur hunsden little english,good little like emma elton,sir long jane rochester little,like little rochester miss room,emma miss harriet thing weston,jane emma little thing miss,elinor marianne sister mother time,monsieur henri georgiana mdlle little,author
Austen_Emma0026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137893,0.775696,0.0,0.084309,0.0,Austen
Austen_Emma0032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.997455,0.0,0.0,0.0,Austen
CBronte_Villette0086,0.0,0.0,0.0,0.0,0.215013,0.0,0.782775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CBronte
CBronte_Jane0099,0.0,0.0,0.0,0.0,0.0,0.0,0.522476,0.0,0.0,0.0,0.472818,0.0,0.0,0.0,0.0,CBronte
CBronte_Villette0092,0.0,0.0,0.0,0.0,0.561942,0.02727,0.0,0.0,0.0,0.0,0.370904,0.038231,0.0,0.0,0.0,CBronte


In [59]:
df.groupby('author').mean()

Unnamed: 0_level_0,pounds mother sure think year,pelet thought like little man,madame little like day vous,night look like old est,little madame like thought know,elizabeth darcy miss bingley bennet,know jane life like long,frances monsieur hunsden little english,good little like emma elton,sir long jane rochester little,like little rochester miss room,emma miss harriet thing weston,jane emma little thing miss,elinor marianne sister mother time,monsieur henri georgiana mdlle little
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Austen,0.00268,0.005259,0.000558,2.7e-05,0.002244,0.229092,0.005139,0.000836,0.016347,0.005213,0.014287,0.352109,0.023064,0.340474,0.0
CBronte,0.002808,0.035885,0.061142,0.008203,0.298076,0.0058,0.123273,0.026868,0.007873,0.012269,0.385654,0.005821,0.004652,0.014466,0.004484


In [64]:
from gensim.models.coherencemodel import CoherenceModel

def compute_coherence_values(dictionary, corpus, limit, start=2, step=3, passes=5):
    """
    Compute u_mass coherence for various numbers of topics.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : Max num of topics (exclusive, as in `range`)
    start : Min num of topics
    step : Increment between successive numbers of topics
    passes : number of times the whole sweep over topic counts is repeated
        (this is not forwarded as the LDA model's own `passes` argument)

    Returns:
    -------
    coherence_values : list of dicts with the keys 'pass', 'num_topics' and
        'coherence_score', one per fitted model; empty when either loop has
        nothing to iterate over
    """
    coherence_values = []

    for iter_ in range(passes):
        for num_topics in range(start, limit, step):
            # No random_state is passed, so repeated fits for the same
            # num_topics are presumably not identical - which is what makes
            # repeating the sweep informative.
            model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary, workers=4)
            coherencemodel = CoherenceModel(model=model, dictionary=dictionary, corpus=corpus, coherence='u_mass')
            coherence_values.append({
                'pass': iter_,
                'num_topics': num_topics,
                'coherence_score': coherencemodel.get_coherence(),
            })

    # Hand the collected records back; without this the function returns None
    # and the caller's `coherence_values` is unusable.
    return coherence_values

In [None]:
coherence_values = compute_coherence_values(dictionary=id2word,
                                            corpus=corpus,
                                            start=2,
                                            limit=40,
                                            step=2,
                                            passes=100)

In [None]:
# Tabulate the sweep results: one row per fitted model, with the columns
# 'pass', 'num_topics' and 'coherence_score'. `topic_coherence` was not
# defined anywhere in the notebook, so it is built here from the records.
topic_coherence = pd.DataFrame.from_records(coherence_values)
topic_coherence.head()

In [None]:
import seaborn as sns

ax = sns.lineplot(x="num_topics", y="coherence_score", data=topic_coherence)

In [None]:
# Print the coherence score for each number of topics, averaged over the
# repeated sweeps. `coherence_values` is a list of dict records (not plain
# floats), so it is aggregated by 'num_topics' before rounding.
mean_coherence = (
    pd.DataFrame(coherence_values)
    .groupby('num_topics')['coherence_score']
    .mean()
)
for m, cv in mean_coherence.items():
    print("Num Topics = ", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Score an unseen document: tokenize -> bag of words -> topic distribution.
lda[id2word.doc2bow(tokenize("This is a sample document to score with a topic distribution."))]