# LDA Model for Visualization

In [0]:
# imports needed for data
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# read in the data with pandas
data = pd.read_parquet('clean_review_0.parquet')
data = data[['business_id', 'token']]
print(data.shape)
data.head()

(20000, 2)


Unnamed: 0,business_id,token
0,ujmEBvifdJM6h6RLv4wQIg,"[Total, bill, for, this, horrible, service, Ov..."
1,NZnhc2sEQy3RmzKTZnqtwQ,"[I, adore, Travis, at, the, Hard, Rock, 's, ne..."
2,WTqjgwHlXbSFevF32_DJVw,"[I, have, to, say, that, this, office, really,..."
3,ikCg8xy5JIg_NGPx-MSIDA,"[Went, in, for, a, lunch, Steak, sandwich, was..."
4,b1b1eb3uo-w561D0ZfCEiQ,"[Today, was, my, second, out, of, three, sessi..."


In [6]:
# create a variable for later inputs
token = data['token']
token.shape

(20000,)

In [0]:
# Fit and transform the processed titles
cv = CountVectorizer(stop_words='english')
cvdata = cv.fit_transform(data['token'].astype(str))

In [8]:
print(cvdata[0])

  (0, 32865)	1
  (0, 15745)	1
  (0, 28645)	1
  (0, 841)	1
  (0, 8432)	1
  (0, 1187)	1
  (0, 21692)	1
  (0, 6210)	1
  (0, 735)	1
  (0, 24077)	2
  (0, 6299)	1
  (0, 22489)	1
  (0, 207)	1
  (0, 6035)	1
  (0, 2925)	1
  (0, 15773)	1
  (0, 11505)	1
  (0, 8059)	1


After fitting we can set up the corpus and dictionary

In [0]:
# imports for LDA with Gensim
from gensim import matutils, models
import scipy.sparse

In [0]:
# we're going to put the data into a new gensim format

sparse_counts = scipy.sparse.csr_matrix(cvdata)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [0]:
# gensim also requires a dictionary of all the terms, and possibly their location.

# cv = pickle.load(open("SOMETHING.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

now that we have the corpus (TDM) and id2word (dictionary of location: term) we will need to specify 2 other parameters - The nunber of Topics and The number of Passes. We'll start the number of topics at 2, see if it makes sense and adjust from there

In [0]:
# set the lda model and the parameters
# 2 topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.002*"fuddrucker" + 0.002*"bedeckecked" + 0.001*"crying" + 0.001*"guaranteeing" + 0.001*"magnífico" + 0.001*"garlic" + 0.001*"flamed" + 0.001*"carpeted" + 0.001*"cataplana" + 0.001*"jayme"'),
 (1,
  '0.001*"almonds" + 0.001*"830" + 0.001*"emerald" + 0.001*"cornmeal" + 0.001*"5people" + 0.001*"editor" + 0.001*"ethically" + 0.001*"3k" + 0.001*"breweries" + 0.001*"ditsy"')]

In [0]:
# 3 topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.002*"bedeckecked" + 0.002*"magnífico" + 0.001*"flamed" + 0.001*"garlic" + 0.001*"carpeted" + 0.001*"jayme" + 0.001*"chère" + 0.001*"heureusement" + 0.001*"dsl" + 0.001*"causa"'),
 (1,
  '0.004*"crying" + 0.002*"lawd" + 0.002*"employment" + 0.002*"crazy" + 0.002*"happening" + 0.002*"commas" + 0.002*"changing" + 0.002*"00pm" + 0.001*"1957" + 0.001*"chinatowns"'),
 (2,
  '0.002*"fuddrucker" + 0.001*"guaranteeing" + 0.001*"almonds" + 0.001*"burgatory" + 0.001*"emerald" + 0.001*"cultural" + 0.001*"830" + 0.001*"lyons" + 0.001*"maniac" + 0.001*"cornmeal"')]

In [0]:
# 4 topics
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.001*"almonds" + 0.001*"emerald" + 0.001*"830" + 0.001*"5people" + 0.001*"editor" + 0.001*"3k" + 0.001*"ditsy" + 0.001*"hibatchi" + 0.001*"1400" + 0.001*"alberta"'),
 (1,
  '0.003*"magnífico" + 0.002*"instalments" + 0.002*"burg" + 0.002*"cataplana" + 0.002*"backpack" + 0.002*"honoredas" + 0.002*"edm" + 0.002*"bellies" + 0.001*"lifelong" + 0.001*"caffe"'),
 (2,
  '0.029*"fuddrucker" + 0.019*"crying" + 0.017*"guaranteeing" + 0.013*"burgatory" + 0.013*"cultural" + 0.012*"lyons" + 0.012*"maniac" + 0.012*"g0dd" + 0.012*"izzy" + 0.011*"festively"'),
 (3,
  '0.002*"bedeckecked" + 0.001*"garlic" + 0.001*"flamed" + 0.001*"carpeted" + 0.001*"jayme" + 0.001*"lawd" + 0.001*"heureusement" + 0.001*"chère" + 0.001*"crescendoing" + 0.001*"causa"')]

The output: first row shows the top words for the 1st topic, then below will be the rows for the 2nd topic, etc


The next level will be to get Nouns and Adjectives only. This will polish the topics being found. 

In [17]:
# There was an error message later that said this install and download was required in order to move on
!pip install nltk



In [0]:
import nltk

In [20]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

Now that nltk was installed and imported

In [0]:
# Let's create a function to pull out the nouns and adj from the text.
# NN is used for nouns and JJ is used for Adjectives
from nltk import pos_tag

def nouns_adj(text):
  is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
  tokenized = token
  nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj]
  return ' '.join(nouns_adj)

In [0]:
# read in the cleaned data, before the vectorizer step

data_clean = token

In [21]:
# apply the nouns adj function  to the transcripts to filter

data_nouns_adj = pd.DataFrame(data_clean.apply(nouns_adj))
data_nouns_adj

AttributeError: ignored

the output will be each doc with their transcript

In [0]:
# create a new DTM only using the nouns and adj

data_cv = data_nouns_adj.transcript
data_dtm = pd.DataFrame(data_cv.toarray(), columns = data_cv.get_feature_names)  
data_dtm.index = data_nouns_adj.index
data_dtm

now we can recreate everything to include what we've made


In [0]:
# create the gensim corpus

corpusna = matutils.Sparse2Corpus(scipy.sparse,scr_matrix(data_dtm.transpose()))

# create the vocabulary dictionary

id2wordna = dict((v, k) for k, v in  data_cv.vocabulary_.items())

In [0]:
# start with 2 topics again

ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldna.print_topics()

In [0]:
# try 3 topics

ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldna.print_topics()

In [0]:
# try 4 topics

ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldna.print_topics()

When the topics start looking different we can go with that to the next step.

In [0]:
# run more iterations on our "final model"
# what increasing the passes does is it stabalizes which words falls into a topic

ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=80)
ldna.print_topics()

In [0]:
# now we can look at which topic each doc or transcript contains 

corpus_transformed = ldna[corpusna]
list(zip([a for [(a,b)] in corpus_transformed], data_dtm.index))