In [1]:
import os
import sys

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import  CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from pprint import pprint

# Plotting tools
!{sys.executable} -m pip install pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



In [2]:
# Execute the companion notebook that sits next to this one. Its captured
# output (below) shows it installing html.parser and printing sample text
# before and after several processing steps.
# NOTE(review): presumably this defines the text-normalization helpers; none
# of them are referenced by name in the cells visible here -- confirm whether
# this run is still required.
%run ./Text_Normalization_Function.ipynb

Collecting html.parser
Installing collected packages: html.parser
Successfully installed html.parser
Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  ['<', 'p', '>', 'The', 'circus', 'dog', 'in', 'a', 'plissé', 'skirt', 'jumped', 'over', 'Python', 'who', 'was', "n't", 'that', 'large', ',', 'just', '3', 'feet', 'long.', '<', '/p', '>']
Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  <p>The circus dog in a plissé skirt jumped over Python who was not that large, just 3 feet long.</p>
Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  [('<', 'a'), ('p', 'n'), ('>', 'v'), ('the', None), ('circus', 'n'), ('dog', 'n'), ('in', None), ('a', None), ('plissé', 'n'), ('skirt', 'n'), ('jumped', 'v'), ('over', None), ('python', 'n'), ('who', None), ('was', 'v'), ("n't", 'r'), ('t

In [3]:
# Directory that holds the review data. The loading cell appends the file name
# by plain string concatenation, so the value must end with a path separator.
# Set the AMAZON_REVIEWS_DIR environment variable to run the notebook on
# another machine; when it is unset the original location is used unchanged.
path = os.path.join(
    os.environ.get(
        'AMAZON_REVIEWS_DIR',
        '/Users/xingxueyan/Desktop/Curriculum/SPRING19/Text Mining/amazon-fine-food-reviews/',
    ),
    '',  # joining with '' appends a trailing separator only if one is missing
)

In [4]:
def get_topic_words(vectorizer, lda_model, n_words):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        older ``get_feature_names()``. The returned terms must line up with
        the entries of each weight vector in ``lda_model.components_``.
    lda_model : fitted topic model
        Must expose ``components_``, iterable as one weight vector per topic.
    n_words : int
        Number of terms to keep per topic.

    Returns
    -------
    list of list of str
        One inner list per topic, ordered from highest to lowest weight.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back for older installations.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    keywords = np.asarray(names_getter())

    topic_words = []
    for topic_weights in lda_model.components_:
        # Negate so that an ascending argsort yields descending weights.
        top_word_locs = np.argsort(-np.asarray(topic_weights))[:n_words]
        topic_words.append(keywords.take(top_word_locs).tolist())
    return topic_words

In [5]:
# Load the review table; the file name is appended to the data directory by
# string concatenation.
data = pd.read_csv(
    filepath_or_buffer=path + 'Reviews_sub_10000.csv',
)

In [6]:
# Take the Normalized_Text column as unicode strings for the vectorizer.
# Missing entries are replaced with an empty document first: astype('U') on
# its own would stringify NaN into the literal text "nan", which the
# vectorizer would then count as a real token.
normalized_corpus = data['Normalized_Text'].fillna('').astype('U')

In [7]:
# Build the bag-of-words document-term counts from the normalized reviews.
bow_vectorizer = CountVectorizer()
bow_corpus = bow_vectorizer.fit_transform(normalized_corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and convert to a list so that
# bow_feature_names keeps the type the old method returned.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    bow_feature_names = bow_vectorizer.get_feature_names_out().tolist()
else:
    bow_feature_names = bow_vectorizer.get_feature_names()

In [8]:
# Number of topics passed as n_components to the LDA model fitted in the next cell.
no_topics = 3

In [9]:
# Fit an LDA topic model with `no_topics` components on the bag-of-words
# counts; the seed is fixed so that repeated runs give the same topics.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=100,
    random_state=42,
).fit(bow_corpus)



In [10]:
# Tabulate the strongest terms of each fitted topic:
# one row per topic, one column per rank.
no_top_words = 10
topic_words = get_topic_words(bow_vectorizer, lda, no_top_words)
pd.DataFrame(
    topic_words,
    index=["Topic_{}".format(topic_no) for topic_no, _ in enumerate(topic_words)],
    columns=["word_{}".format(rank) for rank in range(no_top_words)],
)

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9
Topic_0,coffee,good,buy,amazon,order,taste,like,price,product,cup
Topic_1,food,dog,cat,love,product,treat,eat,like,give,good
Topic_2,taste,like,flavor,good,tea,great,use,love,eat,sugar


In [18]:
# Second run: rebind the topic count to 2. The next cell refits `lda` with
# this value, replacing the model fitted earlier.
no_topics = 2

In [19]:
# Refit the LDA topic model with the current `no_topics` on the same
# bag-of-words counts; the seed is fixed so repeated runs give the same topics.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=100,
    random_state=42,
).fit(bow_corpus)



In [20]:
# Tabulate the strongest terms of each topic in the refitted model:
# one row per topic, one column per rank.
no_top_words = 10
topic_words = get_topic_words(bow_vectorizer, lda, no_top_words)
pd.DataFrame(
    topic_words,
    index=["Topic_{}".format(topic_no) for topic_no, _ in enumerate(topic_words)],
    columns=["word_{}".format(rank) for rank in range(no_top_words)],
)

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9
Topic_0,taste,like,good,flavor,coffee,tea,great,use,buy,love
Topic_1,food,dog,product,love,eat,cat,treat,like,good,buy
