In [2]:
#Topic Mining

In [3]:
import pandas as pd
import spacy
import time
import nltk

In [31]:
# Only Arizona Businesses, Change if needed
restaurant_file='processed_data/cleaned_restaurants.csv'
reviews_file   ='processed_data/cleaned_reviews.csv'

# Number of topic
NUM_TOPICS = 10

In [5]:
%%time
# This is the large Spacy English Library
nlp = spacy.load('en_core_web_lg')

Wall time: 10.2 s


In [32]:
%%time
# Read Businesses
all_restaurants = pd.read_csv(restaurant_file).drop(labels='Unnamed: 0', axis=1)

Wall time: 8 ms


In [33]:
%%time
# Read all reviews
all_reviews = pd.read_csv(reviews_file).drop(labels='Unnamed: 0', axis=1).drop(labels='date', axis=1)

Wall time: 47 ms


In [34]:
%%time
# Top 5 Reviews
all_reviews.head()

Wall time: 0 ns


Unnamed: 0,business_id,name,city,state,business_stars,review_count,categories,review_id,user_id,review_stars,useful,text,cleansed_text
0,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,Phoenix,AZ,3.5,269,"Chinese, Restaurants",QgV9RPyPUC3cAse1Wxqoow,P3cMpkppvBuVpPD8LBTbBQ,4.0,2,Enjoyed Nee House immensely. No service issues...,enjoy immensely service issue fantastic chines...
1,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,Phoenix,AZ,3.5,269,"Chinese, Restaurants",1ZTO6zFtVVxtXclHp4TvHQ,b1yLsCdv4ZL_d3INMCZzoA,3.0,0,I'm not sure how I rate this restaurant becaus...,sure rate walk soup pm destroy need clear sinu...
2,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,Phoenix,AZ,3.5,269,"Chinese, Restaurants",h17ep5S7O8_JMKovooWoVA,TaVuQWmXAhxy_LvIXBs9sg,5.0,0,The food from this place reminds me of home. I...,food place remind home taiwanese little island...
3,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,Phoenix,AZ,3.5,269,"Chinese, Restaurants",FVJaiFuf67Dzamax-zq1UQ,aCY0jMl8Jsvx_HQL4D3tmw,4.0,2,Found this place by just driving down the road...,find place drive road lunch car decide wednesd...
4,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,Phoenix,AZ,3.5,269,"Chinese, Restaurants",zqzOcreb9KBTaESR6qbTSg,TPZbqNXMA2xMXLLd1zL_0A,4.0,2,We eat here on a regular basis. It's like tha...,eat regular basis little hole_wall place tasty...


In [140]:
restaurant = all_restaurants.iloc[[1]].business_id.values[0]

In [141]:
restaurant

'r8764MtYyt8JhxMvrfM_xQ'

In [142]:
reviews = all_reviews[all_reviews.business_id == restaurant].cleansed_text

In [143]:
reviews

285    disappoint recent visit vintage month always t...
286    staff friendly excellent prepare traditional w...
287    spouse find deal groupon plan great evening at...
288    great live_music cozy fireplace recommend flat...
289    table ready baby ve dinner drink never sunday ...
                             ...                        
609    great experience wife night groupon discount c...
610    writing review ultimately return vintage impre...
611    wife date saturday decide try vintage time awe...
612    try vintage time friendly staff fun atmosphere...
613    warm classy divine word describe food wine mus...
Name: cleansed_text, Length: 329, dtype: object

In [144]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures

In [145]:
# Creating a vectorizer
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(reviews)

In [146]:
#pip install pyLDAvis

In [147]:
warnings.filterwarnings('ignore')

import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings

In [148]:
# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [149]:
# Non-Negative Matrix Factorization Model
nmf = NMF(n_components=NUM_TOPICS)
data_nmf = nmf.fit_transform(data_vectorized) 

In [150]:
# Latent Semantic Indexing Model using Truncated SVD
lsi = TruncatedSVD(n_components=NUM_TOPICS)
data_lsi = lsi.fit_transform(data_vectorized)

In [164]:
# Get Topic Name
def get_topic_name(tok):
    topic_name_toks = []
    doc = nlp(" ".join(tok))
    pos = [token.pos_ for token in doc]
    
    def remove_at(j):
        topic_name_toks.append(tok[j].capitalize())
        pos.remove(pos[j])
        tok.remove(tok[j])
    for x in range(5):
        i = 0
        if x < 3:
            if  ("ADJ"   in pos) : i = pos.index("ADJ")
            elif("PROPN" in pos) : i = pos.index("PROPN")
            elif("NOUN"  in pos) : i = pos.index("NOUN")
            elif("ADV"   in pos) : i = pos.index("ADV")
            elif("VERB"  in pos) : i = pos.index("VERB")
            else :i=0
        elif x < 5:
            if  ("NOUN"  in pos) : i = pos.index("NOUN")
            elif("PROPN" in pos) : i = pos.index("PROPN")
            elif("VERB"  in pos) : i = pos.index("VERB")
            elif("ADV"   in pos) : i = pos.index("ADV")
            else :i=0
        
        remove_at(i)
    
    return " ".join(topic_name_toks) 

In [165]:
# Functions for printing keywords for each topic
def get_selected_topics(model, vectorizer, top_n=10):
    topics = {}
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        name = get_topic_name([vectorizer.get_feature_names()[i] for i in topic.argsort()[:-top_n - 1:-1]])
        print("Topic Name: "+name)
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
        topics[idx]=name
    return topics

In [166]:
# Keywords for topics clustered by Latent Dirichlet Allocation
print("LDA Model:")
selected_topics = get_selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
Topic Name: Good Nice Wasnot Order
[('good', 25.98302556810524), ('order', 22.1691985251735), ('flavor', 21.50529570204471), ('food', 20.531320253338517), ('cheese', 18.36329324832126), ('think', 17.39966940205284), ('bruschetta', 15.674235325877968), ('nice', 15.311505266730318), ('wasnot', 14.299653038398995), ('salad', 13.492548883425906)]
Topic 1:
Topic Name: Excellent Sweet Ahi Rice
[('ahi', 6.436629421340358), ('great', 6.37391265879874), ('salmon', 6.269689521302305), ('rice', 5.255661172093214), ('pepper', 5.197534859885082), ('fry', 4.953689099863716), ('sauce', 4.635824396185133), ('excellent', 4.406948708543913), ('egg', 4.202498111740189), ('sweet', 4.136787232164157)]
Topic 2:
Topic Name: Wine Table Menu Place
[('wine', 42.06875099901537), ('sit', 34.66057722246126), ('table', 19.799954391771102), ('want', 18.329674054499446), ('menu', 17.93468513120167), ('place', 15.332172421872936), ('hostess', 15.270652965046265), ('couch', 15.117142538034722), ('do

In [167]:
selected_topics

{0: 'Good Nice Wasnot Order',
 1: 'Excellent Sweet Ahi Rice',
 2: 'Wine Table Menu Place',
 3: 'Nice Good Wine Bottle',
 4: 'Best Good Little Lunch',
 5: 'Great Good Vintage Food',
 6: 'Platter Enjoyable Flavorful Couple',
 7: 'Best Perfect Management Request',
 8: 'Open Belly Tomato Smoke',
 9: 'Good Order Table Drink'}

In [155]:
# to be continued