In [1]:
#Topic Mining

In [2]:
import pandas as pd
import spacy
import time
import nltk

In [3]:
# Only Arizona Businesses, Change if needed
restaurant_file='processed_data/cleaned_restaurants.csv'
reviews_file   ='processed_data/cleaned_reviews.csv'

# Number of topic
NUM_TOPICS = 10

In [4]:
%%time
# This is the large Spacy English Library
nlp = spacy.load('en_core_web_lg')

Wall time: 8.84 s


In [5]:
%%time
# Read Businesses
all_restaurants = pd.read_csv(restaurant_file).drop(labels='Unnamed: 0', axis=1)

Wall time: 15 ms


In [7]:
%%time
# Read all reviews
all_reviews = pd.read_csv(reviews_file).drop(labels='Unnamed: 0', axis=1).drop(labels='date', axis=1)

Wall time: 51 ms


In [8]:
%%time
# Top 5 Reviews
all_reviews.head()

Wall time: 1 ms


Unnamed: 0,business_id,name,business_stars,review_count,review_id,review_stars,useful,text,cleansed_text
0,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,3.5,269,QgV9RPyPUC3cAse1Wxqoow,4.0,2,Enjoyed Nee House immensely. No service issues...,enjoy immensely service issue fantastic chines...
1,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,3.5,269,1ZTO6zFtVVxtXclHp4TvHQ,3.0,0,I'm not sure how I rate this restaurant becaus...,not_sure_rate walk_soup_pm destroy need clear ...
2,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,3.5,269,h17ep5S7O8_JMKovooWoVA,5.0,0,The food from this place reminds me of home. I...,food remind home taiwanese little island china...
3,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,3.5,269,FVJaiFuf67Dzamax-zq1UQ,4.0,2,Found this place by just driving down the road...,find drive road lunch car decide stop wednesda...
4,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,3.5,269,zqzOcreb9KBTaESR6qbTSg,4.0,2,We eat here on a regular basis. It's like tha...,eat regular_basis little hole_wall tasty food ...


In [9]:
restaurant = all_restaurants.iloc[[1]].business_id.values[0]

In [10]:
restaurant

'r8764MtYyt8JhxMvrfM_xQ'

In [28]:
reviews = all_reviews[all_reviews.business_id == restaurant].cleansed_text

In [16]:
reviews = reviews.replace('_',' ')

In [29]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures

In [30]:
# Creating a vectorizer
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(reviews)

  vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


In [19]:
#pip install pyLDAvis

In [31]:
import warnings
warnings.filterwarnings('ignore')

import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig


In [32]:
# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [33]:
# Non-Negative Matrix Factorization Model
nmf = NMF(n_components=NUM_TOPICS)
data_nmf = nmf.fit_transform(data_vectorized) 

In [34]:
# Latent Semantic Indexing Model using Truncated SVD
lsi = TruncatedSVD(n_components=NUM_TOPICS)
data_lsi = lsi.fit_transform(data_vectorized)

In [35]:
# Get Topic Name
def get_topic_name(tok):
    topic_name_toks = []
    doc = nlp(" ".join(tok))
    pos = [token.pos_ for token in doc]
    
    def remove_at(j):
        topic_name_toks.append(tok[j].capitalize())
        pos.remove(pos[j])
        tok.remove(tok[j])
    for x in range(5):
        i = 0
        if x < 3:
            if  ("ADJ"   in pos) : i = pos.index("ADJ")
            elif("PROPN" in pos) : i = pos.index("PROPN")
            elif("NOUN"  in pos) : i = pos.index("NOUN")
            elif("ADV"   in pos) : i = pos.index("ADV")
            elif("VERB"  in pos) : i = pos.index("VERB")
            else :i=0
        elif x < 5:
            if  ("NOUN"  in pos) : i = pos.index("NOUN")
            elif("PROPN" in pos) : i = pos.index("PROPN")
            elif("VERB"  in pos) : i = pos.index("VERB")
            elif("ADV"   in pos) : i = pos.index("ADV")
            else :i=0
        
        remove_at(i)
    
    return " ".join(topic_name_toks) 

In [36]:
# Functions for printing keywords for each topic
def get_selected_topics(model, vectorizer, top_n=10):
    topics = {}
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        name = get_topic_name([vectorizer.get_feature_names()[i] for i in topic.argsort()[:-top_n - 1:-1]])
        print("Topic Name: "+name)
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
        topics[idx]=name
    return topics

In [37]:
# Keywords for topics clustered by Latent Dirichlet Allocation
print("LDA Model:")
selected_topics = get_selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
Topic Name: Great Good Excellent Sauce Brunch
[('brunch', 8.999995434484019), ('sauce', 7.78255723793549), ('egg', 7.752249695219515), ('great', 7.490098285016525), ('good', 5.820838880603943), ('sunday', 5.696408187095436), ('ahi', 5.493595025206661), ('excellent', 5.082582021774999), ('strip', 4.781528920708476), ('pulled', 4.161502160754456)]
Topic 1:
Topic Name: Good Vintage Food Table Burger
[('good', 63.48294716534114), ('food', 57.31009485004116), ('table', 47.71597137267571), ('try', 47.11213291524162), ('burger', 46.775711302558655), ('enjoy', 42.02448840551378), ('vintage', 40.02833875419047), ('server', 39.476529417166105), ('time', 36.2747249171881), ('cheese', 35.378460700959565)]
Topic 2:
Topic Name: Great Good Nice Food Vintage
[('wine', 227.37528831440636), ('great', 164.70623036849145), ('good', 142.86897391041796), ('food', 127.34141881255195), ('nice', 85.85479215954042), ('vintage', 81.52380919598849), ('drink', 78.94106763815549), ('time', 73.28

In [38]:
selected_topics

{0: 'Great Good Excellent Sauce Brunch',
 1: 'Good Vintage Food Table Burger',
 2: 'Great Good Nice Food Vintage',
 3: 'Good Service Wine Food Time',
 4: 'Wonderful Heavy Excellent Ice Chef',
 5: 'Groupon Course Waiter Wine Hostess',
 6: 'Good Salmon Bisque Tomato Taste',
 7: 'Great Wonderful Delicious Bruschetta Love',
 8: 'Great Good Nice Service Atmosphere',
 9: 'Good Outstanding Smoke Bowl Ambiance'}

In [155]:
# to be continued

In [None]:

1> Clean the review -   machine readable (pre processing - remove stopwards, symbols, )
2> Aspects > Topics     Topic modelling  ()
3> Topics > catagories  catagorization   
4> Semetimants > rating score/Aspects > LARA
5> Optional > recommending


Between 17 - 23 Asad  - Not Available
                Karun - Not Available
                Ron   - Not Available
24 - 31         Asad  - step x 
                Dec 12 - Dec 16 (4 days)
      

