In [1]:
#Topic Mining

In [2]:
import pandas as pd
import spacy
import time
import nltk

In [3]:
# For now we restrict the number of restaurants loaded while the code is being developed
sample_restaurants_to_load = 10000

# Only Arizona businesses; point these at other files if needed
restaurant_file='processed_data/restaurants_az.csv'
reviews_file   ='processed_data/restaurant_az_reviews.csv'

# Number of topics each topic model below is asked to extract
NUM_TOPICS = 10

In [4]:
%%time
# Load spaCy's large English model ('en_core_web_lg'). The resulting `nlp` pipeline is
# used below to parse restaurant names, review texts and topic keywords.
nlp = spacy.load('en_core_web_lg')

Wall time: 10.2 s


In [5]:
# Stopwords for topic mining: one entry per line of config/stopwords.txt.
# The file is read inside a `with` block so its handle is closed as soon as the list is built.
with open('config/stopwords.txt', 'r') as stopwords_file:
    stopwords = [line.rstrip('\n') for line in stopwords_file]

In [6]:
# Words that appear in restaurant names.
# A restaurant's name may appear many times in its reviews, inflating its word frequency.
# For per-restaurant topic mining that is not useful, so name words are removed.
# Words such as 'chicken' should be retained even when they are part of a restaurant name;
# config/names.txt lists (lowercased here) the name words that must NOT be stripped.
# The file is read inside a `with` block so its handle is closed as soon as the list is built.
with open('config/names.txt', 'r') as names_file:
    stopnames = [line.rstrip('\n').lower() for line in names_file]

In [7]:
%%time
# Load the restaurant table, discard the 'Unnamed: 0' column left over from the CSV export
# and keep only the first `sample_restaurants_to_load` rows.
all_restaurants = (
    pd.read_csv(restaurant_file)
    .drop(columns='Unnamed: 0')
    .head(sample_restaurants_to_load)
)

Wall time: 17 ms


In [8]:
%%time
# Load every review and discard the 'Unnamed: 0' column left over from the CSV export.
raw_reviews = pd.read_csv(reviews_file)
all_reviews = raw_reviews.drop(columns='Unnamed: 0')

Wall time: 9.56 s


In [9]:
%%time
# Retain only the reviews that belong to the selected businesses.
# .copy() makes the result an independent frame, so adding a column to it later does not
# raise pandas' SettingWithCopyWarning.
all_reviews = all_reviews[all_reviews.business_id.isin(all_restaurants.business_id)].copy()

Wall time: 246 ms


In [10]:
%%time
# Preview the first 5 rows of the retained reviews
all_reviews.head()

Wall time: 0 ns


Unnamed: 0,business_id,name,city,state,business_stars,review_count,categories,review_id,user_id,review_stars,useful,text,date
0,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,Phoenix,AZ,3.5,269,"Chinese, Restaurants",QgV9RPyPUC3cAse1Wxqoow,P3cMpkppvBuVpPD8LBTbBQ,4.0,2,Enjoyed Nee House immensely. No service issues...,2012-04-28 21:08:22
1,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,Phoenix,AZ,3.5,269,"Chinese, Restaurants",1ZTO6zFtVVxtXclHp4TvHQ,b1yLsCdv4ZL_d3INMCZzoA,3.0,0,I'm not sure how I rate this restaurant becaus...,2017-02-09 05:15:25
2,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,Phoenix,AZ,3.5,269,"Chinese, Restaurants",h17ep5S7O8_JMKovooWoVA,TaVuQWmXAhxy_LvIXBs9sg,5.0,0,The food from this place reminds me of home. I...,2016-08-12 21:38:55
3,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,Phoenix,AZ,3.5,269,"Chinese, Restaurants",FVJaiFuf67Dzamax-zq1UQ,aCY0jMl8Jsvx_HQL4D3tmw,4.0,2,Found this place by just driving down the road...,2011-08-24 22:35:28
4,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,Phoenix,AZ,3.5,269,"Chinese, Restaurants",zqzOcreb9KBTaESR6qbTSg,TPZbqNXMA2xMXLLd1zL_0A,4.0,2,We eat here on a regular basis. It's like tha...,2013-04-11 02:26:52


In [11]:
def clean_name(name):
    """Return the lowercased name tokens to strip from a restaurant's reviews.

    The name is parsed with the module-level `nlp` pipeline. A token is kept when it is
    tagged as a proper noun (`PROPN`) or looks like a number, and its lowercased text is
    not listed in `stopnames`.

    Parameters
    ----------
    name : str
        Restaurant name.

    Returns
    -------
    list of str
        Lowercased token texts, in the order they occur in the name.
    """
    return [
        token.text.lower()
        for token in nlp(name)
        if (token.pos_ == 'PROPN' or token.like_num)
        and token.text.lower() not in stopnames
    ]

In [12]:
def clean_doc(doc,name_toks):
    """Turn a parsed review into a normalised, space-joined string of lemmas.

    A token is dropped when its lowercased, stripped text is in the module-level
    `stopwords`, when its lowercased text is in `name_toks`, or when it is tagged as a
    symbol (`SYM`). Punctuation tokens are not filtered here.

    Each kept token contributes its lowercased, stripped lemma, except tokens whose lemma
    is the "-PRON-" placeholder, which contribute their lowercased text instead.

    Parameters
    ----------
    doc : iterable of tokens
        Parsed review (tokens exposing `text`, `lemma_`, `lower_` and `pos_`).
    name_toks : collection of str
        Lowercased restaurant-name tokens to remove from the review.

    Returns
    -------
    str
        The cleaned review text.
    """
    kept = []
    for tok in doc:
        lowered = tok.text.lower()
        if lowered.strip() in stopwords or lowered in name_toks or tok.pos_ == "SYM":
            continue
        piece = tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        kept.append(piece.lower())

    text = ' '.join(kept)
    # Expand the "n't" contraction, re-attach full stops to the preceding word and
    # squeeze doubled spaces, in that order.
    for old, new in (("n't", 'not'), (' .', '.'), ('  ', ' ')):
        text = text.replace(old, new)
    return text

In [13]:
%%time
# Clean every review with spaCy, one restaurant at a time, so the restaurant's own name
# tokens can be stripped from its reviews.
#
# Each cleaned text is stored under its review's index label rather than appended in
# restaurant order, so it lands on the right row even when the reviews file is not ordered
# the same way as the restaurants file.
restaurant_names = all_restaurants.drop_duplicates('business_id').set_index('business_id')['name']
reviews_by_restaurant = all_reviews.groupby('business_id', sort=False)['text']
total = reviews_by_restaurant.ngroups
cleansed_by_index = {}

for index, (business_id, texts) in enumerate(reviews_by_restaurant):
    if index % 100 == 0:
        print(f'Cleaning reviews [{index+1:>{5}}/{total:>{5}}]')

    # The name tokens are identical for every review of a restaurant: compute them once
    # instead of re-running the spaCy pipeline on the name for each review.
    name_toks = clean_name(restaurant_names[business_id])

    # `n_threads` is no longer passed: it is deprecated in spaCy 2.1+ and is not part of
    # the spaCy 3 `Language.pipe` signature.
    parsed_reviews = nlp.pipe(texts, batch_size=1000)
    cleansed_by_index.update(
        zip(texts.index, (clean_doc(parsed_review, name_toks) for parsed_review in parsed_reviews))
    )

# Re-order the cleaned texts to follow the rows of all_reviews.
cleansed_text = [cleansed_by_index[review_index] for review_index in all_reviews.index]

# assign() builds a new frame instead of writing a column into a filtered slice.
all_reviews = all_reviews.assign(cleansed_text=cleansed_text)
all_reviews.to_csv('processed_data/reviews.csv')

Cleaning reviews [    1/ 5216]
Cleaning reviews [  101/ 5216]
Cleaning reviews [  201/ 5216]
Cleaning reviews [  301/ 5216]
Cleaning reviews [  401/ 5216]
Cleaning reviews [  501/ 5216]
Cleaning reviews [  601/ 5216]
Cleaning reviews [  701/ 5216]
Cleaning reviews [  801/ 5216]
Cleaning reviews [  901/ 5216]
Cleaning reviews [ 1001/ 5216]
Cleaning reviews [ 1101/ 5216]
Cleaning reviews [ 1201/ 5216]
Cleaning reviews [ 1301/ 5216]
Cleaning reviews [ 1401/ 5216]
Cleaning reviews [ 1501/ 5216]
Cleaning reviews [ 1601/ 5216]
Cleaning reviews [ 1701/ 5216]
Cleaning reviews [ 1801/ 5216]
Cleaning reviews [ 1901/ 5216]
Cleaning reviews [ 2001/ 5216]
Cleaning reviews [ 2101/ 5216]
Cleaning reviews [ 2201/ 5216]
Cleaning reviews [ 2301/ 5216]
Cleaning reviews [ 2401/ 5216]
Cleaning reviews [ 2501/ 5216]
Cleaning reviews [ 2601/ 5216]
Cleaning reviews [ 2701/ 5216]
Cleaning reviews [ 2801/ 5216]
Cleaning reviews [ 2901/ 5216]
Cleaning reviews [ 3001/ 5216]
Cleaning reviews [ 3101/ 5216]
Cleaning

In [None]:
#all_reviews['cleansed_text'] = cleansed_text

In [None]:
# Spot-check the original text against the cleaned text on the first few rows.
# A bare expression gives the notebook's rich table display instead of a plain-text dump.
all_reviews[['text','cleansed_text']].head()

In [None]:
#all_reviews.to_csv('processed_data/reviews.csv')

In [None]:
# Business id of the first restaurant in the table; used as the sample for topic mining.
restaurant = all_restaurants['business_id'].iloc[0]

In [None]:
# Display the selected business id
restaurant

In [None]:
# Cleaned review texts of the selected restaurant only.
is_selected = all_reviews['business_id'] == restaurant
reviews = all_reviews.loc[is_selected, 'cleansed_text']

In [None]:
# Display the cleaned reviews of the selected restaurant
reviews

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures

In [None]:
# Bag-of-words vectorizer for the selected restaurant's cleaned reviews.
# min_df / max_df drop terms found in fewer than 5 reviews or in more than 90% of them.
# token_pattern keeps tokens of at least 3 characters made of letters and hyphens; it is
# written as a raw string so that `\-` is not an invalid string escape sequence (the
# compiled pattern is unchanged).
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(reviews)

In [None]:
#pip install pyLDAvis

In [None]:
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
# NOTE(review): with no category or module filter this silences every warning raised for
# the rest of the session, including deprecation notices from the libraries used below.
# Consider narrowing it to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

In [None]:
# Latent Dirichlet Allocation Model
# random_state pins the seed so that re-running the notebook reproduces the same topics.
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online', verbose=True, random_state=0)
data_lda = lda.fit_transform(data_vectorized)

In [None]:
# Non-Negative Matrix Factorization Model
# random_state pins the seed so that re-running the notebook reproduces the same factors.
nmf = NMF(n_components=NUM_TOPICS, random_state=0)
data_nmf = nmf.fit_transform(data_vectorized)

In [None]:
# Latent Semantic Indexing Model using Truncated SVD
# random_state pins the seed of the randomized solver so results are reproducible.
lsi = TruncatedSVD(n_components=NUM_TOPICS, random_state=0)
data_lsi = lsi.fit_transform(data_vectorized)

In [None]:
# Get Topic Name
def get_topic_name(tok, max_words=5):
    """Build a short, capitalised topic name from a topic's top keywords.

    The keywords are joined into one string and tagged with the module-level `nlp`
    pipeline. Up to `max_words` keywords are then picked one at a time, alternating
    between two part-of-speech preferences:

    * 1st, 3rd, 5th, ... pick: ADJ, then PROPN, NOUN, ADV, VERB
    * 2nd, 4th, ... pick:      NOUN, then PROPN, VERB, ADV

    Each pick takes the first remaining keyword carrying the most-preferred tag that is
    still available; when none of the preferred tags is left, the first remaining
    keyword is taken.

    Parameters
    ----------
    tok : list of str
        Topic keywords, most important first. The list is not modified.
    max_words : int, optional
        Maximum number of keywords in the name (default 5).

    Returns
    -------
    str
        The picked keywords, capitalised and joined by single spaces. Fewer than
        `max_words` words are returned when fewer keywords are supplied, and an empty
        string when there are none.
    """
    # Work on a copy so the caller's keyword list is left untouched.
    terms = list(tok)
    if not terms:
        return ""

    doc = nlp(" ".join(terms))

    # spaCy may split one keyword into several tokens (e.g. hyphenated terms), so the
    # tags are aligned by character offset: each keyword takes the tag of the token that
    # starts where the keyword starts in the joined string.
    pos_at_offset = {token.idx: token.pos_ for token in doc}
    pos = []
    offset = 0
    for term in terms:
        pos.append(pos_at_offset.get(offset, ""))
        offset += len(term) + 1  # +1 for the joining space

    even_priority = ("ADJ", "PROPN", "NOUN", "ADV", "VERB")
    odd_priority = ("NOUN", "PROPN", "VERB", "ADV")

    topic_name_toks = []
    for x in range(min(max_words, len(terms))):
        priority = even_priority if x % 2 == 0 else odd_priority
        i = next((pos.index(tag) for tag in priority if tag in pos), 0)
        # Remove by position so keyword and tag lists stay aligned.
        topic_name_toks.append(terms.pop(i).capitalize())
        del pos[i]

    return " ".join(topic_name_toks)

In [None]:
# Functions for printing keywords for each topic
def get_selected_topics(model, vectorizer, top_n=10):
    """Print the top keywords of every topic and return a generated name per topic.

    Parameters
    ----------
    model : fitted decomposition model
        Any object exposing `components_`, one row of term weights per topic.
    vectorizer : fitted vectorizer
        Supplies the vocabulary through `get_feature_names_out()` or, on scikit-learn
        versions that predate it, `get_feature_names()`.
    top_n : int, optional
        Number of highest-weighted keywords used per topic (default 10).

    Returns
    -------
    dict
        Maps the topic index to the name produced by `get_topic_name`.
    """
    # Fetch the vocabulary once; the lookup is the same for every topic and keyword.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = [str(term) for term in names_getter()]

    topics = {}
    for idx, topic in enumerate(model.components_):
        # Indices of the top_n largest weights, highest first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        top_terms = [feature_names[i] for i in top_indices]

        print("Topic %d:" % (idx))
        # Pass a copy so the keyword list printed below cannot be altered by the callee.
        name = get_topic_name(list(top_terms))
        print("Topic Name: "+name)
        print([(feature_names[i], topic[i]) for i in top_indices])
        topics[idx]=name
    return topics

In [None]:
# Keywords for topics clustered by Latent Dirichlet Allocation.
# Prints each topic's top keywords and keeps the generated topic names, keyed by topic index.
print("LDA Model:")
selected_topics = get_selected_topics(lda, vectorizer)

In [None]:
# Display the topic-index -> topic-name mapping
selected_topics

In [None]:
# to be continued