In [None]:
#pip install pymongo

In [None]:
#pip install gensim

In [1]:
# Import necessary libs
import pandas as pd
import re
import numpy as np
import time
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Spacy
import spacy

# Plotting
import matplotlib.pyplot as plot

# Mongo DB
from pymongo import MongoClient

In [None]:
# import data from MongoDB
DBClient = MongoClient()
yelp_data = DBClient.yelp

In [2]:
# Select business having atleast 50 reviews
min_review_count = 50

# businesses to Analyse
businesses_to_analyse = 'Restaurants'

In [None]:
# Get all restaurant businesses
Restaurant_business = pd.DataFrame(yelp_data.business.find({"categories":{"$regex" :".*"+businesses_to_analyse+".*"}, "review_count":{"$gte":min_review_count} },  {'business_id':1, 'name':1, 'city':1, 'state':1, 'stars':1, 'review_count':1, 'categories':1, '_id': 0}))

In [None]:
# Get all reviews
All_reviews = pd.DataFrame(yelp_data.review.find({},{'review_id':1, 'user_id':1, 'business_id':1, 'stars':1, 'useful':1, 'text':1, 'date':1, '_id': 0}))

In [None]:
# Find all restaurant reviews
#Restaurant_reviews = All_reviews[All_reviews.business_id.isin(Restaurant_business.business_id.values)]
Restaurant_reviews = pd.merge(Restaurant_business,All_reviews, on='business_id').rename(columns={'stars_x':'business_stars', 'stars_y':'review_stars'})

In [None]:
# Sample 5 Restaurant
Restaurant_business.head(5)

In [None]:
# Sample 5 Reviews
Restaurant_reviews.head(5)

In [None]:
# Write selected Restaurants to file
Restaurant_reviews.to_csv('processed_data/restaurant_reviews.csv')

In [None]:
# Write selected Restaurant-reviews to file
Restaurant_business.to_csv('processed_data/restaurants.csv')

In [None]:
# plot how many reviews we have of each star
star_x = Restaurant_reviews.review_stars.value_counts().index
star_y = Restaurant_reviews.review_stars.value_counts().values

plot.figure(figsize=(8,5))
# colors are in the order 5, 4, 3, 1, 2
bar_colors = ['darkgreen', 'mediumseagreen', 'gold', 'crimson', 'orange']
plot.bar(star_x, star_y, color=bar_colors, width=.6)
plot.xlabel('Stars (Rating)')
plot.ylabel('Number of Reviews')
plot.title(f'Number of Reviews Per Rating of {businesses_to_analyse}')

In [None]:
Restaurant_business.groupby('state').count()

In [None]:
restaurants_per_state = Restaurant_business.groupby('state').count()[['business_id']].rename(columns={'state': 'State', 'business_id': 'Restaurants'})

In [None]:
restaurants_per_state.sort_values(by='Restaurants').plot.bar(figsize=(10,10))

In [None]:
Restaurant_AZ = pd.DataFrame(yelp_data.business.find({"categories":{"$regex" :".*"+businesses_to_analyse+".*"}, "review_count":{"$gte":min_review_count}, "state":"AZ" },  {'business_id':1, 'name':1, 'city':1, 'state':1, 'stars':1, 'review_count':1, 'categories':1, '_id': 0}))

In [None]:
Restaurant_AZ_reviews = pd.merge(Restaurant_AZ,All_reviews, on='business_id').rename(columns={'stars_x':'business_stars', 'stars_y':'review_stars'})

In [None]:
Restaurant_AZ_reviews.to_csv('processed_data/restaurant_az_reviews.csv')

In [None]:
Restaurant_AZ.to_csv('processed_data/restaurants_az.csv')

In [None]:
Restaurant_AZ_reviews.shape

In [None]:
# plot how many reviews we have of each star
star_x = Restaurant_AZ_reviews.review_stars.value_counts().index
star_y = Restaurant_AZ_reviews.review_stars.value_counts().values

plot.figure(figsize=(8,5))
# colors are in the order 5, 4, 3, 1, 2
bar_colors = ['darkgreen', 'mediumseagreen', 'gold', 'crimson', 'orange']
plot.bar(star_x, star_y, color=bar_colors, width=.6)
plot.xlabel('Stars (Rating)')
plot.ylabel('Number of Reviews')
plot.title(f'Number of Reviews Per Rating of {businesses_to_analyse}')

In [150]:
# for now we restrich Restaurants to this number to develop the code
sample_restaurants_to_load = 10

# Only Arizona Businesses, Change if needed
restaurant_file='processed_data/restaurants_az.csv'
reviews_file   ='processed_data/restaurant_az_reviews.csv'

In [169]:
%%time
# SPACY
# This is the large Spacy English Library
nlp = spacy.load('en_core_web_lg')
nlp2 = spacy.load('en_core_web_lg', disable=["ner"])

Wall time: 21.8 s


In [152]:
# Stopwords for topic mining
stopwords = [line.rstrip('\n') for line in open('config/stopwords.txt', 'r')]

In [153]:
# The words that appear in names of the Restaurants
# Restaurants name may appear multiple time in review, increasing its word frequenty
# For topic mining per restaurant, it is not useful and should be removed
# However words such as 'chicken' when come in restaurant name should be retained
stopnames = [line.rstrip('\n').lower() for line in open('config/names.txt', 'r')]

In [154]:
%%time
# Read Businesses
all_restaurants = pd.read_csv(restaurant_file).drop(labels='Unnamed: 0', axis=1).head(sample_restaurants_to_load)

Wall time: 19 ms


In [155]:
%%time
# Read all reviews
all_reviews = pd.read_csv(reviews_file).drop(labels='Unnamed: 0', axis=1).drop(labels='city', axis=1).drop(labels='state', axis=1).drop(labels='categories', axis=1).drop(labels='user_id', axis=1).drop(labels='date', axis=1)

Wall time: 10.7 s


In [156]:
%%time
# Retain reviews of selected Businesses
all_reviews = all_reviews[all_reviews.business_id.isin(all_restaurants.business_id)]

Wall time: 343 ms


In [157]:
%%time
# Top 5 Reviews
all_reviews.head()

Wall time: 1.01 ms


Unnamed: 0,business_id,name,business_stars,review_count,review_id,review_stars,useful,text
0,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,3.5,269,QgV9RPyPUC3cAse1Wxqoow,4.0,2,Enjoyed Nee House immensely. No service issues...
1,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,3.5,269,1ZTO6zFtVVxtXclHp4TvHQ,3.0,0,I'm not sure how I rate this restaurant becaus...
2,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,3.5,269,h17ep5S7O8_JMKovooWoVA,5.0,0,The food from this place reminds me of home. I...
3,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,3.5,269,FVJaiFuf67Dzamax-zq1UQ,4.0,2,Found this place by just driving down the road...
4,44YFU284Z3KDEy25QyVoUw,Nee House Chinese Restaurant,3.5,269,zqzOcreb9KBTaESR6qbTSg,4.0,2,We eat here on a regular basis. It's like tha...


In [158]:
def tokenize_docs(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True re

In [159]:
# String List cleaning, removes spaces, new lines
def clean_string(data):
    data = [re.sub('\s+', ' '  , sent) for sent in data]
    data = [re.sub("n't", 'not', sent) for sent in data]
    data = [sent.lower()               for sent in data]
    data = list(tokenize_docs(data))
    data = [[tok for tok in sent if tok not in stopwords ] for sent in data]
    return data

In [160]:
def clean_name(name):
    name_toks = []
    
    # Nlp doc from Name
    name_doc = nlp(name)
    for token in name_doc:
        
        # Retain Proper nouns in Name
        if token.pos_ == 'PROPN' or token.like_num:
        
            # Lose stop words in Name
            if token.text.lower() not in stopnames:
            
                # All Restaurant name tokens to be remoed from reviews of this reataurant
                name_toks.append(token.text.lower())
    
    #for noun_phrase in list(name_doc.noun_chunks):
        #if(len(str(noun_phrase).split())<2):
            #noun_phrase.merge(noun_phrase.root.tag_, noun_phrase.root.lemma_, noun_phrase.root.ent_type_)
    
    
    for chunk in name_doc.ents:
        name_toks.append(chunk.text.lower())
    
    return name_toks

In [170]:
def clean_doc(doc,name_toks,allowed_postags=['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV']):
    
    # Remove punctuation, symbols (#) and stopwords
    allowed_postags=['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV']
    
    toks = [tok.lemma_ for tok in doc ]
    doc = nlp2(" ".join(toks))
    
    [noun_phrase.merge(noun_phrase.root.tag_, noun_phrase.root.lemma_, noun_phrase.root.ent_type_) for noun_phrase in doc.noun_chunks if len(str(noun_phrase).split())>1 and len(str(noun_phrase).split())<4]
    
    #doc = [tok.text for tok in doc if (tok.text.lower() not in stopwords and tok.pos_ != "PUNCT" and tok.pos_ != "SYM")]
    toks = [tok.text.lower().strip().replace('_',' ') for tok in doc 
                if (tok.text.lower().strip().replace('_',' ') not in stopwords 
                    and tok.text.lower().replace('_',' ') not in name_toks 
                    and tok.pos_ in allowed_postags
                   )]
    return " ".join(toks)

In [171]:
%%time
total = len(all_restaurants)
cleansed_text = []
for index, restaurant in all_restaurants.iterrows():
    #print(f'Cleaning reviews for restaurant: "{restaurant["name"]:<{40}}" [{index+1:>{5}}/{total:>{5}}]')
    if index % 100 == 0:
        print(f'Cleaning reviews [{index+1:>{5}}/{total:>{5}}]')
    
    # Convert to list
    data = all_reviews.query(' business_id == "'+restaurant['business_id']+'" ')['text']
    
    # Remove new lines, spaces, etc. Remove stopwords
    data = clean_string(data)
    
    # Build the bigram and trigram models
    bigram  = gensim.models.Phrases(data, min_count=4, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data],min_count=3, threshold=100)  

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod  = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    
    bigrams  = [bigram_mod[doc] for doc in data]
    trigrams = [trigram_mod[bigram_mod[doc]] for doc in data]
    
    data = [" ".join(trigram) for trigram in trigrams]
    #data = [" ".join(toks) for toks in data] 
    
    # iterate list, clean sentences
    for parsed_review in nlp.pipe(iter(data), batch_size=1000, n_threads=8):
        #[noun_phrase.merge(noun_phrase.root.tag_, noun_phrase.root.lemma_, noun_phrase.root.ent_type_) for noun_phrase in parsed_review.noun_chunks if len(str(noun_phrase).split())>1 and len(str(noun_phrase).split())<4]
        cleansed_text.append(clean_doc(parsed_review,clean_name(restaurant["name"])))
        pprint(cleansed_text[-1])

Cleaning reviews [    1/   10]
('enjoy immensely service issue fantastic chinese food food spicy green beans '
 'perfect chow fun chicken perfect great expectation set bad service review '
 'disappointed')
('not sure rate walk soup pm destroy need clear sinus hot sour soap always '
 'trick tell couldnot cup supper time okay order bowl soup home okay thing '
 'look cup soup fresh good sure price high bowl soup wasnot great much broth')
('food remind home taiwanese little island china food taste hong kong american '
 'favorite local classic mongolian beef citrus orange chicken personally mix '
 'feeling more cultural grab lobster garlic chinese broccoli oyster sauce '
 'beef fried rice egg roll girlfriend home chicken fry rice egg roll dude egg '
 'roll bite entree seriously portion huge expect leftover merry moment later '
 'day finish food caveat donot expect service kind truly authentic chinese '
 'expect customer service standard tgif chinese point food sit food pay bill '
 'donot ch

('good chinese food northern phoenix maybe entire metro area wife sample '
 'chinese cultural district opinion better good general tso chicken region '
 'little twist think ginger certain set apart form competition general tso '
 'rivals competition west coast east coast noodle fantastic fresh ingredient '
 'vegetable eggplant shrimp outstanding reason didnot nee house star admit '
 'picky dish look entire phoenix metro area admittedly focus sichuan style '
 'dish dish look sichuan spicy cold chicken noodle fall love live bay area '
 'impossible find phoenix metro area want authentic delicious chinese food '
 'sure try')
('recent lunch change opinion crab dried scallop fried rice good best fry rice '
 've eat live hong kong rice kernel perfectly al dente greasy seasoned right '
 'restraint breath wok shine deliciously fry rice noodle real skill chef '
 'require good control heat timing lobster house special sauce lobster pluck '
 'live tank weigh lbs lb total approx sauce addictively s

KeyboardInterrupt: 

In [100]:
all_reviews['cleansed_text'] = cleansed_text
all_reviews.to_csv('processed_data/cleaned_reviews.csv')
all_restaurants.to_csv('processed_data/cleaned_restaurants.csv')

ValueError: Length of values does not match length of index

In [None]:
#END