# Review Word Cloud

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from collections import Counter
from operator import itemgetter
import spacy
import re
nlp = spacy.load('en')

## Yelp Review Data

### Data Wrangling

In [2]:
#Load data
reviews  = pd.read_csv("input_data/yelp_review.csv")

In [3]:
#Look at data head
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


## Analysis

In [4]:
#Load cleaned business data for restaurant flag
res = pd.read_csv("output_data/res_data.csv")

In [5]:
#Merge review and business data for restaurant flag
res_rev = pd.merge(reviews,
                   res[['business_id','Restaurant']],
                   left_on = 'business_id',
                   right_on = 'business_id',
                   how = 'inner')

In [6]:
#Look at instances of useful > 0
len(res_rev.loc[res_rev['useful']>0, ['useful']])

1471844

In [7]:
#Look at instances of funny > 0
len(res_rev.loc[res_rev['funny']>0, ['funny']])

687099

In [8]:
#Look at instances of cool > 0
len(res_rev.loc[res_rev['cool']>0, ['cool']])

857957

In [9]:
#Create a singly long text string for reviews that were useful but not cool or funny, (and other 2 combinations)
useful_text = ' '.join(i+ " " for i in res_rev.loc[((res_rev['useful']>0) & (res_rev['funny']==0) & (res_rev['cool']==0)) ,'text'])
funny_text = ' '.join(i+ " " for i in res_rev.loc[((res_rev['useful']==0) & (res_rev['funny']>0) & (res_rev['cool']==0)),'text'])
cool_text = ' '.join(i+ " " for i in res_rev.loc[((res_rev['useful']==0) & (res_rev['funny']==0) & (res_rev['cool']>0)),'text'])


In [10]:
#Take the first 1,000,000 characters as a subset for the analysis
useful_text1 = useful_text[0:1000000]
funny_text1 = funny_text[0:1000000]
cool_text1 = cool_text[0:1000000]

In [11]:
#Run spacy tokenizer on each of the strings
parsed_text1 = nlp(useful_text1)
parsed_text2 = nlp(funny_text1)
parsed_text3 = nlp(cool_text1)

In [12]:
#Create a counter dictionary for useful words

histogram_with_some_filtering1 = Counter()
for token in parsed_text1:
    if re.match('[a-zA-Z]+$', token.lemma_) and not (token.is_stop or token.pos_ == 'PUNCT' or token.pos_ == 'SPACE'):
        lemma = token.lemma_
        histogram_with_some_filtering1[lemma] += 1

sorted_lemma_count_pairs = sorted(histogram_with_some_filtering1.items(),
                                  reverse=True,
                                  key=itemgetter(1))
for lemma, count in sorted_lemma_count_pairs[:10]:
    print(lemma, ":", count)

the : 1894
be : 1603
not : 1538
good : 1507
place : 1079
food : 988
order : 784
like : 752
come : 727
meat : 639


In [13]:
#Create a counter dictionary for funny words

histogram_with_some_filtering2 = Counter()
for token in parsed_text2:
    if re.match('[a-zA-Z]+$', token.lemma_) and not (token.is_stop or token.pos_ == 'PUNCT' or token.pos_ == 'SPACE'):
        lemma = token.lemma_
        histogram_with_some_filtering2[lemma] += 1

sorted_lemma_count_pairs = sorted(histogram_with_some_filtering2.items(),
                                  reverse=True,
                                  key=itemgetter(1))
for lemma, count in sorted_lemma_count_pairs[:10]:
    print(lemma, ":", count)

the : 1754
not : 1689
be : 1357
good : 1208
food : 1186
place : 1021
like : 791
order : 761
come : 710
time : 639


In [14]:
#Create a counter dictionary for cool words

histogram_with_some_filtering3 = Counter()
for token in parsed_text3:
    if re.match('[a-zA-Z]+$', token.lemma_) and not (token.is_stop or token.pos_ == 'PUNCT' or token.pos_ == 'SPACE'):
        lemma = token.lemma_
        histogram_with_some_filtering3[lemma] += 1

sorted_lemma_count_pairs = sorted(histogram_with_some_filtering3.items(),
                                  reverse=True,
                                  key=itemgetter(1))
for lemma, count in sorted_lemma_count_pairs[:10]:
    print(lemma, ":", count)

the : 1973
good : 1639
not : 1344
be : 1333
food : 1195
place : 1170
great : 794
come : 738
order : 724
time : 689


In [15]:
#Get the set of all keys
keys = set(list(histogram_with_some_filtering1.keys()) + list(histogram_with_some_filtering2.keys())+ list(histogram_with_some_filtering3.keys()))

In [16]:
#Created a dataframe of words with how many times they appear in useful, funny or cool rated reviews and their relative percentages
word_counts = []
for key in keys:
    word_counts.append([key, histogram_with_some_filtering1.get(key),histogram_with_some_filtering2.get(key),histogram_with_some_filtering3.get(key)])  
word_counts = pd.DataFrame(word_counts, columns = ["word","useful", "funny", "cool"])
word_counts.fillna(0, inplace = True)
word_counts.head()
word_counts["total"] = word_counts["useful"] + word_counts["funny"] + word_counts["cool"]
word_counts["useful_per"] = word_counts["useful"] / word_counts["total"]
word_counts["funny_per"] = word_counts["funny"] / word_counts["total"]
word_counts["cool_per"] = word_counts["cool"] / word_counts["total"]
word_counts.head()

Unnamed: 0,word,useful,funny,cool,total,useful_per,funny_per,cool_per
0,loom,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,pression,1.0,1.0,0.0,2.0,0.5,0.5,0.0
2,throat,3.0,2.0,0.0,5.0,0.6,0.4,0.0
3,agradable,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,somehwere,1.0,0.0,0.0,1.0,1.0,0.0,0.0


In [17]:
#To uniquely identify words in each category, take words that are appear over 10 times and have an occurence rating of >.7 in one category
threshhold = .7
unique = word_counts.loc[((word_counts["total"]>10) & ((word_counts['useful_per'] > threshhold) | (word_counts['funny_per'] >threshhold) | (word_counts['cool_per'] >threshhold)))]
unique.head()

Unnamed: 0,word,useful,funny,cool,total,useful_per,funny_per,cool_per
40,soap,2.0,10.0,1.0,13.0,0.153846,0.769231,0.076923
49,nasty,0.0,17.0,3.0,20.0,0.0,0.85,0.15
205,limo,0.0,3.0,9.0,12.0,0.0,0.25,0.75
227,filthy,1.0,11.0,2.0,14.0,0.071429,0.785714,0.142857
326,coat,14.0,1.0,2.0,17.0,0.823529,0.058824,0.117647


In [18]:
#Add which category had the highest percentage
unique['Category'] = unique[['useful_per','funny_per','cool_per']].apply(lambda s : ("Useful" if s['useful_per']>threshhold else ("Funny" if s['funny_per']>threshhold else "Cool")) , axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [19]:
#Get the count which is the max 
unique['Max Count'] = unique[['useful','funny','cool']].apply(lambda r : max(r['useful'],r['funny'],r['cool']), axis = 1)
unique.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,word,useful,funny,cool,total,useful_per,funny_per,cool_per,Category,Max Count
40,soap,2.0,10.0,1.0,13.0,0.153846,0.769231,0.076923,Funny,10.0
49,nasty,0.0,17.0,3.0,20.0,0.0,0.85,0.15,Funny,17.0
205,limo,0.0,3.0,9.0,12.0,0.0,0.25,0.75,Cool,9.0
227,filthy,1.0,11.0,2.0,14.0,0.071429,0.785714,0.142857,Funny,11.0
326,coat,14.0,1.0,2.0,17.0,0.823529,0.058824,0.117647,Useful,14.0


In [20]:
# Use occurences and category to make a data set that can be easily word clouded
word_cloud = []
counts = [int(i) for i in unique['Max Count'].values]
category = unique['Category'].values
word = unique['word'].values
for i,j in enumerate(counts):
    for k in range(j):
        word_cloud.append([word[i],category[i]])

In [21]:
word_cloud = pd.DataFrame(word_cloud, columns = ["word","category"])
word_cloud.head()

Unnamed: 0,word,category
0,soap,Funny
1,soap,Funny
2,soap,Funny
3,soap,Funny
4,soap,Funny


In [22]:
#Look at reviews that are cool with banh
res_rev.loc[((res_rev['text'].str.match(".*banh.*",case = False)) & (res_rev['funny']==0) & (res_rev['useful']==0) & (res_rev['cool']>0) & (res_rev['Restaurant'] == 'Y')), 'text'].values[0:5] 

array([ "I can't eat Banh Mi anywhere else. I am Vietnamese and I love the fusion meats along with daikon + carrot + cucumber baguettes. Additionally, kimchi fries are a must have! Who would have thought mixing kimchi and fries would be fantastic on the taste buds.",
       "Thinking about this place makes me so sad. They only have locations in Toronto and I am all the way in London, removed from the delicious food the big city can offer me. I went two different times, got the five spice pork, crispy tofu and kimchi fries. Everything was just perfect. The kimchi fries has a giant chunk of pork on it and it's a bit much for me. That wasn't a complaint though, no one should ever complain about too much delicious food. I got the bao and the banh mi, both of which were perfect. I would get the bao just because it's smaller and I can try more flavors. You can't really go wrong though",
       "The reviews don't lie, such a great Banh Mi, really affordable as well. Only docking it a star cau

In [23]:
#Look at reviews that are funny with reveipt
res_rev.loc[((res_rev['text'].str.match(".*receipt.*",case = False)) & (res_rev['funny']>0) & (res_rev['useful']==0) & (res_rev['cool']==0) & (res_rev['Restaurant'] == 'Y')), 'text'].values[0:5]

array([ 'I loved this place! It was my favorite part of a whirl-wind Vegas trip :). The older musicians playing standards are phenomenal, and so fun. The food is old school Italian, which I very much enjoy. The front of the house staff is second to none. Plus, the whole place is like traveling back in time! My only reason for not giving it 5 stars is because of our waitress. We had a large party, and had warned her we would be splitting the tab... She said, "no problem, I\'ll print multiple receipts so everyone can circle what they had." So we do this, and she cops major attitude because we gave her 5 cards (NOTE: I know this is annoying for a server, but she said it was fine when asked about it). We proceeded to wait for a full 30 MINUTES for our checks to sign! The night had been perfect for me up to that point, and I\'m sad it ended on a crappier note. None-the-less, you (review reader) should DEFINITELY go here! Especially if you like nostalgia; this place is very much for you ;).'

In [24]:
#Look at reviews that are useful with izakaya
res_rev.loc[((res_rev['text'].str.match(".*izakaya.*",case = False)) & (res_rev['funny']==0) & (res_rev['useful']>0) & (res_rev['cool']==0) & (res_rev['Restaurant'] == 'Y')), 'text'].values[0:5] 

array([ "This is my favourite izakaya in Toronto. The food and service are always excellent. The room is fun (you may end up making new friends) and boisterous. If you're not sure what to order, there's even a new tasting menu (and at $25 per person you really can't go wrong).",
       'I\'ve never had truly amazing Izakaya, outside of Toyko, Japan, and San Francisco, California, until I stumbled upon Guu Izakaya! If I could open an Izakaya joint in the United States, it would be Guu Izakaya!\n\nI happened to stumble on this small restaurant after returning to the Shangri-La Hotel on an empty stomach. And to my surprise, all the workers were Japanese and very friendly! To top it off, I was fortunate to come on the last night where they were in some type of celebration at the restaurant, where all the workers were wearing Kimonos! \n\nAmbiance: You are instantly welcomed when you enter the building. There\'s always great music and the interior setting feels as though you\'re inside an a

<img src="Visualization/Cool Cloud.png">

## Ouput Data

In [25]:
unique.to_csv("output_data/useful_cool_funny.csv")

In [26]:
word_cloud.to_csv("output_data/word_cloud.csv")