In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from textblob import Word
from textblob import TextBlob

In [None]:
# Fetch the Yelp business page. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() stops the
# notebook on an HTTP error instead of letting later cells parse an error page.
r = requests.get('https://www.yelp.com/biz/savor-san-francisco-7?osq=Restaurants', timeout=30)
r.raise_for_status()

In [None]:
# Display the HTTP status code of the response held in `r`.
r.status_code

200

In [None]:
# Parse the response body into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(r.text, 'html.parser')

In [None]:
# Collect every element that carries the review-text CSS class and lang="en".
# find_all is the current Beautiful Soup method name; findAll is a deprecated alias.
# NOTE(review): the class name looks auto-generated and may change between
# page builds — confirm it still matches before relying on the result.
divs = soup.find_all(class_="raw__09f24__T4Ezm", attrs={'lang': 'en'})
divs

In [None]:
# Extract the text of every matched element, then discard the final entry.
# NOTE(review): presumably the last match is not an actual review — confirm.
reviews = [div.text for div in divs]
reviews.pop()
reviews[0]

"In a quiet corner of inner sunset sits a mighty new entrant to the food scene of San Francisco. A notable new entrant that had us salivating for more and planning our next meal already. Chef Mohammed is an artist with 30+ years of fine dining experience. This according to him is his last gig, and he went vegetarian and vegan for his labor of love.This is Arabic fine dining at its best. Beit Rima used to be our favorite but that throne is now with savor cafe. Chef Mohammed reimagines Arabic classics and like an artist working with his muse, transforms the familiar into the sublime. You get fine dining refined palette at a fraction of the cost of any etoile conferred by Michelin.We had an incredibly challenging time selecting the dishes, since so many dishes spoke to us. We asked the sole server, who was also doubling up as the person behind the counter, making beverages and communicating with the sole employee in the kitchen - chef Mohammed - one man genius who is proud of his roots.We

In [None]:
# One row per review. Building the frame straight from the list avoids the
# NumPy round-trip, which copied every review into a fixed-width unicode array
# padded to the longest review and inferred a float dtype for an empty list.
df = pd.DataFrame({'review': reviews})
df.head()

Unnamed: 0,review
0,In a quiet corner of inner sunset sits a might...
1,Mohamed and his crew are making some absolute ...
2,Just tried to go here for lunch because the me...
3,Where do I even start with this place.. it's s...
4,My first time at Savor did not disappoint. We ...


In [None]:
# Number of whitespace-delimited tokens in each raw review.
df['word_count'] = df['review'].apply(str.split).apply(len)

In [None]:
# Preview the first rows, now including the word_count column.
df.head()

Unnamed: 0,review,word_count
0,In a quiet corner of inner sunset sits a might...,711
1,Mohamed and his crew are making some absolute ...,183
2,Just tried to go here for lunch because the me...,44
3,Where do I even start with this place.. it's s...,119
4,My first time at Savor did not disappoint. We ...,66


In [None]:
# Length of each raw review in characters (whitespace and punctuation included).
df['char_count'] = df['review'].apply(len)

In [None]:
def average_words(x):
  """Return the mean length, in characters, of the whitespace-separated words in ``x``.

  Args:
    x: Text to measure.

  Returns:
    Total characters across all words divided by the number of words, or
    ``0.0`` when ``x`` contains no words (empty or whitespace-only text).
  """
  words = x.split()
  # Guard the division: text with no words would otherwise raise ZeroDivisionError.
  if not words:
    return 0.0
  return sum(len(word) for word in words) / len(words)

In [None]:
# Mean word length per review; the column keeps its existing 'lenght' spelling.
df['average_word_lenght'] = df['review'].apply(average_words)

In [None]:
# Preview the first rows with the char_count and average_word_lenght columns added.
df.head()

Unnamed: 0,review,word_count,char_count,average_word_lenght
0,In a quiet corner of inner sunset sits a might...,711,4051,4.699015
1,Mohamed and his crew are making some absolute ...,183,999,4.459016
2,Just tried to go here for lunch because the me...,44,251,4.727273
3,Where do I even start with this place.. it's s...,119,619,4.210084
4,My first time at Savor did not disappoint. We ...,66,371,4.636364


In [None]:
# Load NLTK's English stop-word list (the 'stopwords' corpus is downloaded in the import cell).
stop_words = stopwords.words('english')

In [None]:
# Per review, tally the tokens whose lower-cased form is an English stop word.
# Tokens are split on whitespace only, so attached punctuation is part of the token.
df['stopword_count'] = df['review'].apply(
    lambda text: sum(1 for token in text.split() if token.lower() in stop_words)
)

In [None]:
# Share of each review's tokens that are stop words.
df['stop_word_rate'] = df['stopword_count'].div(df['word_count'])

In [None]:
# Rank reviews from the highest stop-word share to the lowest.
df.sort_values('stop_word_rate', ascending=False)

Unnamed: 0,review,word_count,char_count,average_word_lenght,stopword_count,stop_word_rate
3,Where do I even start with this place.. it's s...,119,619,4.210084,64,0.537815
2,Just tried to go here for lunch because the me...,44,251,4.727273,22,0.5
5,After years of going to the Carlton Hotel for ...,263,1453,4.528517,125,0.475285
4,My first time at Savor did not disappoint. We ...,66,371,4.636364,30,0.454545
1,Mohamed and his crew are making some absolute ...,183,999,4.459016,83,0.453552
6,Omg Savor is such a treasure to have in the in...,208,1177,4.663462,92,0.442308
0,In a quiet corner of inner sunset sits a might...,711,4051,4.699015,302,0.424754
7,This isn't the same as the Savor that was on 2...,245,1438,4.869388,98,0.4
8,It's a great experience. The ambiance is very ...,23,162,6.043478,9,0.391304


In [None]:
# Lower-case each review and collapse every run of whitespace to a single space.
df['lower_case'] = df['review'].apply(lambda text: ' '.join(text.lower().split()))

In [None]:
# Preview the first rows with the lower_case column added.
df.head()

Unnamed: 0,review,word_count,char_count,average_word_lenght,stopword_count,stop_word_rate,lower_case
0,In a quiet corner of inner sunset sits a might...,711,4051,4.699015,302,0.424754,in a quiet corner of inner sunset sits a might...
1,Mohamed and his crew are making some absolute ...,183,999,4.459016,83,0.453552,mohamed and his crew are making some absolute ...
2,Just tried to go here for lunch because the me...,44,251,4.727273,22,0.5,just tried to go here for lunch because the me...
3,Where do I even start with this place.. it's s...,119,619,4.210084,64,0.537815,where do i even start with this place.. it's s...
4,My first time at Savor did not disappoint. We ...,66,371,4.636364,30,0.454545,my first time at savor did not disappoint. we ...


In [None]:
# Strip every character that is neither a word character nor whitespace.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the text unchanged. The raw string avoids the
# invalid-escape warning for \w and \s.
df['punctuation'] = df['lower_case'].str.replace(r'[^\w\s]', '', regex=True)

  df['punctuation'] = df['lower_case'].str.replace('[^\w\s]','')


In [None]:
# Preview the first rows with the punctuation-stripped column added.
df.head()

Unnamed: 0,review,word_count,char_count,average_word_lenght,stopword_count,stop_word_rate,lower_case,punctuation
0,In a quiet corner of inner sunset sits a might...,711,4051,4.699015,302,0.424754,in a quiet corner of inner sunset sits a might...,in a quiet corner of inner sunset sits a might...
1,Mohamed and his crew are making some absolute ...,183,999,4.459016,83,0.453552,mohamed and his crew are making some absolute ...,mohamed and his crew are making some absolute ...
2,Just tried to go here for lunch because the me...,44,251,4.727273,22,0.5,just tried to go here for lunch because the me...,just tried to go here for lunch because the me...
3,Where do I even start with this place.. it's s...,119,619,4.210084,64,0.537815,where do i even start with this place.. it's s...,where do i even start with this place its so s...
4,My first time at Savor did not disappoint. We ...,66,371,4.636364,30,0.454545,my first time at savor did not disappoint. we ...,my first time at savor did not disappoint we o...


In [None]:
# Display the stop-word list loaded from NLTK.
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# Remove English stop words from the lower-cased, punctuation-free text.
# NOTE(review): apostrophes are already stripped at this point, so contractions
# such as "isnt" no longer match the list entries and survive the filter.
df['stopwords'] = df['punctuation'].apply(
    lambda text: ' '.join([token for token in text.split() if token not in stop_words])
)

In [None]:
# Preview the first rows with the stop-word-filtered column added.
df.head()

Unnamed: 0,review,word_count,char_count,average_word_lenght,stopword_count,stop_word_rate,lower_case,punctuation,stopwords
0,In a quiet corner of inner sunset sits a might...,711,4051,4.699015,302,0.424754,in a quiet corner of inner sunset sits a might...,in a quiet corner of inner sunset sits a might...,quiet corner inner sunset sits mighty new entr...
1,Mohamed and his crew are making some absolute ...,183,999,4.459016,83,0.453552,mohamed and his crew are making some absolute ...,mohamed and his crew are making some absolute ...,mohamed crew making absolute magic went sunday...
2,Just tried to go here for lunch because the me...,44,251,4.727273,22,0.5,just tried to go here for lunch because the me...,just tried to go here for lunch because the me...,tried go lunch menu looks wonderful used restr...
3,Where do I even start with this place.. it's s...,119,619,4.210084,64,0.537815,where do i even start with this place.. it's s...,where do i even start with this place its so s...,even start place good definitely never vegetar...
4,My first time at Savor did not disappoint. We ...,66,371,4.636364,30,0.454545,my first time at savor did not disappoint. we ...,my first time at savor did not disappoint we o...,first time savor disappoint ordered pulled kin...


In [None]:
# Thirty most frequent tokens across all stop-word-filtered reviews.
(
    pd.Series(' '.join(df['stopwords']).split())
    .value_counts()
    .head(30)
)

savor         12
sauce         11
food           9
vegan          8
meat           7
one            7
us             7
next           6
mushroom       6
4              6
get            6
like           6
fine           6
two            6
dishes         6
favorite       5
experience     5
bread          5
mashi          5
shawarma       5
eggplant       5
try            5
night          5
served         5
platter        5
chef           5
place          5
dining         4
dish           4
lunch          4
dtype: int64

In [None]:
# Extra tokens to filter out in addition to NLTK's English stop-word list.
other_stop_words = [
    'one',
    'next',
    '4',
    'get',
    'two',
    'try',
    'night',
    'us',
    'go',
]

In [None]:
# Second filtering pass: drop the extra tokens listed in other_stop_words.
df['cleanreview'] = df['stopwords'].apply(
    lambda text: ' '.join([token for token in text.split() if token not in other_stop_words])
)

In [None]:
# Thirty most frequent tokens after the second filtering pass.
(
    pd.Series(' '.join(df['cleanreview']).split())
    .value_counts()
    .head(30)
)

savor         12
sauce         11
food           9
vegan          8
meat           7
mushroom       6
dishes         6
fine           6
like           6
platter        5
chef           5
bread          5
eggplant       5
experience     5
favorite       5
shawarma       5
mashi          5
place          5
served         5
cafe           4
menu           4
flavor         4
dish           4
got            4
good           4
tahini         4
falafel        4
saffron        4
tried          4
lunch          4
dtype: int64

In [None]:
# Preview the first rows with the cleanreview column added.
df.head()

Unnamed: 0,review,word_count,char_count,average_word_lenght,stopword_count,stop_word_rate,lower_case,punctuation,stopwords,cleanreview
0,In a quiet corner of inner sunset sits a might...,711,4051,4.699015,302,0.424754,in a quiet corner of inner sunset sits a might...,in a quiet corner of inner sunset sits a might...,quiet corner inner sunset sits mighty new entr...,quiet corner inner sunset sits mighty new entr...
1,Mohamed and his crew are making some absolute ...,183,999,4.459016,83,0.453552,mohamed and his crew are making some absolute ...,mohamed and his crew are making some absolute ...,mohamed crew making absolute magic went sunday...,mohamed crew making absolute magic went sunday...
2,Just tried to go here for lunch because the me...,44,251,4.727273,22,0.5,just tried to go here for lunch because the me...,just tried to go here for lunch because the me...,tried go lunch menu looks wonderful used restr...,tried lunch menu looks wonderful used restroom...
3,Where do I even start with this place.. it's s...,119,619,4.210084,64,0.537815,where do i even start with this place.. it's s...,where do i even start with this place its so s...,even start place good definitely never vegetar...,even start place good definitely never vegetar...
4,My first time at Savor did not disappoint. We ...,66,371,4.636364,30,0.454545,my first time at savor did not disappoint. we ...,my first time at savor did not disappoint we o...,first time savor disappoint ordered pulled kin...,first time savor disappoint ordered pulled kin...


In [None]:
# Lemmatize each token of the cleaned review with TextBlob's Word and re-join with spaces.
df['lemmatized'] = df['cleanreview'].apply(
    lambda text: ' '.join([Word(token).lemmatize() for token in text.split()])
)

In [None]:
# Preview the first rows with the lemmatized column added.
df.head()

Unnamed: 0,review,word_count,char_count,average_word_lenght,stopword_count,stop_word_rate,lower_case,punctuation,stopwords,cleanreview,lemmatized
0,In a quiet corner of inner sunset sits a might...,711,4051,4.699015,302,0.424754,in a quiet corner of inner sunset sits a might...,in a quiet corner of inner sunset sits a might...,quiet corner inner sunset sits mighty new entr...,quiet corner inner sunset sits mighty new entr...,quiet corner inner sunset sits mighty new entr...
1,Mohamed and his crew are making some absolute ...,183,999,4.459016,83,0.453552,mohamed and his crew are making some absolute ...,mohamed and his crew are making some absolute ...,mohamed crew making absolute magic went sunday...,mohamed crew making absolute magic went sunday...,mohamed crew making absolute magic went sunday...
2,Just tried to go here for lunch because the me...,44,251,4.727273,22,0.5,just tried to go here for lunch because the me...,just tried to go here for lunch because the me...,tried go lunch menu looks wonderful used restr...,tried lunch menu looks wonderful used restroom...,tried lunch menu look wonderful used restroom ...
3,Where do I even start with this place.. it's s...,119,619,4.210084,64,0.537815,where do i even start with this place.. it's s...,where do i even start with this place its so s...,even start place good definitely never vegetar...,even start place good definitely never vegetar...,even start place good definitely never vegetar...
4,My first time at Savor did not disappoint. We ...,66,371,4.636364,30,0.454545,my first time at savor did not disappoint. we ...,my first time at savor did not disappoint we o...,first time savor disappoint ordered pulled kin...,first time savor disappoint ordered pulled kin...,first time savor disappoint ordered pulled kin...


In [None]:
# TextBlob polarity score for each lemmatized review.
# NOTE(review): stop-word removal has already dropped negations (e.g. "did not
# disappoint" became "disappoint"), which may skew this score — consider
# scoring the raw review text instead.
df['polarity'] = df['lemmatized'].apply(lambda text: TextBlob(text).sentiment.polarity)

In [None]:
# TextBlob subjectivity score for each lemmatized review.
df['subjectivity'] = df['lemmatized'].apply(lambda text: TextBlob(text).sentiment.subjectivity)

In [None]:
# Preview the first rows with the polarity and subjectivity columns added.
df.head()

Unnamed: 0,review,word_count,char_count,average_word_lenght,stopword_count,stop_word_rate,lower_case,punctuation,stopwords,cleanreview,lemmatized,polarity,subjectivity
0,In a quiet corner of inner sunset sits a might...,711,4051,4.699015,302,0.424754,in a quiet corner of inner sunset sits a might...,in a quiet corner of inner sunset sits a might...,quiet corner inner sunset sits mighty new entr...,quiet corner inner sunset sits mighty new entr...,quiet corner inner sunset sits mighty new entr...,0.291955,0.593513
1,Mohamed and his crew are making some absolute ...,183,999,4.459016,83,0.453552,mohamed and his crew are making some absolute ...,mohamed and his crew are making some absolute ...,mohamed crew making absolute magic went sunday...,mohamed crew making absolute magic went sunday...,mohamed crew making absolute magic went sunday...,0.46321,0.689062
2,Just tried to go here for lunch because the me...,44,251,4.727273,22,0.5,just tried to go here for lunch because the me...,just tried to go here for lunch because the me...,tried go lunch menu looks wonderful used restr...,tried lunch menu looks wonderful used restroom...,tried lunch menu look wonderful used restroom ...,0.171429,0.808929
3,Where do I even start with this place.. it's s...,119,619,4.210084,64,0.537815,where do i even start with this place.. it's s...,where do i even start with this place its so s...,even start place good definitely never vegetar...,even start place good definitely never vegetar...,even start place good definitely never vegetar...,0.284762,0.625952
4,My first time at Savor did not disappoint. We ...,66,371,4.636364,30,0.454545,my first time at savor did not disappoint. we ...,my first time at savor did not disappoint we o...,first time savor disappoint ordered pulled kin...,first time savor disappoint ordered pulled kin...,first time savor disappoint ordered pulled kin...,0.0275,0.568333


In [None]:
# Number of tokens left in each review after both filtering passes.
df['clean_review_word_count'] = df['cleanreview'].apply(lambda text: len(text.split()))

In [None]:
# Fraction of the original tokens that survive cleaning.
df['clean_review_word_rate'] = df['clean_review_word_count'].div(df['word_count'])

In [None]:
# Rank reviews by how much of their text survives cleaning, highest first.
df.sort_values('clean_review_word_rate', ascending=False)

Unnamed: 0,review,word_count,char_count,average_word_lenght,stopword_count,stop_word_rate,lower_case,punctuation,stopwords,cleanreview,lemmatized,clean_review_word_count,clean_review_word_rate,polarity,subjectivity
8,It's a great experience. The ambiance is very ...,23,162,6.043478,9,0.391304,it's a great experience. the ambiance is very ...,its a great experience the ambiance is very we...,great experience ambiance welcoming charming a...,great experience ambiance welcoming charming a...,great experience ambiance welcoming charming a...,14,0.608696,0.555,0.88
7,This isn't the same as the Savor that was on 2...,245,1438,4.869388,98,0.4,this isn't the same as the savor that was on 2...,this isnt the same as the savor that was on 24...,isnt savor 24th noe valley distinct memory cho...,isnt savor 24th noe valley distinct memory cho...,isnt savor 24th noe valley distinct memory cho...,135,0.55102,0.128497,0.476116
0,In a quiet corner of inner sunset sits a might...,711,4051,4.699015,302,0.424754,in a quiet corner of inner sunset sits a might...,in a quiet corner of inner sunset sits a might...,quiet corner inner sunset sits mighty new entr...,quiet corner inner sunset sits mighty new entr...,quiet corner inner sunset sits mighty new entr...,384,0.540084,0.291955,0.593513
4,My first time at Savor did not disappoint. We ...,66,371,4.636364,30,0.454545,my first time at savor did not disappoint. we ...,my first time at savor did not disappoint we o...,first time savor disappoint ordered pulled kin...,first time savor disappoint ordered pulled kin...,first time savor disappoint ordered pulled kin...,35,0.530303,0.0275,0.568333
6,Omg Savor is such a treasure to have in the in...,208,1177,4.663462,92,0.442308,omg savor is such a treasure to have in the in...,omg savor is such a treasure to have in the in...,omg savor treasure inner sunset san francisco ...,omg savor treasure inner sunset san francisco ...,omg savor treasure inner sunset san francisco ...,107,0.514423,0.323214,0.545982
1,Mohamed and his crew are making some absolute ...,183,999,4.459016,83,0.453552,mohamed and his crew are making some absolute ...,mohamed and his crew are making some absolute ...,mohamed crew making absolute magic went sunday...,mohamed crew making absolute magic went sunday...,mohamed crew making absolute magic went sunday...,94,0.513661,0.46321,0.689062
5,After years of going to the Carlton Hotel for ...,263,1453,4.528517,125,0.475285,after years of going to the carlton hotel for ...,after years of going to the carlton hotel for ...,years going carlton hotel saha berkeley locati...,years going carlton hotel saha berkeley locati...,year going carlton hotel saha berkeley locatio...,131,0.498099,0.227198,0.570788
2,Just tried to go here for lunch because the me...,44,251,4.727273,22,0.5,just tried to go here for lunch because the me...,just tried to go here for lunch because the me...,tried go lunch menu looks wonderful used restr...,tried lunch menu looks wonderful used restroom...,tried lunch menu look wonderful used restroom ...,20,0.454545,0.171429,0.808929
3,Where do I even start with this place.. it's s...,119,619,4.210084,64,0.537815,where do i even start with this place.. it's s...,where do i even start with this place its so s...,even start place good definitely never vegetar...,even start place good definitely never vegetar...,even start place good definitely never vegetar...,49,0.411765,0.284762,0.625952


In [None]:
# Drop the intermediate working columns. Rebinding instead of inplace=True,
# together with errors='ignore', lets this cell be re-run without a KeyError
# once the columns are already gone.
df = df.drop(
    columns=['lower_case', 'stopword_count', 'char_count', 'punctuation', 'stopwords', 'lemmatized'],
    errors='ignore',
)

In [None]:
# Preview the first rows of the frame after the intermediate columns were dropped.
df.head()

Unnamed: 0,review,word_count,average_word_lenght,stop_word_rate,cleanreview,clean_review_word_count,clean_review_word_rate,polarity,subjectivity
0,In a quiet corner of inner sunset sits a might...,711,4.699015,0.424754,quiet corner inner sunset sits mighty new entr...,384,0.540084,0.291955,0.593513
1,Mohamed and his crew are making some absolute ...,183,4.459016,0.453552,mohamed crew making absolute magic went sunday...,94,0.513661,0.46321,0.689062
2,Just tried to go here for lunch because the me...,44,4.727273,0.5,tried lunch menu looks wonderful used restroom...,20,0.454545,0.171429,0.808929
3,Where do I even start with this place.. it's s...,119,4.210084,0.537815,even start place good definitely never vegetar...,49,0.411765,0.284762,0.625952
4,My first time at Savor did not disappoint. We ...,66,4.636364,0.454545,first time savor disappoint ordered pulled kin...,35,0.530303,0.0275,0.568333


In [None]:
# Rank reviews from the most positive polarity score to the least.
df.sort_values('polarity', ascending=False)

Unnamed: 0,review,word_count,average_word_lenght,stop_word_rate,cleanreview,clean_review_word_count,clean_review_word_rate,polarity,subjectivity
8,It's a great experience. The ambiance is very ...,23,6.043478,0.391304,great experience ambiance welcoming charming a...,14,0.608696,0.555,0.88
1,Mohamed and his crew are making some absolute ...,183,4.459016,0.453552,mohamed crew making absolute magic went sunday...,94,0.513661,0.46321,0.689062
6,Omg Savor is such a treasure to have in the in...,208,4.663462,0.442308,omg savor treasure inner sunset san francisco ...,107,0.514423,0.323214,0.545982
0,In a quiet corner of inner sunset sits a might...,711,4.699015,0.424754,quiet corner inner sunset sits mighty new entr...,384,0.540084,0.291955,0.593513
3,Where do I even start with this place.. it's s...,119,4.210084,0.537815,even start place good definitely never vegetar...,49,0.411765,0.284762,0.625952
5,After years of going to the Carlton Hotel for ...,263,4.528517,0.475285,years going carlton hotel saha berkeley locati...,131,0.498099,0.227198,0.570788
2,Just tried to go here for lunch because the me...,44,4.727273,0.5,tried lunch menu looks wonderful used restroom...,20,0.454545,0.171429,0.808929
7,This isn't the same as the Savor that was on 2...,245,4.869388,0.4,isnt savor 24th noe valley distinct memory cho...,135,0.55102,0.128497,0.476116
4,My first time at Savor did not disappoint. We ...,66,4.636364,0.454545,first time savor disappoint ordered pulled kin...,35,0.530303,0.0275,0.568333
