## Processing data

In [1]:
import pandas as pd
reviews = pd.read_parquet("reviews2.parquet", columns = ["review_id", "business_id", "user_id", "stars", "text", "date"])  # cleaned version
business = pd.read_parquet("business.parquet")  # changed cleaner version to parquet in earlier file.
users = pd.read_parquet("users2.parquet", columns = ["user_id", "name", "review_count","yelping_since"])  # unchanged.

In [5]:
merged_df = reviews.merge(users, on='user_id', how='left', suffixes = ['_review', '_user'])
merged_df_numeric = merged_df.drop(columns=['review_id', 'user_id', 'business_id', 'text', 'name', 
                                         'yelping_since', 'date'])
merged_df_numeric = merged_df_numeric.dropna()

In [None]:
merged_df['text']

In [7]:
#lower text
df_text = pd.DataFrame(merged_df.text.str.lower())
df_text = df_text.sample(10000, random_state=42)

## Cleaning text

### Tokenization: Text to Tokens

In [5]:
!pip install nltk



In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.casual import TweetTokenizer
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize, WhitespaceTokenizer, RegexpTokenizer
from nltk.corpus import words
from nltk.tokenize import SyllableTokenizer

# download package
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('words')

# upload stopwords and string
punctuations = set(string.punctuation)
english_words = set(words.words())

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package words to /home/jupyter/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [9]:
tokenized = [word_tokenize(t) for t in df_text['text']] 
tokenized[0]

['ok',
 ',',
 'let',
 'me',
 'start',
 'off',
 'by',
 'acknowledging',
 'i',
 'know',
 'how',
 'stressful',
 'restaurant',
 'week',
 'is',
 'for',
 'servers',
 'who',
 'are',
 'underpaid',
 'and',
 'overworked',
 '.',
 'my',
 'experience',
 'last',
 'night',
 'was',
 'absolutely',
 'incredible',
 '.',
 'we',
 'made',
 'reservations',
 'on',
 'open',
 'table',
 'for',
 '8',
 "o'clock",
 'for',
 'the',
 'restaurant',
 'week',
 'special',
 '.',
 'we',
 'were',
 'offered',
 'inside',
 'or',
 'outside',
 'seating',
 'and',
 'opted',
 'for',
 'the',
 'latter',
 '.',
 'the',
 "'porch",
 "'",
 'set',
 'the',
 'tone',
 'for',
 'a',
 'wonderful',
 'experience',
 '-',
 'think',
 'southern',
 'wraparound',
 'porch',
 'with',
 'an',
 'aesthetically',
 'pleasing',
 'faux',
 'ivy',
 'wall',
 'and',
 'high',
 'top',
 'seating',
 '.',
 'the',
 'perfect',
 'spot',
 'for',
 'a',
 'afternoon',
 'cocktail',
 'in',
 'the',
 'summer',
 '!',
 'our',
 'server',
 '(',
 'wish',
 'i',
 'remembered',
 'his',
 'nam

### Stopword + punctuations

In [13]:
import string
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)
tokenized_filtered = [[word for word in sentence if word.lower() not in stop_words and word not in punctuations] for sentence in tokenized]

print(tokenized_filtered[0])

['ok', 'let', 'start', 'acknowledging', 'know', 'stressful', 'restaurant', 'week', 'servers', 'underpaid', 'overworked', 'experience', 'last', 'night', 'absolutely', 'incredible', 'made', 'reservations', 'open', 'table', '8', "o'clock", 'restaurant', 'week', 'special', 'offered', 'inside', 'outside', 'seating', 'opted', 'latter', "'porch", 'set', 'tone', 'wonderful', 'experience', 'think', 'southern', 'wraparound', 'porch', 'aesthetically', 'pleasing', 'faux', 'ivy', 'wall', 'high', 'top', 'seating', 'perfect', 'spot', 'afternoon', 'cocktail', 'summer', 'server', 'wish', 'remembered', 'name', 'excellent', 'almost', 'immediately', 'acknowledged', 'picked', 'drink', 'orders', 'queen', 'anne', "'s", 'revenge', 'twist', 'old', 'fashion', 'drink', 'faint', 'heart', 'thankfully', 'needed', 'stiff', 'drink', 'first', 'course', 'fried', 'green', 'tomatoes', 'crushpuppies', 'fried', 'green', 'tomatoes', 'delicious', 'remoulade', 'little', 'pickled', 'shrimp', 'right', 'top', 'crushpuppies', 'yu

### merge back

In [17]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()  

tokenized_filtered_stemmed = []
for tokenized_sentence in tokenized_filtered: 
    tokenized_filtered_stemmed.append([ps.stem(word) for word in tokenized_sentence])

tokenized_filtered_stemmed[0:1]


[['ok',
  'let',
  'start',
  'acknowledg',
  'know',
  'stress',
  'restaur',
  'week',
  'server',
  'underpaid',
  'overwork',
  'experi',
  'last',
  'night',
  'absolut',
  'incred',
  'made',
  'reserv',
  'open',
  'tabl',
  '8',
  "o'clock",
  'restaur',
  'week',
  'special',
  'offer',
  'insid',
  'outsid',
  'seat',
  'opt',
  'latter',
  "'porch",
  'set',
  'tone',
  'wonder',
  'experi',
  'think',
  'southern',
  'wraparound',
  'porch',
  'aesthet',
  'pleas',
  'faux',
  'ivi',
  'wall',
  'high',
  'top',
  'seat',
  'perfect',
  'spot',
  'afternoon',
  'cocktail',
  'summer',
  'server',
  'wish',
  'rememb',
  'name',
  'excel',
  'almost',
  'immedi',
  'acknowledg',
  'pick',
  'drink',
  'order',
  'queen',
  'ann',
  "'s",
  'reveng',
  'twist',
  'old',
  'fashion',
  'drink',
  'faint',
  'heart',
  'thank',
  'need',
  'stiff',
  'drink',
  'first',
  'cours',
  'fri',
  'green',
  'tomato',
  'crushpuppi',
  'fri',
  'green',
  'tomato',
  'delici',
  'rem

In [27]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

for tokenized_sentence in tokenized_filtered_stemmed:
   print(TreebankWordDetokenizer().detokenize(tokenized_sentence))

ok let start acknowledg know stress restaur week server underpaid overwork experi last night absolut incred made reserv open tabl 8 o'clock restaur week special offer insid outsid seat opt latter 'porch set tone wonder experi think southern wraparound porch aesthet pleas faux ivi wall high top seat perfect spot afternoon cocktail summer server wish rememb name excel almost immedi acknowledg pick drink order queen ann's reveng twist old fashion drink faint heart thank need stiff drink first cours fri green tomato crushpuppi fri green tomato delici remoulad littl pickl shrimp right top crushpuppi yummi sweet savori combin allow sweet butteri cornbread pair realli well crunchi outer shell next second cours 'she-crab soup watermelon salad're accustom crab soup shore littl nervou wouldn't meet standard soup salad exceed expect soup came hearti almost overwhelm bowl salad beauti present amaz sweet juici watermelon point'm rave much want come back even sent group text famili let know tri plac

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



## Frequency-based Vectorization

### BoW

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

df_text["clean_text"] = [" ".join(sentence) for sentence in tokenized_filtered_stemmed]

In [31]:
cv = CountVectorizer()
cv.fit(df_text["clean_text"])
print('number of `tokens`', len(cv.vocabulary_))
cv.vocabulary_

number of `tokens` 18622


{'ok': 11612,
 'let': 9627,
 'start': 15655,
 'acknowledg': 761,
 'know': 9291,
 'stress': 15836,
 'restaur': 13769,
 'week': 17978,
 'server': 14657,
 'underpaid': 17243,
 'overwork': 11923,
 'experi': 6009,
 'last': 9466,
 'night': 11307,
 'absolut': 699,
 'incred': 8473,
 'made': 10039,
 'reserv': 13739,
 'open': 11704,
 'tabl': 16184,
 'clock': 3660,
 'special': 15436,
 'offer': 11581,
 'insid': 8581,
 'outsid': 11845,
 'seat': 14539,
 'opt': 11724,
 'latter': 9479,
 'porch': 12784,
 'set': 14668,
 'tone': 16757,
 'wonder': 18234,
 'think': 16548,
 'southern': 15375,
 'wraparound': 18305,
 'aesthet': 844,
 'pleas': 12650,
 'faux': 6193,
 'ivi': 8772,
 'wall': 17867,
 'high': 7940,
 'top': 16787,
 'perfect': 12335,
 'spot': 15531,
 'afternoon': 874,
 'cocktail': 3731,
 'summer': 15988,
 'wish': 18199,
 'rememb': 13665,
 'name': 11107,
 'excel': 5967,
 'almost': 1027,
 'immedi': 8384,
 'pick': 12479,
 'drink': 5359,
 'order': 11745,
 'queen': 13257,
 'ann': 1188,
 'reveng': 13815,
 '

In [35]:
import pandas as pd
df_word_freq = pd.DataFrame(sorted_word_freq, columns=['word', 'frequency'])
df_word_freq_sorted = df_word_freq.sort_values(by='frequency', ascending=False)
print(df_word_freq_sorted.head(10))



     word  frequency
0    food       5292
1   place       5068
2    good       4809
3   great       4482
4    time       4000
5      go       3699
6     get       3654
7   order       3613
8    like       3478
9  servic       3380


## Filtering for a specific type of business

In [8]:
business['categories'].unique()

array(['Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists',
       'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services',
       'Department Stores, Shopping, Fashion, Home & Garden, Electronics, Furniture Stores',
       ...,
       'Shopping, Jewelry, Piercing, Toy Stores, Beauty & Spas, Accessories, Fashion',
       'Fitness/Exercise Equipment, Eyewear & Opticians, Shopping, Sporting Goods, Bikes',
       'Beauty & Spas, Permanent Makeup, Piercing, Tattoo'], dtype=object)

In [9]:
cafes = business[business['categories'].str.contains('Cafes', case = False, na = False)]

In [10]:
cafes

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
20,WKMJwqnfZKsAae75RMP6jA,Roast Coffeehouse and Wine Bar,10359 104 Street NW,Edmonton,AB,T5J 1B9,53.546045,-113.499169,4.0,40,0,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Coffee & Tea, Food, Cafes, Bars, Wine Bars, Re...","{'Friday': '8:0-18:0', 'Monday': '8:0-18:0', '..."
46,JX4tUpd09YFchLBuI43lGw,Naked Cyber Cafe & Espresso Bar,10303 108 Street NW,Edmonton,AB,T5J 1L7,53.544682,-113.506589,4.0,12,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Arts & Entertainment, Music Venues, Internet S...","{'Friday': '11:0-1:0', 'Monday': '11:0-1:0', '..."
47,lk9IwjZXqUMqqOhM774DtQ,Caviar & Bananas,2031 Broadway,Nashville,TN,37203,36.148371,-86.798895,3.5,159,0,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Coffee & Tea, Restaurants, Wine Bars, Bars, Ni...","{'Friday': '7:0-17:0', 'Monday': '7:0-17:0', '..."
53,cVBxfMC4lp3DnocjYA3FHQ,Paws The Cat Cafe,10588 109 Street,Edmonton,AB,T5H 3B2,53.549633,-113.508780,5.0,20,0,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Coffee & Tea, Cafes, Pets, Restaurants, Pet Ad...","{'Friday': '10:0-20:0', 'Monday': '0:0-0:0', '..."
99,1MeIwdbTnZOBFCKOrgaxuw,Ricardo's Italian Cafe,1931 Park Ave,Saint Louis,MO,63104,38.617272,-90.212784,3.5,80,0,"{'AcceptsInsurance': None, 'AgesAllowed': None...","American (New), Restaurants, Cafes, Italian, A...","{'Friday': '16:0-22:0', 'Monday': '11:0-14:0',..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150232,Scd-rcsQCn60t1sHHFv-og,First Watch,"4045 N Tyrone Blvd, Ste 204",St. Petersburg,FL,33709,27.808314,-82.752110,3.5,183,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Cafes, Restaurants, Breakfast & Brunch, Americ...","{'Friday': '7:0-14:30', 'Monday': '0:0-0:0', '..."
150261,Y0TWCjiiXlFi2XO2cFrUzQ,Connexxions Cafe,12644 137 Avenue NW,Edmonton,AB,T5L 4Y5,53.600573,-113.540273,4.5,13,0,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Hobby Shops, Tabletop Games, Coffee & Tea, Foo...",
150269,2dVJ7R-3JMmu2v4DJYtBbw,Spring Mount Hotel,3 Main St,Schwenksville,PA,19473,40.275532,-75.456772,2.0,5,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Nightlife, Cafes, Hotels, Bars, Hotels & Trave...","{'Friday': '11:0-2:0', 'Monday': '11:0-0:0', '..."
150271,BIyT7Kr7tMJqlfp4oOOYQg,Copper Bell Cafe,11228 Boyette Rd,Riverview,FL,33569,27.853745,-82.316887,3.5,49,0,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Breakfast & Brunch, Cafes, Restaurants","{'Friday': '7:30-14:30', 'Monday': '7:30-14:30..."


In [11]:
merged_df['cafe_review'] = merged_df["business_id"].isin(cafes['business_id'])

In [12]:
cafe_reviews = merged_df[merged_df["cafe_review"]==True]

In [13]:
cafe_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 294728 entries, 11 to 6989545
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   review_id      294728 non-null  object        
 1   business_id    294728 non-null  object        
 2   user_id        294728 non-null  object        
 3   stars          294728 non-null  int64         
 4   text           294728 non-null  object        
 5   date           294728 non-null  datetime64[ns]
 6   name           294727 non-null  object        
 7   review_count   294727 non-null  float64       
 8   yelping_since  294727 non-null  object        
 9   cafe_review    294728 non-null  bool          
dtypes: bool(1), datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 22.8+ MB


In [14]:
cafe_reviews['text']

11         Locals recommended Milktooth, and it's an amaz...
19         My absolute favorite cafe in the city. Their b...
30         First time there and it was excellent!!! It fe...
43         Stopped by after a Sunday morning walk in the ...
44         In a word... "OVERRATED!".  The food took fore...
                                 ...                        
6989508    I am surprised by the recent negative reviews....
6989513    The pastries are made in house so I cannot wai...
6989522    Eh.  It was just ok.  While visiting Tampa som...
6989525    Fiddlehead Fern Cafe keeps getting better and ...
6989545    Please come hungry!!!! Every time I've been in...
Name: text, Length: 294728, dtype: object

In [15]:
cr_text = pd.DataFrame(cafe_reviews.text.str.lower())
cr_text

Unnamed: 0,text
11,"locals recommended milktooth, and it's an amaz..."
19,my absolute favorite cafe in the city. their b...
30,first time there and it was excellent!!! it fe...
43,stopped by after a sunday morning walk in the ...
44,"in a word... ""overrated!"". the food took fore..."
...,...
6989508,i am surprised by the recent negative reviews....
6989513,the pastries are made in house so i cannot wai...
6989522,eh. it was just ok. while visiting tampa som...
6989525,fiddlehead fern cafe keeps getting better and ...


In [16]:
casual_tknzr = TweetTokenizer(reduce_len=True)
cafe_tokenized = [casual_tknzr.tokenize(t) for t in cr_text['text']] 
cafe_tokenized[0]

['locals',
 'recommended',
 'milktooth',
 ',',
 'and',
 "it's",
 'an',
 'amazing',
 'jewel',
 'of',
 'indianapolis',
 '.',
 "i'm",
 'glade',
 'i',
 'had',
 'the',
 'chance',
 'to',
 'experience',
 'this',
 '.']

In [17]:
english_words = set(words.words())
stop_words = set(stopwords.words('english'))
#punctuations = set(string.punctuation) Interested in removing these to see if it helps. if not, paste -> into list comprehension. ||| and word not in punctuations
cafe_txt_filtered = [[word for word in sentence if word.lower() not in stop_words and word.lower() in english_words] for sentence in cafe_tokenized]

print(cafe_txt_filtered[1])

['absolute', 'favorite', 'city', 'black', 'white', 'probably', 'best', 'ever', 'sweet', 'right', 'amount', 'foam', 'always', 'really', 'good', 'even', 'people', 'lot', 'space', 'work', 'noise', 'level', 'perfect', 'music', 'perfect', 'level', 'always', 'enjoy', 'bring', 'dogs', 'keep', 'giving', 'business', 'long']


In [18]:
cr_text["clean_text"] = [" ".join(sentence) for sentence in cafe_txt_filtered]
cv = CountVectorizer()
t = cv.fit_transform(cr_text["clean_text"])

In [19]:
freq_array = t.sum(axis=0).A1
cafe_token_freq = pd.DataFrame({"token": cv.get_feature_names_out(), 'freq_count': freq_array})

In [20]:
cafe_token_freq.sort_values(by='freq_count', ascending=False)

Unnamed: 0,token,freq_count
8264,food,183271
15625,place,158567
9115,good,151256
9281,great,139714
3917,coffee,101556
...,...,...
8164,flossing,1
2172,bluntness,1
2173,blup,1
20689,stum,1
