In [1]:
import sqlite3
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline

In [2]:
conn = sqlite3.connect('yelp_dataset_reviews.db') 
df_reviews= pd.read_sql("""SELECT * FROM reviews""",conn )
conn.close()

In [3]:
conn = sqlite3.connect('yelp_dataset_business.db') 
df_business= pd.read_sql("""SELECT business_id,name,stars,review_count,is_open,city
                        FROM business""",
                        conn )

conn.close()

In [4]:
conn = sqlite3.connect('yelp_dataset_tips.db') 
df_tips= pd.read_sql("""SELECT * FROM tips""",conn )
conn.close()

In [5]:
df_reviews = df_reviews[["business_id","text"]]

In [6]:
df_reviews["text"][3]

"Wow!  Yummy, different,  delicious.   Our favorite is the lamb curry and korma.  With 10 different kinds of naan!!!  Don't let the outside deter you (because we almost changed our minds)...go in and try something new!   You'll be glad you did!"

# Approach 1 : LLAMA2

In [20]:
import replicate
import os

In [21]:
replicate = replicate.Client(api_token=os.getenv("REPLICATE_API_TOKEN"))

In [46]:
output = replicate.run(
  "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
  input={
    "prompt": "Family diner. Had the buffet. Eclectic assortment: a large chicken leg, fried jalapeño, tamale, two rolled grape leaves, fresh melon. All good. Lots of Mexican choices there. Also has a menu with breakfast served all day long. Friendly, attentive staff. Good place for a casual relaxed meal with no expectations. Next to the Clarion Hotel.Identify and list all the dish names mentioned in this review"
  }
)
print(output)

<generator object Prediction.output_iterator at 0x00000212356C52A0>


In [47]:
for i in output:
    print(i,end="")

 Sure! Here are the dish names mentioned in the review:

* Large chicken leg
* Fried jalapeño
* Tamale
* Rolled grape leaves (2)
* Fresh melon

# Approach 2: count n-gram

In [7]:
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia
from tqdm.notebook import tqdm
from wordcloud import WordCloud, STOPWORDS 
import string
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter
nltk.download('vader_lexicon')
nltk.download('punkt')
sid =sia()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dry19\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dry19\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [64]:
df_reviews

Unnamed: 0,business_id,text
0,XQfwVwDr-v0ZS3_CbbE5Xw,"If you decide to eat here, just be aware it is..."
1,7ATYjTIgM3jUlt4UM3IypQ,I've taken a lot of spin classes over the year...
2,YjUWPpI6HXG530lwP-fb2A,Family diner. Had the buffet. Eclectic assortm...
3,kxX2SOes4o-D3ZQBkiMRfA,"Wow! Yummy, different, delicious. Our favo..."
4,e4Vwtrqf-wpJfwesgvdgxQ,Cute interior and owner (?) gave us tour of up...
...,...,...
6990275,jals67o91gcrD4DC81Vk6w,Latest addition to services from ICCU is Apple...
6990276,2vLksaMmSEcGbjI5gywpZA,"This spot offers a great, affordable east week..."
6990277,R1khUUxidqfaJmcpmGd4aw,This Home Depot won me over when I needed to g...
6990278,Rr9kKArrMhSLVE9a53q-aA,For when I'm feeling like ignoring my calorie-...


In [65]:
# df_reviews = df_reviews.groupby('business_id')['text'].agg(list).reset_index()

In [8]:
df_selected = df_business[(df_business['stars'] >=4) & (df_business['review_count'] >= 1000)].sort_values(by=['city'], ascending=False)

In [9]:
selected = df_selected.iloc[0,:].business_id

In [12]:
s_review = df_reviews[df_reviews.business_id== selected]
s_review

Unnamed: 0,business_id,text
716760,WSx9-iYYyST_umny9sJBFg,I love what they have done with the space and ...
717408,WSx9-iYYyST_umny9sJBFg,Two friends and I stopped by The Parish tonigh...
717651,WSx9-iYYyST_umny9sJBFg,The Parish is amazing! Amazing people! Amazi...
717874,WSx9-iYYyST_umny9sJBFg,Great view of the Catalinas; especially when i...
718791,WSx9-iYYyST_umny9sJBFg,While the outside of the restaurant is very un...
...,...,...
1415119,WSx9-iYYyST_umny9sJBFg,I KNOW I KNOW only 2 stars. I honestly feel t...
1415956,WSx9-iYYyST_umny9sJBFg,Great food. You will not be disappointed! They...
1416429,WSx9-iYYyST_umny9sJBFg,Just awesome food and wait staff very attentiv...
1416759,WSx9-iYYyST_umny9sJBFg,"What a great, funky atmosphere with innovative..."


In [13]:
s_review["sia"] = s_review['text'].apply(sid.polarity_scores)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s_review["sia"] = s_review['text'].apply(sid.polarity_scores)


In [14]:
s_review

Unnamed: 0,business_id,text,sia
716760,WSx9-iYYyST_umny9sJBFg,I love what they have done with the space and ...,"{'neg': 0.026, 'neu': 0.658, 'pos': 0.316, 'co..."
717408,WSx9-iYYyST_umny9sJBFg,Two friends and I stopped by The Parish tonigh...,"{'neg': 0.023, 'neu': 0.83, 'pos': 0.147, 'com..."
717651,WSx9-iYYyST_umny9sJBFg,The Parish is amazing! Amazing people! Amazi...,"{'neg': 0.0, 'neu': 0.548, 'pos': 0.452, 'comp..."
717874,WSx9-iYYyST_umny9sJBFg,Great view of the Catalinas; especially when i...,"{'neg': 0.073, 'neu': 0.51, 'pos': 0.417, 'com..."
718791,WSx9-iYYyST_umny9sJBFg,While the outside of the restaurant is very un...,"{'neg': 0.077, 'neu': 0.826, 'pos': 0.097, 'co..."
...,...,...,...
1415119,WSx9-iYYyST_umny9sJBFg,I KNOW I KNOW only 2 stars. I honestly feel t...,"{'neg': 0.043, 'neu': 0.843, 'pos': 0.114, 'co..."
1415956,WSx9-iYYyST_umny9sJBFg,Great food. You will not be disappointed! They...,"{'neg': 0.0, 'neu': 0.45, 'pos': 0.55, 'compou..."
1416429,WSx9-iYYyST_umny9sJBFg,Just awesome food and wait staff very attentiv...,"{'neg': 0.026, 'neu': 0.59, 'pos': 0.384, 'com..."
1416759,WSx9-iYYyST_umny9sJBFg,"What a great, funky atmosphere with innovative...","{'neg': 0.037, 'neu': 0.786, 'pos': 0.177, 'co..."


In [15]:
i = nltk.corpus.stopwords.words('english')
# punctuations to remove
j = list(string.punctuation)
# finally let's combine all of these
stopwords = set(i).union(j)

In [16]:
def preprocess(x):
    x = re.sub('[^a-z\s]', '', x.lower())                  # get rid of noise
    x = [w for w in x.split() if w not in set(stopwords)]  # remove stopwords
    return ' '.join(x) 

In [17]:
s_review["c_rev"] = s_review["text"].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s_review["c_rev"] = s_review["text"].apply(preprocess)


In [18]:

texts = s_review["c_rev"].to_list()

bigram_lst = []
trigram_lst =[]
for text in texts:
    tokens = word_tokenize(text)
    bigrams = ngrams(tokens, 2)
    trigrams = ngrams(tokens, 3)
    for i in bigrams:
        bigram_lst.append(" ".join(i))
    for i in trigrams:
        trigram_lst.append(" ".join(i))

bigram_count = Counter(bigram_lst)
trigram_count = Counter(trigram_lst)
# # Display the counts
# for bigram, count in bigram_count.items():
#     print(f"Bigram: {bigram}, Count: {count}")

In [19]:
top_bi= bigram_count.most_common(10)

In [20]:
top_bi

[('drunken angel', 204),
 ('hush puppies', 192),
 ('bacon popcorn', 188),
 ('frog legs', 129),
 ('shrimp grits', 122),
 ('fish chips', 117),
 ('bread pudding', 85),
 ('new orleans', 79),
 ('great food', 76),
 ('angel pasta', 74)]

In [21]:
top_tri = trigram_count.most_common(10)

In [22]:
top_tri

[('drunken angel pasta', 67),
 ('pulled pork sandwich', 29),
 ('angel hair pasta', 28),
 ('wait go back', 24),
 ('red beans rice', 24),
 ('cant wait go', 21),
 ('goat cheese relleno', 21),
 ('gumbo hush puppies', 21),
 ('shrimp po boy', 21),
 ('crawfish hush puppies', 20)]

## combine bigram and trigram

In [23]:
def is_substring(substring, string_list):
    for string in string_list:
        if substring in string[0]:
            return True
    return False

In [24]:
exist =[]
for bi in top_bi:
    if not is_substring(bi[0],top_tri):
        exist.append(bi)



In [25]:
exist.extend(top_tri)

In [26]:
exist

[('bacon popcorn', 188),
 ('frog legs', 129),
 ('shrimp grits', 122),
 ('fish chips', 117),
 ('bread pudding', 85),
 ('new orleans', 79),
 ('great food', 76),
 ('drunken angel pasta', 67),
 ('pulled pork sandwich', 29),
 ('angel hair pasta', 28),
 ('wait go back', 24),
 ('red beans rice', 24),
 ('cant wait go', 21),
 ('goat cheese relleno', 21),
 ('gumbo hush puppies', 21),
 ('shrimp po boy', 21),
 ('crawfish hush puppies', 20)]

## word cloud

In [77]:
def draw_wc(review,name):
    wc = WordCloud(width=1600, height=800, random_state=1, max_words=200000000)
    wc.generate(str(review))
    plt.figure(figsize=(20,10), facecolor='k')
    plt.title("Customers reviews about {}".format(name), fontsize=40,color='white')
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=10)
    plt.show()
    

In [None]:
draw_wc(s_review['c_rev'],"restruant")

# Approach 3: Word2vec

In [157]:
import gensim

In [156]:
tokens = df_reviews["text"][:100000].apply(word_tokenize)

In [None]:
model = gensim.models.Word2Vec.load("word2vec.model")

In [158]:
# model = gensim.models.Word2Vec(tokens, vector_size=150, window=10, min_count=3, workers=10)
# model.train(tokens,total_examples=len(tokens),epochs=10)

(83514517, 116264150)

In [159]:
model.save("word2vec.model")

In [161]:
model.wv.most_similar("curry")

[('satay', 0.6898965835571289),
 ('pad', 0.6633636951446533),
 ('masala', 0.6548245549201965),
 ('Pad', 0.6543367505073547),
 ('lemongrass', 0.6468486785888672),
 ('Curry', 0.6436727643013),
 ('roti', 0.641756534576416),
 ('mee', 0.6394912004470825),
 ('paneer', 0.6343382000923157),
 ('biryani', 0.6332923769950867)]

In [172]:
text ="Wow!  Yummy, different,  delicious.   Our favorite is the lamb curry and korma.  With 10 different kinds of naan!!!  Don't let the outside deter you (because we almost changed our minds)...go in and try something new!   You'll be glad you did!"
text = word_tokenize(preprocess(text))

similar={}
for t in text:
    if t in model.wv:
        similar[t]= model.wv.similarity(t.strip(" !."),"food")
    else:
        similar[t]=0

In [177]:
sort_simi = dict(sorted(similar.items(), key=lambda item: item[1], reverse=True))
first_10_by_keys = dict(list(sort_simi.items())[:10])
first_10_by_keys

{'naan': 0.35486144,
 'curry': 0.24955298,
 'something': 0.17871559,
 'lamb': 0.17564566,
 'outside': 0.15002188,
 'korma': 0.11275223,
 'deter': 0.0850741,
 'almost': 0.06337546,
 'favorite': 0.04751827,
 'youll': 0.040264584}

# Implement mispelling correction

In [79]:
df_business

Unnamed: 0,business_id,name,stars,review_count,is_open,city
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",5.0,7,0,Santa Barbara
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,3.0,15,1,Affton
2,tUFrWirKiKi_TAnsVWINQQ,Target,3.5,22,0,Tucson
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,4.0,80,1,Philadelphia
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,4.5,13,1,Green Lane
...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3.0,13,1,Edmonton
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,4.0,5,1,Nashville
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,3.5,8,1,Indianapolis
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,4.0,24,1,Edwardsville


In [149]:
bus_name = list(set(df_business["name"].to_list()))

In [150]:
def med_dp(str1, str2):

    def med_dp_helper(str1, str2, m, n):

        d = {}
        key = m, n

        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
        #      TODO: Complete the code by filling the "pass"         #
        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
        
        if m == 0: 
           return n
        if n == 0: 
           return m 
        if key in d: 
           return d[key]
        if str1[m - 1] == str2[n - 1]: 
          return med_dp_helper(str1, str2, m - 1, n - 1)

        # Hint: you probably want to write something here
        d[key] = min(1 + med_dp_helper(str1, str2, m, n - 1), 
                     1 + med_dp_helper(str1, str2, m - 1, n), 
                     2 + med_dp_helper(str1, str2, m - 1, n - 1))
        return d[key]

    return med_dp_helper(str1, str2, len(str1), len(str2))

In [151]:
def correct_spelling(misspelled_word, dictionary):
    closest_match = None
    min_distance = float('inf')
    cand = [item for item in dictionary if item.startswith(misspelled_word)]
    close_p = {}
    for word in cand:
       close_p[word] = med_dp(misspelled_word,word)

    sorted_d = dict(sorted(close_p.items(), key=lambda item: item[1]))
    first_10_by_keys = dict(list(sorted_d.items())[:10])
    return first_10_by_keys


In [152]:
correct_spelling("Tar",bus_name)

{'Target': 3,
 'Tartine': 4,
 'Tartufo': 4,
 'Tarboosh': 5,
 'Tara Shaw': 6,
 'Tara Yoga': 6,
 'Tarpon Inn': 7,
 'Tarahumara': 7,
 'Tarka Indian': 9,
 "Tarek's Cafe": 9,
 'TargetMaster': 9,
 'Tarpon Diner': 9,
 'Target World': 9,
 'Targetmaster': 9,
 'Tarpon Tavern': 10}