In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
sns.set()

In [2]:
# Read the tab-separated restaurant reviews file into a DataFrame
reviews = pd.read_csv(
    'datasets/Restaurant_Reviews.tsv',
    sep='\t',
)

# Preview the first rows
reviews.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [90]:
# Merge every review with Liked == 1 into a single comma-separated string
separator = ', '
positive_texts = reviews.loc[reviews['Liked'] == 1, 'Review'].values
joined_positive_texts = separator.join(positive_texts)

In [91]:
# Merge every review with Liked == 0 into a single comma-separated string
separator = ', '
negative_texts = reviews.loc[reviews['Liked'] == 0, 'Review'].values
joined_negative_texts = separator.join(negative_texts)

In [71]:
import spacy
from collections import Counter

In [72]:
# Load the spaCy "en_core_web_lg" pipeline once; get_nouns and find_phrases
# below call this shared `nlp` object to tokenize and tag the review text.
# NOTE(review): the model package must already be installed in the environment.
nlp = spacy.load("en_core_web_lg")

In [92]:
# function that returns a dataframe with the most common nouns from given texts
def get_nouns(texts, top_n=20, pipeline=None):
    """Return the most frequent noun lemmas found in ``texts``.

    Parameters
    ----------
    texts : str
        Text to analyse; it is passed unchanged to the pipeline.
    top_n : int, optional
        How many of the most common nouns to keep. Defaults to 20, the value
        that used to be hard-coded.
    pipeline : callable, optional
        Callable that turns ``texts`` into an iterable of tokens exposing
        ``is_stop``, ``is_punct``, ``pos_`` and ``lemma_``. When omitted, the
        notebook-level ``nlp`` object is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``'nouns'`` and ``'count'``, most frequent noun first. The
        frame is empty (but keeps both columns) when no noun is found.
    """
    if pipeline is None:
        # fall back to the pipeline loaded at notebook level
        pipeline = nlp

    # turn text into nlp object that has tokens
    doc = pipeline(texts)

    # keep the lemma of every noun that is neither a stop word nor punctuation
    nouns = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct and token.pos_ == "NOUN"
    ]

    # count the occurrences of each noun and keep the top_n most common
    common_nouns = Counter(nouns).most_common(top_n)

    # create a dataframe from the (noun, count) pairs
    return pd.DataFrame(common_nouns, columns=['nouns', 'count'])

In [93]:
# Most frequent nouns across the positive reviews, with a preview of the first rows
positive_df = get_nouns(texts=joined_positive_texts)
positive_df.head(n=5)

Unnamed: 0,nouns,count
0,place,60
1,food,53
2,service,39
3,time,26
4,restaurant,17


In [94]:
# Bar chart of noun frequencies in positive reviews, one colour per noun;
# update_xaxes returns the same figure, so it can be chained onto px.bar
fig = px.bar(
    positive_df,
    x='nouns',
    y='count',
    color='nouns',
    title='Top 20 nouns used in positive reviews',
).update_xaxes(tickangle=45)
fig.show()

In [95]:
# Most frequent nouns across the negative reviews, with a preview of the first rows
negative_df = get_nouns(texts=joined_negative_texts)
negative_df.head(n=5)

Unnamed: 0,nouns,count
0,food,64
1,place,50
2,service,38
3,time,29
4,minute,19


In [96]:
# Bar chart of noun frequencies in negative reviews, one colour per noun;
# update_xaxes returns the same figure, so it can be chained onto px.bar
fig = px.bar(
    negative_df,
    x='nouns',
    y='count',
    color='nouns',
    title='Top 20 nouns used in negative reviews',
).update_xaxes(tickangle=45)
fig.show()

In [83]:
# function that returns a dataframe with the most common phrases from given texts
def find_phrases(texts, top_n=20, pipeline=None):
    """Return the most frequent two-word "modifier + noun" phrases in ``texts``.

    A phrase is recorded when a token is tagged ``ADJ`` or ``NUM``, is neither
    a stop word nor punctuation, and the token right after it is tagged
    ``NOUN`` (for example "good food" or "5 star"). The phrase is built from
    the lemmas of the two tokens.

    Parameters
    ----------
    texts : str
        Text to analyse; it is passed unchanged to the pipeline.
    top_n : int, optional
        How many of the most common phrases to keep. Defaults to 20, the value
        that used to be hard-coded.
    pipeline : callable, optional
        Callable that turns ``texts`` into an indexable, sized sequence of
        tokens exposing ``is_stop``, ``is_punct``, ``pos_`` and ``lemma_``.
        When omitted, the notebook-level ``nlp`` object is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``'phrases'`` and ``'count'``, most frequent phrase first. The
        frame is empty (but keeps both columns) when no phrase is found.
    """
    if pipeline is None:
        # fall back to the pipeline loaded at notebook level
        pipeline = nlp

    # turn text into nlp object that has tokens
    doc = pipeline(texts)
    bigram_phrases = []

    # Visit every (token, next token) pair. Stopping at len(doc) - 1 keeps
    # doc[i + 1] in range while still examining the final pair; the loop does
    # nothing for documents with fewer than two tokens.
    for i in range(len(doc) - 1):
        token = doc[i]
        next_token = doc[i + 1]

        # only keep the pair if the current token is not a stop word or a
        # punctuation sign, is an ADJECTIVE or a NUMBER, and the next token is
        # a NOUN -- example: good food, 5 star
        if (not token.is_stop and not token.is_punct
                and token.pos_ in ("ADJ", "NUM")
                and next_token.pos_ == "NOUN"):
            # join the 2 lemmas and add them to the list of phrases
            bigram_phrases.append(token.lemma_ + ' ' + next_token.lemma_)

    # count the occurrences of each phrase and keep the top_n most common
    common_phrases = Counter(bigram_phrases).most_common(top_n)

    # create a dataframe from the (phrase, count) pairs
    return pd.DataFrame(common_phrases, columns=['phrases', 'count'])

In [84]:
# get most common phrases for positive reviews
positive_phrases = find_phrases(joined_positive_texts)
positive_phrases.head()

Unnamed: 0,phrases,count
0,great food,7
1,great place,6
2,great service,5
3,good food,5
4,5 star,5


In [85]:
# Bar chart of phrase frequencies in positive reviews, one colour per phrase;
# update_xaxes returns the same figure, so it can be chained onto px.bar
fig = px.bar(
    positive_phrases,
    x='phrases',
    y='count',
    color='phrases',
    title='Top 20 phrases used in positive reviews',
).update_xaxes(tickangle=45)
fig.show()

In [86]:
# get most common phrases for negative reviews
negative_phrases = find_phrases(joined_negative_texts)
negative_phrases.head()

Unnamed: 0,phrases,count
0,10 minute,4
1,zero star,4
2,bad food,4
3,good food,3
4,mediocre food,3


In [89]:
# Bar chart of phrase frequencies in negative reviews, one colour per phrase;
# update_xaxes returns the same figure, so it can be chained onto px.bar
fig = px.bar(
    negative_phrases,
    x='phrases',
    y='count',
    color='phrases',
    title='Top 20 phrases used in negative reviews',
).update_xaxes(tickangle=45)
fig.show()