## Plan:
- Most Frequent Words/Phrases from Positive and Negative buckets (likely based on Cafe)
    - Get most frequently used words (5).
    - Gather list of occurrences.
    - Rank list based on emotions.
    - Take top 1 for each word positive and negative.
- Most Positive/Negative Review Phrases on the Drink
    - Get sentences that contain the drink name using Spacy.
    - Rank sentences based on emotions.
    - Take top 3 for positive and negative.
- Overall Reviewer Sentiment towards Drink
    - Calculate the sentiment score for each drink review.
    - Get the average value.

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from spacy.lang.en import English
import spacy
from spacy.matcher import PhraseMatcher
import heapq
from nltk.sentiment import SentimentIntensityAnalyzer
import text2emotion as te
from multiprocessing import Pool

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [36]:
# Import files and format
reviews = pd.read_json("../scraper/boba_reviews.json")
drinks = pd.read_json("../scraper/boba_drinks.json")

drinks_new = drinks.rename(columns={"_id":"drinkId"})
reviews_new = reviews.copy()
drinks_new["drinkId"] = drinks_new["drinkId"].apply(lambda x: x["$oid"])
reviews_new["drinkId"] = reviews_new["drinkId"].apply(lambda x: x["$oid"])

In [37]:
# Create a sentiment score column
drink_reviews = pd.merge(drinks_new, reviews_new, on="drinkId", how="inner")
drink_reviews = drink_reviews[["shopId", "shopName", "drinkId", "drinkName", "reviewRating", "reviewMessage"]]
drink_reviews["shopId"] = drink_reviews["shopId"].apply(lambda x: x["$oid"])
drink_reviews["sentimentScore"] = drink_reviews["reviewRating"].apply(lambda x: 1 if x >= 3 else 0)

In [4]:
# Get drink Ids
drinkIds = list(set(drink_reviews["drinkId"]))
drinkIds[:5]

['1c3add94009711ed8051d0ab',
 '22d61b21009711eda4e5d0ab',
 'd80b4384009611edba0ad0ab',
 'e7852d5e009611ed8280d0ab',
 '03da708b009711eda7c7d0ab']

In [5]:
grouped_drinks = drink_reviews.groupby(by="drinkId").count()
grouped_drinks

Unnamed: 0_level_0,shopId,shopName,drinkName,reviewRating,reviewMessage,sentimentScore
drinkId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00b672a4009711edb617d0ab,6,6,6,6,6,6
027dc4dd009711edbc2cd0ab,5,5,5,5,5,5
03da708b009711eda7c7d0ab,3,3,3,3,3,3
04da09d3009711ed956ed0ab,2,2,2,2,2,2
05bef900009711edad46d0ab,1,1,1,1,1,1
...,...,...,...,...,...,...
ee27e26d009611ed8eafd0ab,1,1,1,1,1,1
f2535988009611ed87d2d0ab,15,15,15,15,15,15
f63cd502009611ed91acd0ab,15,15,15,15,15,15
fab05d1c009611edb20ed0ab,10,10,10,10,10,10


### Test for Most Positive/Negative Drink Reviews

In [6]:
# Aggregate the positive and negative review text
def aggregate_reviews(sample_drink):
    positive_reviews_text, negative_reviews_text = """""", """"""

    for i, rows in sample_drink.iterrows():
        addString = ""
        if i > 0:
            addString = " "
        if rows["sentimentScore"]:
            addString += rows["reviewMessage"]#.lower()
            positive_reviews_text += addString
        else:
            addString += rows["reviewMessage"]#.lower()
            negative_reviews_text += addString

    return positive_reviews_text, negative_reviews_text

In [7]:
# Find phrases that contain drink of the drink name
def find_drink_phrases(sample_drink, text):
    drink_name = list(set(sample_drink["drinkName"].values))[0]#.lower()
    part_drink_name = " ".join(drink_name.split(" ")[1:])

    nlp = spacy.load("en_core_web_sm")

    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    phrases = [drink_name, part_drink_name, "drink", "drink"]
    patterns = [nlp(text) for text in phrases]
    phrase_matcher.add("Key", None, *patterns)

    doc = nlp(text)

    drink_related_sents = set()
    for sent in doc.sents:
        for match_id, start, end in phrase_matcher(nlp(sent.text)):
            if nlp.vocab.strings[match_id] in ["Key"]:
                drink_related_sents.add(sent.text)

    return drink_related_sents

In [27]:
# Sort the phrases by emotion
def sort_scores(sents, emotions):
    sia = SentimentIntensityAnalyzer()
    maxHeap = []
    emotion1_avgScore = 0
    emotion2_avgScore = 0

    for i, sent in enumerate(sents):
        score1 = te.get_emotion(sent)[emotions[0]] 
        emotion1_avgScore += score1
        score2 = te.get_emotion(sent)[emotions[1]] 
        emotion2_avgScore += score2
        # score2 = sia.polarity_scores(sent)['compound'] 
        #score2 = -1 * sia.polarity_scores(sent)['compound']
        avgScore = float((score1 + score2) / -2.0)
        heapq.heappush(maxHeap, [avgScore, sent])

    maxHeap = [sent for score, sent in maxHeap]
    if len(sents):
        emotion1_avgScore = float(emotion1_avgScore / len(sents))
        emotion2_avgScore = float(emotion2_avgScore / len(sents))
    else:
        emotion1_avgScore = emotion2_avgScore = 0

    return maxHeap, {emotions[0]: emotion1_avgScore, emotions[1]: emotion2_avgScore}

In [28]:
def retrieve_emotional_sents():
    final_sents = []
    # drinkIds = ["f63cd502009611ed91acd0ab", "027dc4dd009711edbc2cd0ab"]
    for drinkId in drinkIds:
        sample_drink = drink_reviews[drink_reviews["drinkId"] == drinkId]
        positive_reviews_text, negative_reviews_text = aggregate_reviews(sample_drink)
        if not(positive_reviews_text or negative_reviews_text):
            final_sents.append({"drinkId": {"$oid": drinkId}, "positive_sents": [], "negative_sents": []})
        else:
            positive_drink_related_sents = find_drink_phrases(sample_drink, positive_reviews_text)
            negative_drink_related_sents = find_drink_phrases(sample_drink, negative_reviews_text)
            top3_positive_sents, pos_avgScore = sort_scores(positive_drink_related_sents, ["Happy", "Surprise"])[:3]
            top3_negative_sents, neg_avgScore = sort_scores(negative_drink_related_sents, ["Sad", "Angry"])[:3]

            happyScore = pos_avgScore["Happy"]
            surprisedScore = pos_avgScore["Surprise"] 
            sadScore = neg_avgScore["Sad"] 
            angryScore = neg_avgScore["Angry"]
            totalScore = happyScore + surprisedScore + sadScore + angryScore

            if totalScore:
                happyScore = happyScore / totalScore
                surprisedScore = surprisedScore / totalScore
                sadScore = sadScore / totalScore
                angryScore = angryScore / totalScore

            final_sents.append({
                "drinkId": {"$oid": drinkId}, 
                "positive_sents": top3_positive_sents, 
                "negative_sents": top3_negative_sents, 
                "happyScore": happyScore,
                "surprisedScore": surprisedScore,
                "sadScore": sadScore,
                "angryScore": angryScore
            })
        print(drinkId)
    return final_sents

In [12]:
emotional_sents = retrieve_emotional_sents()

1c3add94009711ed8051d0ab
22d61b21009711eda4e5d0ab
d80b4384009611edba0ad0ab
e7852d5e009611ed8280d0ab
03da708b009711eda7c7d0ab
07b8d62b009711ed8acfd0ab
c394154c009611edbccfd0ab
fd574866009611edbb8fd0ab
28c0d16d009711ed90dcd0ab
c4da324f009611ed8772d0ab
367bd2aa009711edaba4d0ab
1e658596009711ed91d1d0ab
37487205009711edb53dd0ab
31e6d272009711ed8e88d0ab
148fce7c009711edb1e6d0ab
0ca9c381009711edb0fed0ab
cfe4765f009611edaf23d0ab
05bef900009711edad46d0ab
becb682e009611edb267d0ab
00b672a4009711edb617d0ab
191f45d6009711ed969fd0ab
2c711a80009711ed9fa6d0ab
577fd262009711edb599d0ab
5859e92d009711edbf62d0ab
c2121bcb009611ed98b0d0ab
37e32e73009711edadb5d0ab
257da2e9009711ed825ed0ab
027dc4dd009711edbc2cd0ab
bddd2ccd009611ed9fb7d0ab
332dd2c1009711edb9a2d0ab
dda798ff009611edb2bad0ab
ee27e26d009611ed8eafd0ab
12503f26009711edb2a0d0ab
cb5bacc9009611ed9f07d0ab
48e7a47d009711ed90bfd0ab
136ba657009711ed9ff8d0ab
1d775547009711ed9424d0ab
346e6dd5009711edaee7d0ab
e3015bdf009611edbd3cd0ab
070c9421009711ed96a9d0ab


In [177]:
emotional_sents

[{'drinkId': {'$oid': 'f63cd502009611ed91acd0ab'},
  'positive_sents': ['My favorite tea here is rose jasmine tea.',
   'My favorites include the matcha latte and the rose jasmine tea.',
   "It is the perfect drink for a hot Summer's day."],
  'negative_sents': ['Avoid this place if you wanna drink any type of milk bubble tea.',
   'If I wanted to drink milk, I would just buy it at the supermarket for $4 not $7.'],
  'happyScore': 0.5915057915057915,
  'surprisedScore': 0.07722007722007722,
  'sadScore': 0.3312741312741313,
  'angryScore': 0.0},
 {'drinkId': {'$oid': '027dc4dd009711edbc2cd0ab'},
  'positive_sents': ['I really love this drink as the rose flavour complimented the Jasmine tea surprisingly well.',
   'The brown sugar ginger tea was so spicy and delicious.',
   'This drink is 5/5.'],
  'negative_sents': [],
  'happyScore': 1.0,
  'surprisedScore': 0.0,
  'sadScore': 0.0,
  'angryScore': 0.0}]

In [None]:
emotional_sents

### Test for Most Frequently Used Words

In [51]:
# Prep all stop words
# Get Drink names as tokens to use as stop words
drink_name_tokens = list()
for drink_name in list(set(drinks["drinkName"].values)):
    for token in drink_name.split(" "):
        drink_name_tokens.append(token.lower())

# Get Shop names as tokens to use as stop words
shop_name_tokens = list()
for shop_name in list(set(drinks["shopName"].values)):
    for token in shop_name.split(" "):
        shop_name_tokens.append(token.lower())

punctuation_strings = string.punctuation+("’")
spacy_stop_words = list(STOP_WORDS)
nlp = English()
other_words = ["shop", "drink", "place", "taste", "ordered", "order"]
all_stop_words = spacy_stop_words + shop_name_tokens + drink_name_tokens + other_words + ["..", "...", "''"]

In [52]:
# Preprocess review message words
def clean_text(sents):
    # tokenized_sentences = nltk.sent_tokenize(text)
    words = list()
    for tokenized_sentence in sents:
        lemmatizer = WordNetLemmatizer()
        # lemmatized_words = [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(tokenized_sentence)]
        words += [word.lower() for word in nltk.word_tokenize(tokenized_sentence) 
                    if word.lower() not in all_stop_words and 
                    word.lower() not in punctuation_strings and 
                    not nlp.vocab[word].is_stop and
                    not word.lower().isdigit()]
        # words += [word.lower() for word in nltk.word_tokenize(tokenized_sentence)]
    return list(set(words))

In [53]:
def retrieve_freq_words():
    final_words = list()
    for drinkId in drinkIds:
        sample_drink = drink_reviews[drink_reviews["drinkId"] == drinkId]
        positive_reviews_text, negative_reviews_text = aggregate_reviews(sample_drink)
        positive_drink_related_sents = find_drink_phrases(sample_drink, positive_reviews_text)
        negative_drink_related_sents = find_drink_phrases(sample_drink, negative_reviews_text)

        positive_words = clean_text(positive_drink_related_sents)
        negative_words = clean_text(negative_drink_related_sents)

        # sorted_positive_words, posScore = sort_scores(positive_words, ["Happy", "Surprise"])
        # sorted_negative_words, negScore = sort_scores(negative_words, ["Sad", "Angry"])

        most_common_positive_words = nltk.FreqDist(positive_words).most_common()[:5]
        most_common_negative_words = nltk.FreqDist(negative_words).most_common()[:5]
        most_common_positive_words = [word for word, i in most_common_positive_words if word not in most_common_negative_words]
        most_common_negative_words = [word for word, i in most_common_negative_words if word not in most_common_positive_words]

        final_words.append({"drinkId": {"$oid": drinkId}, "positive_words": most_common_positive_words, "negative_words": most_common_negative_words})

    return final_words

In [54]:
freq_words = retrieve_freq_words()

### Reinsert into JSON

In [69]:
emotional_sents_df = pd.DataFrame(emotional_sents)
freq_words_df = pd.DataFrame(freq_words)
drinks_updated = drinks.copy()

emotional_sents_df = emotional_sents_df.rename(columns={"drinkId":"_id"})
emotional_sents_df["_id"] = emotional_sents_df["_id"].apply(lambda x: str(x["$oid"]))
freq_words_df = freq_words_df.rename(columns={"drinkId":"_id"})
freq_words_df["_id"] = freq_words_df["_id"].apply(lambda x: str(x["$oid"]))
drinks_updated["_id"] = drinks_updated["_id"].apply(lambda x: str(x["$oid"]))


In [75]:
drinks_updated = pd.merge(drinks_updated, emotional_sents_df, on="_id", how="inner")
drinks_updated = pd.merge(drinks_updated, freq_words_df, on="_id", how="inner")

In [81]:
drinks_updated["_id"] = drinks_updated["_id"].apply(lambda x: {"$oid": x})
drinks_updated = drinks_updated.rename(columns={
    "positive_sents": "positiveSents",
    "negative_sents": "negativeSents",
    "positive_words": "positiveWords",
    "negative_words": "negativeWords"
})

In [82]:
drinks_updated.to_json("boba_drinks_nlp.json", orient='records')

In [83]:
drinks_updated.head()

Unnamed: 0,_id,drinkName,drinkImage,avgRating,reviews,shopId,shopName,positiveSents,negativeSents,happyScore,surprisedScore,sadScore,angryScore,positiveWords,negativeWords
0,{'$oid': {'$oid': 'b50ef315009611ed9159d0ab'}},Roasted Milk Tea,https://s3-media0.fl.yelpcdn.com/bphoto/HrWnzP...,3.6,"[{'$oid': 'b5e3d071009611eda2b8d0ab'}, {'$oid'...",{'$oid': 'b07d5cb4009611edbd66d0ab'},Chatime - Dundas,"[10 points = 1 free regular drink. :-), It's t...",[but now i have less confidence with their dri...,0.477793,0.331034,0.191172,0.0,"[addicted, think, customize, followed, especia...","[flavour, confidence, purchased, marble, love]"
1,{'$oid': {'$oid': 'b9c251dd009611eda86dd0ab'}},Brown Sugar Pearl Milk Tea,https://s3-media0.fl.yelpcdn.com/bphoto/dYm7xx...,4.153846,"[{'$oid': 'ba600469009611edad9ad0ab'}, {'$oid'...",{'$oid': 'b07d5cb4009611edbd66d0ab'},Chatime - Dundas,"[Also, get a Vicinity points card and you can ...",[],0.464516,0.535484,0.0,0.0,"[crowded, minutes, fairy, compromising, wait]",[]
2,{'$oid': {'$oid': 'bcbfc7dc009611edba90d0ab'}},Matcha Tea Latte,https://s3-media0.fl.yelpcdn.com/bphoto/_DxKWs...,4.666667,"[{'$oid': 'bd4491d3009611ed90e4d0ab'}, {'$oid'...",{'$oid': 'b07d5cb4009611edbd66d0ab'},Chatime - Dundas,"[You receive one point for every drink, and af...",[],0.434555,0.565445,0.0,0.0,"[served, option, 4.70, return, pretty]",[]
3,{'$oid': {'$oid': 'bddd2ccd009611ed9fb7d0ab'}},Hazelnut Chocolate Milk Tea,https://s3-media0.fl.yelpcdn.com/bphoto/VynawK...,4.5,"[{'$oid': 'be72e370009611edad53d0ab'}, {'$oid'...",{'$oid': 'b07d5cb4009611edbd66d0ab'},Chatime - Dundas,[- Hazelnut chocolate milk tea with pudding ($...,[],0.428571,0.571429,0.0,0.0,"[lineup, option, suit, interesting, 4.20]",[]
4,{'$oid': {'$oid': 'becb682e009611edb267d0ab'}},Caramel Milk Tea,https://s3-media0.fl.yelpcdn.com/bphoto/T1bj_W...,4.2,"[{'$oid': 'bf4f76f9009611edb909d0ab'}, {'$oid'...",{'$oid': 'b07d5cb4009611edbd66d0ab'},Chatime - Dundas,"[Caramel milk tea was AMAZING., My favourites ...",[],0.666667,0.333333,0.0,0.0,"[packed, wait, expensive, frequent, lines]",[]
