### Initialization

In [0]:
# Install third party modules
!pip install flair

# Download spaCy models
!python -m spacy download en_core_web_md

import json

import en_core_web_md
import pandas as pd
from IPython.display import display

# Seed shared by every cell that samples or shuffles, so that reruns are reproducible
RANDOM_SEED = 42

# pandas display settings, applied in the listed order (option name, value)
for _option_name, _option_value in (
    ('display.max_columns', 10),
    ('max_colwidth', 1000),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

# spaCy pipeline, loaded once here and reused by later cells
SPACY = en_core_web_md.load()

# Load Yelp reviews from a JSON Lines file (one JSON object per line).
# NOTE(review): JSON is normally UTF-8. 'latin-1' never fails to decode, but it would
# garble non-ASCII characters if the file is actually UTF-8 -- confirm how the file was written.
with open("data/reviewCleaned.json", 'r', encoding='latin-1') as f:
    reviews = [json.loads(line) for line in f]

# Keep only the fields used by the analysis, in a fixed column order.
column_order = ['business_id', 'text', 'stars']
# infer_objects() returns a new DataFrame instead of converting in place, so its result
# must be assigned; calling it as a bare statement leaves the original frame unchanged.
YELP_REVIEWS = pd.DataFrame.from_records(reviews, columns=column_order).infer_objects()

print(f"\nFinished loading {len(YELP_REVIEWS)} to pandas DataFrame.")
print("\nSample records:")

display(YELP_REVIEWS.head())

print("\nPreliminary analysis:")
# Bare expression: rendered as the cell's rich output when it is the cell's last statement.
YELP_REVIEWS.describe()

# Global variables (reusable in other code cells)
# 1. SPACY: spaCy model for linguistic analysis
# 2. RANDOM_SEED: random seed for random generation
# 3. YELP_REVIEWS: pandas DataFrame containing Yelp reviews

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/16/22/8fc8e5978ec05b710216735ca47415700e83f304dec7e4281d61cefb6831/flair-0.4.4-py3-none-any.whl (193kB)
[K     |█▊                              | 10kB 21.2MB/s eta 0:00:01[K     |███▍                            | 20kB 2.2MB/s eta 0:00:01[K     |█████                           | 30kB 3.2MB/s eta 0:00:01[K     |██████▊                         | 40kB 2.1MB/s eta 0:00:01[K     |████████▌                       | 51kB 2.6MB/s eta 0:00:01[K     |██████████▏                     | 61kB 3.1MB/s eta 0:00:01[K     |███████████▉                    | 71kB 3.6MB/s eta 0:00:01[K     |█████████████▌                  | 81kB 4.0MB/s eta 0:00:01[K     |███████████████▏                | 92kB 4.5MB/s eta 0:00:01[K     |█████████████████               | 102kB 3.5MB/s eta 0:00:01[K     |██████████████████▋             | 112kB 3.5MB/s eta 0:00:01[K     |████████████████████▎           | 122kB 3.5MB/s eta 0:00:0

Collecting en_core_web_md==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4MB)
[K     |████████████████████████████████| 95.4MB 1.1MB/s 
[?25hBuilding wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.1.0-cp36-none-any.whl size=97126237 sha256=61679df97bca6902eb889dcbe60bb8efa46c5ccd8d2ef4ac74e3d9e165d5379d
  Stored in directory: /tmp/pip-ephem-wheel-cache-21dm39p7/wheels/c1/2c/5f/fd7f3ec336bf97b0809c86264d2831c5dfb00fc2e239d1bb01
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')

Finished loading 15222 to pandas DataFrame.

Sample records:


Unnamed: 0,business_id,text,stars
0,ZBE-H_aUlicix_9vUGQPIQ,"We had my Mother's Birthday Party here on 10/29/16. What a Great time we all had. The food, music and waiters were Great!!! Thanks Lyles!!!",5.0
1,e-YnECeZNt8ngm0tu4X9mQ,"Good Korean grill near Eaton Centre. The marinate is good. We got beef, ox liver, salmon, fish fillet, chicken, pork, pork belly. The fish fillet was bland and liver was meh. Salmon and chicken was really flavourable. Such a fun place to eat at for a date or group of friends. Even alone. No judgments here. \nThe staff is attentive, nice and considerate. Bigger groups will most likely be seated on the second floor which is way bigger.\nCaution: will smell like BBQ grill after.",4.0
2,j7HO1YeMQGYo3KibMXZ5vg,"Was recommended to try this place by few people and today was my first time here. All I can say is, I am coming back very soon.\n\nSERVICE\nWasn't sure if the guy was the owner but he was friendly and talked story while we waited for our food. Loved it!! Food came out within 10 min. \n\nFOOD\nTried hamburger steak and it was so delicious. Gravy/sauce they put on the hamburger steak was perfect! Also came with onion rings on top which I love. Chicken katsu was amazing! Chicken katsu here is crunchy and surprisingly has a flavor by itself that you really don't need a sauce for it. Best chicken katsu I had. \n\nOVERALL\nIt was a journey to get to this place as it took about 30min from my house but the service and food here made it worth the drive. I also love how they had a poster of Keali'i Reichel. (They had other posters but Keali'i Reichel happens to be my favorite). Place is clean, service is fast and friendly and food is delicious. What more could you ask for?",5.0
3,7e3PZzUpG5FYOTGt3O3ePA,"Ambience: Would not expect something this nice at Cannery Hotel but it is the nicest looking restaurant there. More for couples than group gatherings.\n\nService: The ambience & food make up for this, which unfortunately for us, the service has been terrible. We have come fairly close to restaurant closing both times (within the hour), but they do close very early for Vegas. The staff makes it VERY clear that they want to go home right from the start in hurrying orders and are more aggressive as time goes on. Unfortunate.\n\nFood: Very good. A little salty on some items during our first visit but good overall and again, warrants the overall 3 stars. Steak. Scallops wrapped in bacon. Calamari. Cobb salad. etc.",3.0
4,vuHzLZ7nAeT-EiecOkS5Og,"Absolutely the WORST pool company that I have EVER had to deal with. The customer service is horrible. After leaving many messages over the course of a few weeks I was only able to contact them when I called them AGAIN. I asked to speak with the actual pool tech who initially came to my house. The RUDE lady on the phone told me that she was more than capable to answer my questions - about a pump that SHE HAS NOT SEEN, and about a conversation I had with the tech THAT SHE DID NOT HEAR. \n\nI was assigned to them by my home warranty company, and I will be filing a serious complaint with them and the BBB. I was told to take the cash out option from the warranty company for the part and then they would do the work and I could just pay them directly. After I received the cash out and called to schedule the appointment I was told that I need to replace the entire pool pump system and that would cost an additional $400 and that there was an electrical problem and that it would cost...",1.0



Preliminary analysis:


Unnamed: 0,stars
count,15222.0
mean,3.646367
std,1.455229
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


### Run sentiment analysis for each review segment

In [0]:
# Review segmentation: split every review into sentences with the spaCy pipeline.
from tqdm import tqdm

# SPACY.pipe() streams the texts through the pipeline in batches and yields the Doc
# objects in input order, so the results line up with the DataFrame rows. This avoids
# one pipeline call per row and avoids materialising every row up front.
review_docs = tqdm(SPACY.pipe(YELP_REVIEWS['text']), total=len(YELP_REVIEWS))

# Build the whole column first and assign it once. Each cell holds the list of
# whitespace-stripped sentence strings for that review.
YELP_REVIEWS['review_segment'] = pd.Series(
    [[sent.text.strip() for sent in doc.sents] for doc in review_docs],
    index=YELP_REVIEWS.index,
    dtype=object,
)

100%|██████████| 15222/15222 [07:00<00:00, 36.23it/s]


In [0]:
# Rule-based sentiment analysis with TextBlob
from textblob import TextBlob
from tqdm import tqdm

# Start from an empty placeholder column, then fill it one review at a time.
YELP_REVIEWS['TextBlob'] = ""
for row_label, review in tqdm(list(YELP_REVIEWS.iterrows())):
    # One (polarity, subjectivity) result per sentence of the review
    segment_scores = [TextBlob(segment).sentiment for segment in review['review_segment']]
    # Store plain tuples, in the same order as the review's segments
    YELP_REVIEWS.at[row_label, 'TextBlob'] = [
        (polarity, subjectivity) for polarity, subjectivity in segment_scores
    ]

100%|██████████| 15222/15222 [01:19<00:00, 191.71it/s]


In [0]:
# Neural network-based sentiment analysis with Flair
%tensorflow_version 1.x
import flair
from tqdm import tqdm

# Load flair's pre-trained 'en-sentiment' text classifier once; get_sentiment() reuses it.
# Per the recorded output of this cell, an uncached run first downloads a ~1.5 GB
# model file (imdb-v0.4.pt) into the flair cache directory.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

def get_sentiment(sentences, min_confidence=0.7):
    """Classify each sentence with the flair sentiment model.

    Args:
        sentences: Iterable of sentence strings to classify.
        min_confidence: Minimum score the top label must reach to be reported.
            Predictions below this score are reported as "NEUTRAL".

    Returns:
        A list with one entry per input sentence, in input order. Each entry is
        the value of the top predicted label, or "NEUTRAL" when the score is
        below ``min_confidence`` or the classifier attached no label at all.
    """
    inputs = [flair.data.Sentence(s) for s in sentences]
    if not inputs:
        # Nothing to classify; do not call the model with an empty batch.
        return []

    # predict() annotates the Sentence objects in place with their labels.
    flair_sentiment.predict(inputs)

    results = []
    for sentence in inputs:
        if not sentence.labels:
            # A sentence left without any label (e.g. an empty segment) would
            # otherwise raise IndexError on labels[0].
            results.append("NEUTRAL")
            continue
        top_label = sentence.labels[0]
        # Use the public value/score accessors rather than the private fields.
        results.append(top_label.value if top_label.score >= min_confidence else "NEUTRAL")
    return results

# Start from an empty placeholder column, then store one list of labels per review.
YELP_REVIEWS['Flair'] = ""
for row_label, review in tqdm(list(YELP_REVIEWS.iterrows())):
    # One label per sentence of this review, in segment order
    segment_labels = get_sentiment(review['review_segment'])
    YELP_REVIEWS.at[row_label, 'Flair'] = segment_labels

2019-11-02 15:53:42,465 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4/classy-imdb-en-rnn-cuda%3A0/imdb-v0.4.pt not found in cache, downloading to /tmp/tmpi3tczsv2


100%|██████████| 1501979561/1501979561 [01:04<00:00, 23200816.14B/s]

2019-11-02 15:54:47,745 copying /tmp/tmpi3tczsv2 to cache at /root/.flair/models/imdb-v0.4.pt





2019-11-02 15:54:53,253 removing temp file /tmp/tmpi3tczsv2
2019-11-02 15:54:53,440 loading file /root/.flair/models/imdb-v0.4.pt


100%|██████████| 15222/15222 [1:05:27<00:00,  3.54it/s]


In [0]:
# Display analysis results
import textwrap

from tabulate import tabulate

# Column titles are identical for every review table, so they are defined once.
headers = ["segment", "polarity\n(TextBlob NLP)", "subjectivity\n(TextBlob NLP)", "sentiment\n(Flair NLP)"]

# Print a fixed (seeded) sample of 10 reviews together with their per-segment analysis.
for review_id, review in YELP_REVIEWS.sample(10, random_state=RANDOM_SEED).iterrows():
    print(f"Review #{review_id} | Business ID: {review['business_id']} | Star rating: {review['stars']}")
    print(f"\nFull review:\n{textwrap.fill(review['text'], 120)}")

    print("\nReview analysis:")
    # One table row per segment: wrapped text, TextBlob scores, Flair label
    table_rows = []
    for segment, (polarity, subjectivity), label in zip(
        review['review_segment'], review['TextBlob'], review['Flair']
    ):
        table_rows.append([textwrap.fill(segment, 120), polarity, subjectivity, label])
    print(tabulate(table_rows, headers=headers, tablefmt='grid', numalign="left"))
    print('=' * 180)

Review #8138 | Business ID: xJ_L2sJN1zk3VDpZELrV_Q | Star rating: 4.0

Full review:
Came here on Christmas Eve and it wasn't crowded at all. Prices are reasonable for brunch. And food was good portion.

Review analysis:
+----------------------------------------------------------+------------------+------------------+---------------+
| segment                                                  | polarity         | subjectivity     | sentiment     |
|                                                          | (TextBlob NLP)   | (TextBlob NLP)   | (Flair NLP)   |
| Came here on Christmas Eve and it wasn't crowded at all. | 0                | 0                | NEGATIVE      |
+----------------------------------------------------------+------------------+------------------+---------------+
| Prices are reasonable for brunch.                        | 0.2              | 0.6              | NEGATIVE      |
+----------------------------------------------------------+------------------+-----------