### Initialization

In [0]:
# Download spaCy models
!python -m spacy download en_core_web_md
!pip install spacy-langdetect

import json

import en_core_web_md
import pandas as pd
from IPython.display import display
from spacy_langdetect import LanguageDetector

# pandas display settings: show at most 10 columns, and allow cells / total
# output width of up to 1000 characters so long review texts stay readable
for option_name, option_value in (
    ('display.max_columns', 10),
    ('max_colwidth', 1000),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Seed value shared across cells, for reproducibility
RANDOM_SEED = 42

# Build the shared spaCy pipeline: load the medium English model, then
# register a LanguageDetector component as the final pipeline stage
SPACY = en_core_web_md.load()
SPACY.add_pipe(
    LanguageDetector(),
    name='language_detector',
    last=True,
)

# Load Yelp reviews from a JSON-lines file (one JSON object per line).
# NOTE(review): the file is decoded as latin-1, which never raises but would
# garble any raw multi-byte UTF-8 characters -- confirm the file's real encoding.
with open("data/reviewSelected100.json", 'r', encoding='latin-1') as f:
    # Whitespace-only lines (e.g. a trailing blank line) are skipped because
    # json.loads would raise on them.
    reviews = [json.loads(line) for line in f if line.strip()]

# Fix the column order explicitly; from_records keeps only these columns.
column_order = ['review_id', 'business_id', 'date', 'user_id', 'text', 'stars', 'useful', 'funny', 'cool']
YELP_REVIEWS = pd.DataFrame.from_records(reviews, columns=column_order)
# infer_objects() returns a new DataFrame instead of converting in place, so
# the result has to be assigned back for the dtype inference to take effect.
YELP_REVIEWS = YELP_REVIEWS.infer_objects()

print(f"\nFinished loading {len(YELP_REVIEWS)} records to pandas DataFrame.")
print("\nSample records:")

display(YELP_REVIEWS.head())

print("\nPreliminary analysis:")
# Last expression of the cell: rendered as the cell's rich output.
YELP_REVIEWS.describe()

# Global variables (reusable in other code cells)
# 1. SPACY: spaCy model for linguistic analysis
# 2. RANDOM_SEED: random seed for random generation
# 3. YELP_REVIEWS: pandas DataFrame containing Yelp reviews

Collecting en_core_web_md==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4MB)
[K     |████████████████████████████████| 95.4MB 1.4MB/s 
[?25hBuilding wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.1.0-cp36-none-any.whl size=97126237 sha256=8494154bff67ae3bec220df6b0ebff0016c16356440ed324ed6f510ef549b5c3
  Stored in directory: /tmp/pip-ephem-wheel-cache-k17uw82v/wheels/c1/2c/5f/fd7f3ec336bf97b0809c86264d2831c5dfb00fc2e239d1bb01
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
Collecting spacy-langdetect
  Downloading https://files.pythonhosted.org/packages/29/70/72dad19abe81ca8e85ff9

Unnamed: 0,review_id,business_id,date,user_id,text,stars,useful,funny,cool
0,8aoJJdKEO3ypoZNszpPu7Q,ZBE-H_aUlicix_9vUGQPIQ,2016-11-09 20:07:25,bGgAL09pxLnV_FFgR4ZADg,"We had my Mother's Birthday Party here on 10/29/16. What a Great time we all had. The food, music and waiters were Great!!! Thanks Lyles!!!",5.0,0,0,0
1,J5NOCLdhuhor7USRhtYZ8w,e-YnECeZNt8ngm0tu4X9mQ,2015-12-05 05:06:43,pFCb-1j6oI3TDjr26h2cJQ,"Good Korean grill near Eaton Centre. The marinate is good. We got beef, ox liver, salmon, fish fillet, chicken, pork, pork belly. The fish fillet was bland and liver was meh. Salmon and chicken was really flavourable. Such a fun place to eat at for a date or group of friends. Even alone. No judgments here. \nThe staff is attentive, nice and considerate. Bigger groups will most likely be seated on the second floor which is way bigger.\nCaution: will smell like BBQ grill after.",4.0,0,0,0
2,PXiLWAYRt3xnHaJ8MB4rzw,j7HO1YeMQGYo3KibMXZ5vg,2014-10-11 05:16:15,mEzc6LeTNiQgIVsq3poMbg,"Was recommended to try this place by few people and today was my first time here. All I can say is, I am coming back very soon.\n\nSERVICE\nWasn't sure if the guy was the owner but he was friendly and talked story while we waited for our food. Loved it!! Food came out within 10 min. \n\nFOOD\nTried hamburger steak and it was so delicious. Gravy/sauce they put on the hamburger steak was perfect! Also came with onion rings on top which I love. Chicken katsu was amazing! Chicken katsu here is crunchy and surprisingly has a flavor by itself that you really don't need a sauce for it. Best chicken katsu I had. \n\nOVERALL\nIt was a journey to get to this place as it took about 30min from my house but the service and food here made it worth the drive. I also love how they had a poster of Keali'i Reichel. (They had other posters but Keali'i Reichel happens to be my favorite). Place is clean, service is fast and friendly and food is delicious. What more could you ask for?",5.0,2,1,3
3,VrLarvxZYJm74yAqtpe9PQ,7e3PZzUpG5FYOTGt3O3ePA,2016-07-25 03:45:26,o-zUN2WEZgjQS7jnNsec0g,"Ambience: Would not expect something this nice at Cannery Hotel but it is the nicest looking restaurant there. More for couples than group gatherings.\n\nService: The ambience & food make up for this, which unfortunately for us, the service has been terrible. We have come fairly close to restaurant closing both times (within the hour), but they do close very early for Vegas. The staff makes it VERY clear that they want to go home right from the start in hurrying orders and are more aggressive as time goes on. Unfortunate.\n\nFood: Very good. A little salty on some items during our first visit but good overall and again, warrants the overall 3 stars. Steak. Scallops wrapped in bacon. Calamari. Cobb salad. etc.",3.0,0,0,0
4,C1CUpidlVFprUCkApqzCmA,vuHzLZ7nAeT-EiecOkS5Og,2016-04-11 18:49:11,Wlx0iBXJvk4x0EeOt2Bz1Q,"Absolutely the WORST pool company that I have EVER had to deal with. The customer service is horrible. After leaving many messages over the course of a few weeks I was only able to contact them when I called them AGAIN. I asked to speak with the actual pool tech who initially came to my house. The RUDE lady on the phone told me that she was more than capable to answer my questions - about a pump that SHE HAS NOT SEEN, and about a conversation I had with the tech THAT SHE DID NOT HEAR. \n\nI was assigned to them by my home warranty company, and I will be filing a serious complaint with them and the BBB. I was told to take the cash out option from the warranty company for the part and then they would do the work and I could just pay them directly. After I received the cash out and called to schedule the appointment I was told that I need to replace the entire pool pump system and that would cost an additional $400 and that there was an electrical problem and that it would cost...",1.0,11,0,3



Preliminary analysis:


Unnamed: 0,stars,useful,funny,cool
count,15300.0,15300.0,15300.0,15300.0
mean,3.646601,1.292745,0.43,0.514575
std,1.45513,3.241261,1.866658,2.33405
min,1.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0
75%,5.0,1.0,0.0,0.0
max,5.0,191.0,122.0,180.0


### Clean Data

In [0]:
import json
from collections import Counter

from tqdm import tqdm

# Drop unused columns and remove duplicate records.
# The chained, non-mutating calls build a new frame, so YELP_REVIEWS is left
# untouched without needing an explicit copy or inplace=True.
unused_columns = ['review_id', 'date', 'user_id', 'useful', 'funny', 'cool']
deduplicated_reviews = YELP_REVIEWS.drop(columns=unused_columns).drop_duplicates()

# Remove reviews written in a foreign language (i.e. not English).
# Every text is run through the SPACY pipeline and the detected language code
# is read from doc._.language (presumably populated by the language detector
# component registered on SPACY -- verify against the initialization cell).
print("Running language detection...")
detected_languages = [
    SPACY(text)._.language['language']
    for text in tqdm(deduplicated_reviews['text'])
]
language_statistics = Counter(detected_languages)
print("Language statistics:", language_statistics)

# The mask is positionally aligned with deduplicated_reviews because the
# language codes were collected in row order; this replaces a temporary
# 'is_english' column that was filled row by row.
is_english = [language == 'en' for language in detected_languages]
CLEANED_REVIEWS = deduplicated_reviews.loc[is_english]

# Write cleaned data to file, one JSON object per line.
# json.dumps escapes non-ASCII characters by default, so every line written is
# plain ASCII and the latin-1 encoding cannot fail on it.
print("\nWriting cleaned records to file...")
output_columns = ['business_id', 'text', 'stars']
with open("data/reviewCleaned.json", 'w', encoding='latin-1') as f:
    records = CLEANED_REVIEWS[output_columns].itertuples(index=False, name=None)
    # total= is passed explicitly because the itertuples iterator has no len()
    for business_id, text, stars in tqdm(records, total=len(CLEANED_REVIEWS)):
        json_text = json.dumps(
            {
                'business_id': business_id,
                'text': text,
                'stars': stars
            }
        )
        f.write(json_text + "\n")
print("\nDone.")

Running language detection...


100%|██████████| 15275/15275 [08:39<00:00, 29.40it/s]


Language statistics: Counter({'en': 15222, 'fr': 32, 'es': 8, 'UNKNOWN': 3, 'de': 3, 'cy': 2, 'tr': 2, 'af': 1, 'nl': 1, 'ro': 1})

Writing cleaned records to file...


100%|██████████| 15222/15222 [00:00<00:00, 28521.21it/s]


Done.



