In [None]:
import time
import pandas as pd
import numpy as np
import os
import json
import nltk
import string
from tqdm.autonotebook import tqdm
tqdm.pandas()
from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Load the review table from a local pickle; later cells read its `text`,
# `stars`, `business_id`, `user_id`, `funny`, `cool` and `useful` columns.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df_rev = pd.read_pickle('results/tor_reviews.p')

In [None]:
# Quick look at the first rows of the loaded reviews.
df_rev.head()

In [None]:
# Punctuation that the token-level helpers treat as non-words:
# string.punctuation with '.' and ',' left out, so those two still count as tokens.
# NOTE: this is a str, so `token in punctuations` is a substring test — the empty
# string and multi-character runs such as '<=' match as well.
punctuations = '!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~'

In [None]:
def tokenize(sent):
    """Lowercase ``sent`` and split it into tokens with NLTK's TweetTokenizer.

    Returns whatever ``TweetTokenizer().tokenize`` produces for the lowercased text.
    """
    tweet_tokenizer = TweetTokenizer()
    lowered = sent.lower()
    return tweet_tokenizer.tokenize(lowered)

In [None]:
def review_len(tokens):
    """Count the tokens that are not matched by ``punctuations``.

    '.' and ',' are not part of ``punctuations`` and are therefore counted.
    """
    return sum(1 for token in tokens if token not in punctuations)
    

In [27]:
def get_compound_sentiment_score(sent):
    """
    Return VADER's ``compound`` polarity score for the raw text ``sent``.

    The text is passed to the analyzer unchanged (no tokenizing or lowercasing
    here), and only the ``'compound'`` entry of ``polarity_scores`` is returned.

    If you use the VADER sentiment analysis tools, please cite:
    cite: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
    Tutorial: http://t-redactyl.io/blog/2017/04/using-vader-to-handle-sentiment-analysis-with-social-media-text.html
    """
    # Build the analyzer once and keep it on the function object. This function
    # is applied to every review, and constructing a fresh analyzer per call
    # repeats the same setup work each time.
    analyzer = getattr(get_compound_sentiment_score, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        get_compound_sentiment_score._analyzer = analyzer
    return analyzer.polarity_scores(sent)['compound']
    

In [18]:
def get_net_sentiment_score(tokens, threshold=0.33):
    """
    Return (positive tokens - negative tokens) / number of word tokens.

    Tokens matched by ``punctuations`` are dropped first, so they neither score
    nor count towards the denominator. Each remaining token is scored on its
    own with VADER: a compound score >= ``threshold`` counts as positive, and
    a score <= ``-threshold`` counts as negative.

    Parameters:
        tokens: sequence of token strings.
        threshold: positive cutoff on the per-token compound score (default 0.33).

    Returns:
        The net ratio, or 0 when no word tokens remain.

    If you use the VADER sentiment analysis tools, please cite:
    cite: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
    """
    words = [token for token in tokens if token not in punctuations]
    if not words:
        # Nothing to score; skip building the analyzer altogether.
        return 0

    # Reuse one analyzer across calls instead of constructing one per review.
    analyzer = getattr(get_net_sentiment_score, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        get_net_sentiment_score._analyzer = analyzer

    pos_count = 0
    neg_count = 0
    for word in words:
        # Score each token once; the result feeds both comparisons.
        compound = analyzer.polarity_scores(word)['compound']
        if compound >= threshold:
            pos_count += 1
        elif compound <= -threshold:
            neg_count += 1
    return (pos_count - neg_count) / len(words)

In [None]:
# Compare the two scoring methods on one sentence with increasing emphasis:
# plain, with exclamation marks, and with exclamation marks plus an emoticon.
sent1 = 'it is a bad time for me now'
sent2 = 'it is a bad time for me now!!!'
sent3 = 'it is a bad time for me now!!! :('

# Compound score is computed on the raw sentence.
print(list(map(get_compound_sentiment_score, [sent1, sent2, sent3])))
# Net score is computed on whitespace-split tokens here, not on tokenize() output.
print(list(map(get_net_sentiment_score, [sent1.split(' '), sent2.split(' '), sent3.split(' ')])))

In [19]:
def get_punc_count(tokens):
    """Return the fraction of ``tokens`` matched by ``punctuations``.

    Despite the name, the result is a ratio (matches / len(tokens)) rather
    than a raw count. An empty token list yields 0.
    """
    if len(tokens) == 0:
        return 0
    matches = sum(1 for token in tokens if token in punctuations)
    return matches / len(tokens)

In [20]:
def get_avg_word_len(tokens):
    """Return the mean character length of the tokens not matched by ``punctuations``.

    Returns 0 when no such tokens remain.
    """
    word_lengths = [len(token) for token in tokens if token not in punctuations]
    if not word_lengths:
        return 0
    return sum(word_lengths) / len(word_lengths)

In [28]:
# Tokenize once; every token-based feature below reads `review_tokens`.
df_rev['review_tokens'] = df_rev['text'].progress_apply(tokenize)

# The compound score works on the raw text, the other features on the tokens.
df_rev['sent_score_compound'] = df_rev['text'].progress_apply(get_compound_sentiment_score)
df_rev['sent_score_net'] = df_rev['review_tokens'].progress_apply(get_net_sentiment_score)
df_rev['review_length'] = df_rev['review_tokens'].progress_apply(review_len)
df_rev['punc_count'] = df_rev['review_tokens'].progress_apply(get_punc_count)
df_rev['avg_word_len'] = df_rev['review_tokens'].progress_apply(get_avg_word_len)

# Persist the enriched review table.
df_rev.to_pickle('results/reviews_hu.p')

HBox(children=(IntProgress(value=0, max=422790), HTML(value='')))

HBox(children=(IntProgress(value=0, max=422790), HTML(value='')))

HBox(children=(IntProgress(value=0, max=422790), HTML(value='')))

HBox(children=(IntProgress(value=0, max=422790), HTML(value='')))

HBox(children=(IntProgress(value=0, max=422790), HTML(value='')))

In [30]:
# Independent copy of the feature-enriched frame (DataFrame.copy is deep by default).
# NOTE(review): `df` is not referenced again in the visible cells — confirm it is still needed.
df = df_rev.copy()

In [29]:
# Inspect the newly added feature columns.
df_rev.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,review_tokens,sent_score_compound,sent_score_net,review_length,punc_count,avg_word_len
0,f5O7v_X_jCg2itqacRfxhg,0,2017-10-12,0,kS4hrhEScwB9V5JATYjvVQ,5,Sansotei serves some top notch ramen. They tak...,0,hxqo4NyJFfeOmuoVi--s1A,"[sansotei, serves, some, top, notch, ramen, .,...",-0.25,0.0,108,0.009174,4.111111
1,Edr6SafmQrZa6CcFuItoqg,0,2014-06-03,0,U04YgYr4PiGUxB46J4xapg,2,Limited vegetarian options. Ordered Greek sala...,0,B7Fh30KQT1xPSGt_pIhRRA,"[limited, vegetarian, options, ., ordered, gre...",-0.5719,-0.029412,34,0.055556,4.911765
2,7xA6iSP0Ndn08tpBFQtUKA,0,2017-05-22,0,YDJDfKnx6VpMMo4EBxycGg,1,Non-existened service! The waiter did not eve...,0,FCtoTo9zSH1cSAkascfEHw,"[non-existened, service, !, the, waiter, did, ...",-0.8948,-0.011236,89,0.072917,4.404494
3,SmizR7MLt-558FJJQRBBoQ,1,2011-06-01,1,2Hk7DNwu3rb2jKHaFfPyCA,4,It might be a bit unfair to review Globe so ea...,1,YHWsLBS8jzZiPjKHMFOaAA,"[it, might, be, a, bit, unfair, to, review, gl...",0.9982,0.028571,700,0.012694,4.305714
4,iKMLsX1Je7P3wAOEc9scDg,0,2011-11-07,0,D2TcNaPqXxDGQ6T0n-vHXw,4,I have been itching to get to Origin for month...,4,YHWsLBS8jzZiPjKHMFOaAA,"[i, have, been, itching, to, get, to, origin, ...",0.9994,0.036066,915,0.022436,4.093989


In [31]:
# Correlation between the compound sentiment score and the star rating
# (Series.corr defaults to Pearson).
df_rev['sent_score_compound'].corr(df_rev['stars'])

0.5702584820774301

In [32]:
# Correlation between the net (token-count based) sentiment score and the star rating.
df_rev['sent_score_net'].corr(df_rev['stars'])

0.5005883109103079

In [33]:
# How closely the two sentiment measures agree with each other.
df_rev['sent_score_net'].corr(df_rev['sent_score_compound'])

0.5106916591080742

In [34]:
# Correlation between average word length and the star rating.
df_rev['avg_word_len'].corr(df_rev['stars'])

0.12499467179090185

In [35]:
# Correlation between review length (non-punctuation token count) and the star rating.
df_rev['review_length'].corr(df_rev['stars'])

-0.13069784308872917

In [36]:
# Correlation between the punctuation ratio and the star rating.
df_rev['punc_count'].corr(df_rev['stars'])

0.09420106296298679

In [37]:
# Per-column aggregation shared by the per-business and per-user roll-ups.
# String aggregator names are used rather than NumPy callables: pandas resolves
# them to its own group-by implementations, and passing np.sum / np.mean to
# agg() is deprecated in recent pandas releases.
aggregator = {
    'text': 'size',                  # number of reviews in the group
    'stars': 'mean',
    'funny': 'sum',
    'cool': 'sum',
    'useful': 'sum',
    'sent_score_compound': 'mean',
    'sent_score_net': 'mean',
    'review_length': 'mean',
    'punc_count': 'mean',
    'avg_word_len': 'mean',
}
bus_rev_res = df_rev.groupby(['business_id']).agg(aggregator)
user_rev_res = df_rev.groupby(['user_id']).agg(aggregator)

In [40]:
# Relabel the aggregated columns by source name instead of by position, so a
# change in the aggregate's column order cannot silently mislabel a feature.
# Re-running the cell is a no-op once the columns are renamed.
bus_rev_res = bus_rev_res.rename(columns={
    'text': 'count_review',
    'stars': 'avg_stars',
    'funny': 'count_funny',
    'cool': 'count_cool',
    'useful': 'count_useful',
    'sent_score_compound': 'avg_sent_score_compound',
    'sent_score_net': 'avg_sent_score_net',
    'review_length': 'avg_review_length',
    'punc_count': 'avg_punc_count',
    'avg_word_len': 'avg_word_len',
})

In [41]:
# Relabel the aggregated columns by source name instead of by position, so a
# change in the aggregate's column order cannot silently mislabel a feature.
# Re-running the cell is a no-op once the columns are renamed.
user_rev_res = user_rev_res.rename(columns={
    'text': 'count_review',
    'stars': 'avg_stars',
    'funny': 'count_funny',
    'cool': 'count_cool',
    'useful': 'count_useful',
    'sent_score_compound': 'avg_sent_score_compound',
    'sent_score_net': 'avg_sent_score_net',
    'review_length': 'avg_review_length',
    'punc_count': 'avg_punc_count',
    'avg_word_len': 'avg_word_len',
})

In [43]:
# Summary statistics of the per-business aggregates.
bus_rev_res.describe()

Unnamed: 0,count_review,avg_stars,count_funny,count_cool,count_useful,avg_sent_score_compound,avg_sent_score_net,avg_review_length,avg_punc_count,avg_word_len
count,10914.0,10914.0,10914.0,10914.0,10914.0,10914.0,10914.0,10914.0,10914.0,10914.0
mean,38.738318,3.41034,15.216969,19.639546,43.09245,0.612687,0.044565,125.593871,0.032031,4.024299
std,68.904825,0.748847,30.874045,38.581838,75.464723,0.267952,0.020743,43.527664,0.01323,0.13651
min,3.0,1.0,0.0,0.0,0.0,-0.8823,-0.063111,16.0,0.0,2.992308
25%,7.0,2.984758,1.0,2.0,6.0,0.4962,0.032531,97.361607,0.025192,3.952123
50%,16.0,3.5,5.0,7.0,17.0,0.677829,0.045149,120.229021,0.031035,4.018439
75%,43.0,3.952381,16.0,20.0,49.0,0.799474,0.056699,147.504032,0.036997,4.089074
max,1837.0,5.0,805.0,1068.0,1460.0,0.99045,0.173461,434.333333,0.306463,9.192526


In [44]:
# Summary statistics of the per-user aggregates.
user_rev_res.describe()

Unnamed: 0,count_review,avg_stars,count_funny,count_cool,count_useful,avg_sent_score_compound,avg_sent_score_net,avg_review_length,avg_punc_count,avg_word_len
count,93075.0,93075.0,93075.0,93075.0,93075.0,93075.0,93075.0,93075.0,93075.0,93075.0
mean,4.542466,3.550671,1.784346,2.302938,5.053033,0.590675,0.051629,105.541242,0.032247,4.051401
std,14.039775,1.313905,26.972399,41.258408,52.974741,0.512652,0.046541,90.282135,0.039676,0.316654
min,1.0,1.0,0.0,0.0,0.0,-0.9981,-0.230769,0.0,0.0,0.0
25%,1.0,3.0,0.0,0.0,0.0,0.4497,0.025316,46.0,0.009009,3.876992
50%,1.0,4.0,0.0,0.0,1.0,0.8213,0.046577,79.0,0.025,4.021465
75%,3.0,5.0,1.0,1.0,2.0,0.942271,0.072884,135.0,0.04424,4.188679
max,1262.0,5.0,5254.0,9629.0,10893.0,0.9996,0.5,1086.0,1.0,21.25


In [46]:
# Persist both aggregate tables for later use.
bus_rev_res.to_pickle('results/bus_rev_res_hu.p')
user_rev_res.to_pickle('results/user_rev_res_hu.p')

In [52]:
# Load the business table; the merge that follows uses its index and `name` column.
# NOTE(review): presumably indexed by business_id — confirm against the notebook that wrote it.
df_bus_univ = pd.read_pickle('results/rest_univ.p')

In [53]:
# Left-join the per-business aggregates onto the business names by index, so
# businesses with no aggregated reviews are kept with NaN feature values.
df_bus_res_univ = pd.merge(
    left=df_bus_univ[['name']],
    right=bus_rev_res,
    how='left',
    left_index=True,
    right_index=True,
)
df_bus_res_univ.to_pickle('results/bus_res_univ_hu.p')

In [54]:
# Load the user table; the merge that follows joins on its index.
# NOTE(review): presumably indexed by user_id — confirm against the notebook that wrote it.
df_user_univ = pd.read_pickle('results/tor_users.p')

In [57]:
# Left-join the per-user aggregates onto the full user table by index, so
# users with no aggregated reviews are kept with NaN feature values.
df_user_res_univ = pd.merge(
    left=df_user_univ,
    right=user_rev_res,
    how='left',
    left_index=True,
    right_index=True,
)

In [59]:
# Persist the merged per-user table.
df_user_res_univ.to_pickle('results/user_res_univ_hu.p')

In [62]:
# Preview the merged per-business table.
df_bus_res_univ.head()

Unnamed: 0_level_0,name,count_review,avg_stars,count_funny,count_cool,count_useful,avg_sent_score_compound,avg_sent_score_net,avg_review_length,avg_punc_count,avg_word_len
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9A2quhZLyWk0akUetBd8hQ,Bnc Cake House,7.0,4.142857,8.0,9.0,8.0,0.859314,0.06916,84.714286,0.094369,4.477817
6OuOZAok8ikONMS_T3EzXg,Thai One On,7.0,2.0,0.0,0.0,2.0,0.221543,0.020897,104.142857,0.023388,3.995028
tZnSodhPwNr4bzrwJ1CSbw,Southern Accent Restaurant,146.0,3.958904,58.0,95.0,157.0,0.818395,0.052183,188.424658,0.036806,4.091139
5J3b7j3Fzo9ISjChmoUoUA,Mabel's Bakery,23.0,3.782609,0.0,4.0,4.0,0.855739,0.068091,87.73913,0.031138,4.111695
PMDlKLd0Mxj0ngCpuUmE5Q,The Coffee Mill Restaurant,25.0,3.44,22.0,17.0,59.0,0.696356,0.033944,150.76,0.023563,4.042583


In [60]:
# Preview the merged per-user table.
df_user_res_univ.head()

Unnamed: 0_level_0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,...,count_review,avg_stars,count_funny,count_cool,count_useful,avg_sent_score_compound,avg_sent_score_net,avg_review_length,avg_punc_count,avg_word_len
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hxqo4NyJFfeOmuoVi--s1A,5.0,0,0,0,0,0,0,0,0,0,...,1.0,5.0,0.0,0.0,0.0,-0.25,0.0,108.0,0.009174,4.111111
B7Fh30KQT1xPSGt_pIhRRA,3.5,0,0,0,0,0,0,0,0,0,...,1.0,2.0,0.0,0.0,0.0,-0.5719,-0.029412,34.0,0.055556,4.911765
FCtoTo9zSH1cSAkascfEHw,2.0,0,0,0,0,0,0,0,0,1,...,1.0,1.0,0.0,0.0,0.0,-0.8948,-0.011236,89.0,0.072917,4.404494
YHWsLBS8jzZiPjKHMFOaAA,3.38,1,0,1,2,1,4,9,0,14,...,98.0,3.336735,67.0,136.0,293.0,0.91529,0.031576,585.377551,0.022021,4.157742
XLA3LkbfQfeA-VYO7Zgzyg,4.0,0,0,0,0,0,0,0,0,0,...,1.0,2.0,0.0,0.0,0.0,0.6835,0.022124,226.0,0.058333,3.951327
