In [None]:
import time
import pandas as pd
import numpy as np
import os
import json
import nltk
import string
from tqdm.autonotebook import tqdm
tqdm.pandas()
from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Load the review table that the rest of the notebook works on.
# NOTE: unpickling can execute arbitrary code - only load pickle files produced by a trusted pipeline.
df_rev = pd.read_pickle('results/tor_reviews.p')

In [None]:
# Preview the first rows of the loaded reviews.
df_rev.head()

In [None]:
# string.punctuation with '.' and ',' removed; the remaining characters keep their original order
punctuations = ''.join(ch for ch in string.punctuation if ch not in '.,')

In [None]:
def tokenize(sent):
    """Lower-case ``sent`` and tokenize it with NLTK's ``TweetTokenizer``.

    A tokenizer with default settings is created on each call; the return
    value is whatever ``TweetTokenizer.tokenize`` yields for the lower-cased text.
    """
    tweet_tokenizer = TweetTokenizer()
    lowered = sent.lower()
    return tweet_tokenizer.tokenize(lowered)

In [None]:
def review_len(tokens):
    """Return how many tokens are not punctuation.

    A token is skipped when ``token in punctuations`` holds. ``punctuations``
    is a string, so this is a substring test: single punctuation characters
    match, and so does any token that is a contiguous slice of that string.
    """
    return sum(1 for token in tokens if token not in punctuations)
    

In [27]:
def get_compound_sentiment_score(sent):
    """
    Return the VADER ``compound`` polarity score for the raw text ``sent``.

    The analyzer is created on the first call and then reused: constructing
    ``SentimentIntensityAnalyzer`` reloads the VADER lexicon, which is far too
    expensive to repeat for every review when this is applied to a whole column.

    If you use the VADER sentiment analysis tools, please cite:
    cite: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
    Tutorial: http://t-redactyl.io/blog/2017/04/using-vader-to-handle-sentiment-analysis-with-social-media-text.html
    """
    # Cache the analyzer on the function object so this cell stays self-contained.
    sentim_analyzer = getattr(get_compound_sentiment_score, '_analyzer', None)
    if sentim_analyzer is None:
        sentim_analyzer = SentimentIntensityAnalyzer()
        get_compound_sentiment_score._analyzer = sentim_analyzer
    return sentim_analyzer.polarity_scores(sent)['compound']
    

In [18]:
def get_net_sentiment_score(tokens):
    """
    Net sentiment of a token list:
    (positive tokens - negative tokens) / number of non-punctuation tokens.

    Tokens for which ``token in punctuations`` holds are dropped first. Each
    remaining token is scored on its own with VADER; a compound score >= 0.33
    counts as positive and <= -0.33 as negative. Returns 0 when no tokens
    remain after dropping punctuation.

    The analyzer is created on the first call and reused afterwards, and each
    token is scored exactly once.

    If you use the VADER sentiment analysis tools, please cite:
    cite: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
    """
    # Cache the analyzer on the function object: rebuilding it reloads the lexicon every time.
    sentim_analyzer = getattr(get_net_sentiment_score, '_analyzer', None)
    if sentim_analyzer is None:
        sentim_analyzer = SentimentIntensityAnalyzer()
        get_net_sentiment_score._analyzer = sentim_analyzer

    tokens = [token for token in tokens if token not in punctuations]
    if not tokens:
        return 0

    pos_count = 0
    neg_count = 0
    for token in tokens:
        # Score once and reuse the value for both threshold checks.
        compound = sentim_analyzer.polarity_scores(token)['compound']
        if compound >= 0.33:
            pos_count += 1
        elif compound <= -0.33:
            neg_count += 1
    return (pos_count - neg_count) / len(tokens)

In [64]:
# Side-by-side check of the two scorers on one sentence with increasing emphasis
sent1 = 'it is a bad time for me now'
sent2 = 'it is a bad time for me now!!!'
sent3 = 'it is a bad time for me now!!! :('
print(list(map(get_compound_sentiment_score, (sent1, sent2, sent3))))
# The net scorer takes tokens; here the text is simply split on single spaces
print(list(map(get_net_sentiment_score, (sent1.split(' '), sent2.split(' '), sent3.split(' ')))))

[-0.5423, -0.6571, -0.8061]
[-0.125, -0.125, -0.2222222222222222]


In [19]:
def get_punc_count(tokens):
    """Return the share of tokens that are punctuation (a ratio, not a raw count).

    A token counts as punctuation when ``token in punctuations`` holds, which
    is a substring test because ``punctuations`` is a string. The denominator
    is the total number of tokens; an empty token list gives 0.
    """
    punc_total = sum(1 for token in tokens if token in punctuations)
    n_tokens = len(tokens)
    return punc_total / n_tokens if n_tokens > 0 else 0

In [20]:
def get_avg_word_len(tokens):
    """Return the mean character length of the non-punctuation tokens.

    Tokens for which ``token in punctuations`` holds are ignored. Returns 0
    when nothing is left to average.
    """
    words = [token for token in tokens if token not in punctuations]
    if not words:
        return 0
    return sum(len(word) for word in words) / len(words)

In [28]:
# Tokenize each review once; the compound score uses the raw text, every other feature uses the tokens.
df_rev['review_tokens'] = df_rev['text'].progress_apply(tokenize)
df_rev['sent_score_compound'] = df_rev['text'].progress_apply(get_compound_sentiment_score)
df_rev['sent_score_net'] = df_rev['review_tokens'].progress_apply(get_net_sentiment_score)
df_rev['review_length'] = df_rev['review_tokens'].progress_apply(review_len)
df_rev['punc_count'] = df_rev['review_tokens'].progress_apply(get_punc_count)
df_rev['avg_word_len'] = df_rev['review_tokens'].progress_apply(get_avg_word_len)
# Save the featurized reviews so the slow scoring pass does not have to be repeated
df_rev.to_pickle('results/reviews_hu.p')

HBox(children=(IntProgress(value=0, max=422790), HTML(value='')))

HBox(children=(IntProgress(value=0, max=422790), HTML(value='')))

HBox(children=(IntProgress(value=0, max=422790), HTML(value='')))

HBox(children=(IntProgress(value=0, max=422790), HTML(value='')))

HBox(children=(IntProgress(value=0, max=422790), HTML(value='')))

In [30]:
# Keep a separate copy of the featurized reviews.
# NOTE(review): `df` is not referenced again in the visible cells - confirm it is still needed.
df = df_rev.copy()

In [29]:
# Preview the reviews together with the newly added text-feature columns.
df_rev.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,review_tokens,sent_score_compound,sent_score_net,review_length,punc_count,avg_word_len
0,f5O7v_X_jCg2itqacRfxhg,0,2017-10-12,0,kS4hrhEScwB9V5JATYjvVQ,5,Sansotei serves some top notch ramen. They tak...,0,hxqo4NyJFfeOmuoVi--s1A,"[sansotei, serves, some, top, notch, ramen, .,...",-0.25,0.0,108,0.009174,4.111111
1,Edr6SafmQrZa6CcFuItoqg,0,2014-06-03,0,U04YgYr4PiGUxB46J4xapg,2,Limited vegetarian options. Ordered Greek sala...,0,B7Fh30KQT1xPSGt_pIhRRA,"[limited, vegetarian, options, ., ordered, gre...",-0.5719,-0.029412,34,0.055556,4.911765
2,7xA6iSP0Ndn08tpBFQtUKA,0,2017-05-22,0,YDJDfKnx6VpMMo4EBxycGg,1,Non-existened service! The waiter did not eve...,0,FCtoTo9zSH1cSAkascfEHw,"[non-existened, service, !, the, waiter, did, ...",-0.8948,-0.011236,89,0.072917,4.404494
3,SmizR7MLt-558FJJQRBBoQ,1,2011-06-01,1,2Hk7DNwu3rb2jKHaFfPyCA,4,It might be a bit unfair to review Globe so ea...,1,YHWsLBS8jzZiPjKHMFOaAA,"[it, might, be, a, bit, unfair, to, review, gl...",0.9982,0.028571,700,0.012694,4.305714
4,iKMLsX1Je7P3wAOEc9scDg,0,2011-11-07,0,D2TcNaPqXxDGQ6T0n-vHXw,4,I have been itching to get to Origin for month...,4,YHWsLBS8jzZiPjKHMFOaAA,"[i, have, been, itching, to, get, to, origin, ...",0.9994,0.036066,915,0.022436,4.093989


In [31]:
# Pearson correlation (the Series.corr default) between the compound sentiment score and the star rating.
df_rev['sent_score_compound'].corr(df_rev['stars'])

0.5702584820774301

In [32]:
# Pearson correlation (the Series.corr default) between the net sentiment score and the star rating.
df_rev['sent_score_net'].corr(df_rev['stars'])

0.5005883109103079

In [33]:
# How closely the two sentiment measures agree with each other.
df_rev['sent_score_net'].corr(df_rev['sent_score_compound'])

0.5106916591080742

In [34]:
# Pearson correlation (the Series.corr default) between average word length and the star rating.
df_rev['avg_word_len'].corr(df_rev['stars'])

0.12499467179090185

In [35]:
# Pearson correlation (the Series.corr default) between review length (non-punctuation tokens) and the star rating.
df_rev['review_length'].corr(df_rev['stars'])

-0.13069784308872917

In [36]:
# Pearson correlation (the Series.corr default) between the punctuation share of a review and the star rating.
df_rev['punc_count'].corr(df_rev['stars'])

0.09420106296298679

In [None]:
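# Disabled earlier variant: aggregates over all reviews per business / user, without the 730-day rolling window used further down.
# aggregator = {'text':np.size,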
#               'stars': np.mean, 
#               'funny':np.sum, 
#               'cool':np.sum, 
#               'useful': np.sum,
#               'sent_score_compound': np.mean,
#               'sent_score_net':np.mean, 
#               'review_length': np.mean,
#               'punc_count':np.mean, 
#               'avg_word_len': np.mean
#                }
# bus_rev_res = df_rev.groupby(['business_id']).agg(aggregator)
# user_rev_res = df_rev.groupby(['user_id']).agg(aggregator)

In [102]:
# Parse the review dates into pandas datetimes; the time-based rolling windows further down need a datetime index.
# pd.to_datetime replaces astype(np.datetime64): recent pandas versions reject casting to a unit-less datetime64 dtype.
df_rev['date'] = pd.to_datetime(df_rev['date'])
# Positional row id: a column with no missing values, used later to count the reviews in each window.
df_rev['ind'] = np.arange(len(df_rev))

In [103]:
# Per-window statistics. String names select pandas' built-in rolling reductions,
# so the review count no longer goes through a Python-level call for every window.
# 'ind' has one non-null value per review, so counting it gives the number of reviews in the window.
aggregator = {'ind': 'count',
              'stars': 'mean',
              'funny': 'sum',
              'cool': 'sum',
              'useful': 'sum',
              'sent_score_compound': 'mean',
              'sent_score_net': 'mean',
              'review_length': 'mean',
              'punc_count': 'mean',
              'avg_word_len': 'mean'
              }

# Time-based rolling windows need a sorted datetime index; build it once and reuse it for both groupings.
reviews_by_date = df_rev.sort_values('date').set_index('date')

# For every review, aggregate the reviews of the same business / user dated within the 730 days up to that review.
# NOTE(review): with pandas' default window closure the current review is part of its own window - confirm that is intended.
bus_rev_res = reviews_by_date.groupby(['business_id']).rolling(window='730d').agg(aggregator)
user_rev_res = reviews_by_date.groupby(['user_id']).rolling(window='730d').agg(aggregator)

# Drop the sorted copy so it does not linger in kernel memory.
del reviews_by_date

In [104]:
# Rename the aggregated columns positionally; the order must match the keys of the aggregation dict.
bus_rev_res.columns = [
    'count_review',
    'avg_stars',
    'count_funny',
    'count_cool',
    'count_useful',
    'avg_sent_score_compound',
    'avg_sent_score_net',
    'avg_review_length',
    'avg_punc_count',
    'avg_word_len',
]

In [105]:
# Rename the aggregated columns positionally; the order must match the keys of the aggregation dict.
user_rev_res.columns = [
    'count_review',
    'avg_stars',
    'count_funny',
    'count_cool',
    'count_useful',
    'avg_sent_score_compound',
    'avg_sent_score_net',
    'avg_review_length',
    'avg_punc_count',
    'avg_word_len',
]

In [106]:
# Turn the (id, date) index levels into ordinary columns, leaving a fresh 0..n-1 index.
# reset_index does this directly, without the round trip through a NumPy record array.
bus_rev_res = bus_rev_res.reset_index()
user_rev_res = user_rev_res.reset_index()

In [107]:
# Persist the per-business and per-user rolling aggregates.
bus_rev_res.to_pickle('results/bus_rev_res_hu.p')
user_rev_res.to_pickle('results/user_rev_res_hu.p')

In [108]:
# Load the restaurant universe that the business aggregates are attached to.
# NOTE: unpickling can execute arbitrary code - only load pickle files produced by a trusted pipeline.
df_bus_univ = pd.read_pickle('results/rest_univ.p')

In [109]:
# Attach the rolling review aggregates to the restaurant universe by business id.
# bus_rev_res carries business_id as a column and a plain 0..n-1 index, so an
# index-to-index merge would pair rows by position instead of by business.
if 'business_id' in df_bus_univ.columns:
    df_bus_res_univ = pd.merge(df_bus_univ[['business_id', 'name']], bus_rev_res,
                               on='business_id', how='left')
else:
    # NOTE(review): assumes the universe frame is indexed by business_id - confirm against results/rest_univ.p
    df_bus_res_univ = pd.merge(df_bus_univ[['name']], bus_rev_res.set_index('business_id'),
                               left_index=True, right_index=True, how='left')
df_bus_res_univ.to_pickle('results/bus_res_univ_hu.p')

In [110]:
# Load the user table that the user aggregates are attached to.
# NOTE: unpickling can execute arbitrary code - only load pickle files produced by a trusted pipeline.
df_user_univ = pd.read_pickle('results/tor_users.p')

In [111]:
# Attach the rolling review aggregates to the user table by user id.
# user_rev_res carries user_id as a column and a plain 0..n-1 index, so an
# index-to-index merge would pair rows by position instead of by user.
if 'user_id' in df_user_univ.columns:
    df_user_res_univ = pd.merge(df_user_univ, user_rev_res, on='user_id', how='left')
else:
    # NOTE(review): assumes the user frame is indexed by user_id - confirm against results/tor_users.p
    df_user_res_univ = pd.merge(df_user_univ, user_rev_res.set_index('user_id'),
                                left_index=True, right_index=True, how='left')

In [112]:
# Persist the user table enriched with the rolling review aggregates.
df_user_res_univ.to_pickle('results/user_res_univ_hu.p')

In [116]:
# Preview the per-user rolling aggregates (one row per user and review date).
user_rev_res.head()

Unnamed: 0,user_id,date,count_review,avg_stars,count_funny,count_cool,count_useful,avg_sent_score_compound,avg_sent_score_net,avg_review_length,avg_punc_count,avg_word_len
0,--56y1InAvNoQOD6YYrhVQ,2016-08-03,1.0,1.0,0.0,0.0,0.0,0.9337,0.02963,135.0,0.0,3.881481
1,--7gjElmOrthETJ8XqzMBw,2014-11-16,1.0,2.0,0.0,0.0,0.0,0.5106,0.041667,48.0,0.04,4.708333
2,--7gjElmOrthETJ8XqzMBw,2018-05-24,1.0,4.0,1.0,0.0,0.0,0.9181,0.041237,97.0,0.093458,4.329897
3,--Br-QsbO9ad5GbZxVGxaw,2016-08-24,1.0,2.0,0.0,0.0,0.0,0.8947,0.014599,137.0,0.055172,4.20438
4,--BumyUHiO_7YsHurb9Hkw,2017-01-13,1.0,5.0,0.0,0.0,0.0,0.9873,0.107527,93.0,0.088235,4.83871


In [115]:
# Preview the per-business rolling aggregates (one row per business and review date).
bus_rev_res.head()

Unnamed: 0,business_id,date,count_review,avg_stars,count_funny,count_cool,count_useful,avg_sent_score_compound,avg_sent_score_net,avg_review_length,avg_punc_count,avg_word_len
0,--DaPTJW3-tB1vP-PfdTEg,2012-06-04,1.0,4.0,0.0,0.0,0.0,0.9869,0.082803,157.0,0.042683,4.0
1,--DaPTJW3-tB1vP-PfdTEg,2012-06-14,2.0,4.0,1.0,2.0,1.0,0.9419,0.090182,99.0,0.044597,4.073171
2,--DaPTJW3-tB1vP-PfdTEg,2012-11-11,3.0,4.333333,1.0,2.0,1.0,0.932067,0.087899,94.0,0.037483,4.259098
3,--DaPTJW3-tB1vP-PfdTEg,2013-04-07,4.0,4.25,1.0,2.0,1.0,0.93485,0.104987,78.5,0.028113,4.202136
4,--DaPTJW3-tB1vP-PfdTEg,2013-07-06,5.0,4.2,1.0,2.0,1.0,0.74788,0.083989,65.0,0.02249,4.198072
