In [1]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
sns.set(style="darkgrid")
import matplotlib.pyplot as plt
%matplotlib inline

import datetime
import time

In [2]:
import warnings
warnings.filterwarnings("ignore")

<h1>Yelp Data Exploration</h1>

In [3]:
def convertJSON (jsonFilename) :
    # read the entire file into a python array
    with open(jsonFilename, 'r') as f:
        data = f.readlines()

    # remove the trailing "\n" from each line
    data = map(lambda x: x.rstrip(), data)

    # each element of 'data' is an individual JSON object.
    # i want to convert it into an *array* of JSON objects
    # which, in and of itself, is one large JSON object
    # basically... add square brackets to the beginning
    # and end, and have all the individual business JSON objects
    # separated by a comma
    data_json_str = "[" + ','.join(data) + "]"

    # now, load it into pandas
    data_df = pd.read_json(data_json_str)
    
    return data_df

SHARED_FOLDER = './pickle'
with open ('{}/business.pickle'.format(SHARED_FOLDER), 'rb') as f:
    df_business = pickle.load(f)


with open ('{}/user.pickle'.format(SHARED_FOLDER), 'rb') as f:
    df_users = pickle.load(f)


with open ('{}/review.pickle'.format(SHARED_FOLDER), 'rb') as f:
     df_reviews = pickle.load(f)


with open ('{}/tip.pickle'.format(SHARED_FOLDER), 'rb') as f:
     df_tips = pickle.load(f)


In [5]:
base_path = './yelp/json'
df_business = convertJSON(base_path+'/businesses/yelp_academic_dataset_business.json')
print('Businesses Loaded....')
df_reviews = convertJSON(base_path+'/reviews/yelp_academic_dataset_review.json')
print('reviews Loaded....')
df_users = convertJSON(base_path+'/users/yelp_academic_dataset_user.json')
print('users Loaded....')
df_tips = convertJSON(base_path+'/tips/yelp_academic_dataset_tip.json')
print('tips Loaded....')

Businesses Loaded....
reviews Loaded....
users Loaded....
tips Loaded....


In [6]:
df_business_vegas = df_business[(df_business.city == 'Las Vegas')]


print('There are ', len(df_business_vegas), 'business in Las Vegas')


df_reviews_vegas = df_reviews[df_reviews.business_id.isin(df_business_vegas.business_id)]
df_reviews_vegas.date = df_reviews_vegas.date.apply(lambda x : x.to_datetime())

print('There are', len(df_reviews_vegas), 'reviews on those businesses')


df_tips_vegas = df_tips[df_tips.business_id.isin(df_business_vegas.business_id)]
df_tips_vegas.date = df_tips_vegas.date.apply(lambda x : x.to_datetime())

print('There are', len(df_tips_vegas), 'tips on those businesses')

df_users_vegas = df_users[df_users.user_id.isin(df_reviews_vegas.user_id) | df_users.user_id.isin(df_tips_vegas.user_id)]


print('There are', len(df_users_vegas), 'users corresponding to those reviews & tips')




There are  17423 business in Las Vegas
There are 861536 reviews on those businesses
There are 253748 tips on those businesses
There are 259396 users corresponding to those reviews & tips


In [7]:
df_users_vegas['friends_count'] = df_users_vegas.friends.apply(lambda x : len(x))

In [8]:
#Let's count tips
count_tips = df_tips_vegas.groupby(['user_id'], as_index=False).agg({
        "business_id" : "count"
    })

count_tips.columns = ['user_id', 'count_tips']

df_users_vegas = df_users_vegas.merge(count_tips, on='user_id')

In [9]:
#Let's count reviews
count_reviews = df_reviews_vegas.groupby(['user_id'], as_index=False).agg({
        "review_id" : "count"
    })

count_reviews.columns = ['user_id', 'count_reviews']

df_users_vegas = df_users_vegas.merge(count_reviews, on='user_id')

In [10]:
#let's count votes
df_users_vegas['count_votes'] = df_users_vegas.votes.apply(lambda x : x['cool'] + x['funny'] + x['useful'])

In [11]:
#Let's count the number of reviews on business
df_reviews_vegas['business_count'] = 1
df_reviews_vegas['business_count'] = df_reviews_vegas.groupby(['business_id'])['business_count'].cumsum()

#Add a Timestamp to the dataframe
df_reviews_vegas['timestamp'] = df_reviews_vegas['date'].apply(lambda x : x.timestamp())

In [12]:
#Reduce df_reviews_stars_size
rev_cols = ['user_id', 'business_id', 'review_id', 'date', 'business_count', 'timestamp', 'stars']
df_reviews_compute = df_reviews_vegas[rev_cols]

In [13]:
nb_days_after_review = 30 #1 month
nb_days_before_review = 4*30 #4 months

def computeInfluence (manif) :
    
    if((manif.name % 500) == 0):
        print(manif.name,'...')
    review_date = manif.date
    before_date = review_date - datetime.timedelta(days=nb_days_before_review)
    after_date = review_date + datetime.timedelta(days=nb_days_after_review)
    
    #Get the reviews on the business for the timelapse we need
    before_reviews = df_reviews_compute[(df_reviews_compute.business_id == manif.business_id) 
                                      & (df_reviews_compute.date > before_date) 
                                      & (df_reviews_compute.date <= review_date)]
    #before_reviews = df_business_vegas[(reviews.date <= review_date)]
    after_reviews = df_reviews_compute[(df_reviews_compute.business_id == manif.business_id) 
                                     & (df_reviews_compute.date > review_date) 
                                     & (df_reviews_compute.date <= after_date)]
    
    if (len(after_reviews) < 1) :
        #print('No reviews after the review', [len(after_reviews)])
        return 0
    elif (len(before_reviews) < 2) :  
        expected_reviews_count = manif.business_count
        max_timestamp = after_reviews['timestamp'].max()
        actual_reviews_count = after_reviews[after_reviews.timestamp == max_timestamp].business_count.values[0]
        influence = (actual_reviews_count - expected_reviews_count)/expected_reviews_count

        return influence
    
    # Plot the regression
    

    
    #Linear regression
    #regr = linear_model.SGDRegressor(alpha=0.1, n_iter=2, shuffle=False)
    X = before_reviews['timestamp'].values
    Y = before_reviews['business_count'].values
    #X = np.reshape(X, (len(X), 1))
    #Y = np.reshape(Y, (len(Y), 1))
    
    P = np.polyfit(X, Y, deg=1)
    
    #regr.fit(X, Y)
    
    
    #y = a*x + b
    a = P[0]
    b = P[1]
    
    #Expected review count on the after date
    expected_reviews_count = np.ceil(a*after_date.timestamp() + b)
    
    
    #Actual review count on the after date
    #since the reviews dataframe is sorted by date descending, we just have to take the last one on the list
    max_timestamp = after_reviews['timestamp'].max()
    actual_reviews_count = after_reviews[after_reviews.timestamp == max_timestamp].business_count.values[0]
    
    
    influence = (actual_reviews_count - expected_reviews_count)/expected_reviews_count
    
    if (influence == float("-inf")) :
        return 0   
    return influence

In [14]:
yelper_stars = df_users_vegas[df_users_vegas.count_reviews > 20].sample(100)
print(len(yelper_stars), 'users choosen')

100 users choosen


In [15]:
#Let's plot the reviews evolution on the business with the most reviews
businesses = df_business_vegas

#def computeBusinessInfluence(business) : 

reviews = df_reviews_vegas[df_reviews_vegas.user_id.isin(yelper_stars.user_id)][rev_cols]

reviews.sort_values(['date'], ascending=True, inplace=True)

reviews.reset_index(drop=True, inplace=True)

print(len(reviews), 'reviews to compute')
start_time = time.time()
reviews['influence'] = reviews.apply(computeInfluence, axis=1)
print("--- %s seconds ---" % (time.time() - start_time))
reviews.head()

5329 reviews to compute
0 ...
500 ...
1000 ...
1500 ...
2000 ...
2500 ...
3000 ...
3500 ...
4000 ...
4500 ...
5000 ...
--- 860.8625617027283 seconds ---


Unnamed: 0,user_id,business_id,review_id,date,business_count,timestamp,stars,influence
0,w6Vv-kldGpmvSGqXvTbAdQ,-7yF42k0CcJhtPw51oaOqQ,b76W7MMBvWOBh64lWS_jnw,2006-03-06,17,1141600000.0,5,0.0
1,w6Vv-kldGpmvSGqXvTbAdQ,34uJtlPnKicSaX1V8_tu1A,YlZq679-ckuY9KZmRaY2fA,2006-03-06,16,1141600000.0,5,-0.1
2,w6Vv-kldGpmvSGqXvTbAdQ,uNZ-RXanNptPHgUdzD3vfw,CTLIHYjrV3CoNpWqVWmPKQ,2006-03-31,1,1143756000.0,5,0.0
3,w6Vv-kldGpmvSGqXvTbAdQ,6L4pcXqK7G4GrGYd9q0Xbg,g3xU5hEnxenZnrbKsLUDYw,2006-03-31,1,1143756000.0,4,1.0
4,ZWOj6LmzwGvMDh-A85EOtA,rGhbYmGvoqU7IVrmuNQi2g,yuIKt0VQxS3AjTFJ9j221Q,2006-04-02,1,1143929000.0,4,0.0


In [16]:
reviews.to_csv('influence_reviews_100.csv', index=False)

In [23]:
print(len(reviews), 'reviews ')

#Pick the relevant reviews (with more than 10% influence)
star_reviews = reviews[reviews.influence.abs() >= 0.1]

print(len(star_reviews), 'star reviews ')

5329 reviews 
637 star reviews 


In [24]:
global_ys = reviews.groupby(['user_id'], as_index=False).agg({
        'influence' : 'mean',
        'review_id' : 'count'
    })

global_ys.sort_values(['influence'], ascending=False, inplace=True)

global_ys.head()

Unnamed: 0,user_id,influence,review_id
94,yvbuGvQKodq6PoRJ2whCfw,5.107417,10
6,1fujZVn5be9EooASzMkdng,2.563172,4
49,VsGlXCCjzsKyuel6Gpu-7A,1.946985,16
39,RYkq-O3vNwzMhg0kxTEkzg,1.915238,10
32,NFZVcGQZxZGfWmJ8wZTADQ,1.6,1


In [20]:


users_group = star_reviews.groupby(['user_id', 'stars'], as_index=False).agg({
        'influence' : 'mean',
        'review_id' : 'count'
    })

#users_group.sort_values(['influence'], ascending=False, inplace=True)
print(len(users_group.user_id.unique()), 'uniques users')

5329 reviews 
637 star reviews 
95 uniques users


In [None]:

plt.plot(reviews.timestamp, reviews['business_count'])

random_review = reviews.sample(1).review_id.values[0]
random_date = reviews[reviews.review_id == random_review].timestamp.values[0]
#end_date =  random_date + datetime.timedelta(days=nb_days_after_review)
#before_date = random_date - datetime.timedelta(days=nb_days_before_review)


plt.axvline(random_date, color='g')
#plt.axvline(end_date, color='r')
#plt.axvline(before_date, color='y')
plt.show()


results = reviews[reviews.review_id == random_review]


In [None]:
g = sns.FacetGrid(reviews, row="stars", margin_titles=True)
bins = np.linspace(-0.2, 0.2, 100)
g.map(plt.hist, "influence", color="steelblue", bins=bins, lw=0)

In [None]:
aa = reviews.groupby(['user_id', 'stars'], as_index=False).agg({
        'review_id' : 'count',
        'influence' : 'mean'
    })

test = aa[(aa.stars >= 3) & (aa.influence > 0.05)]

test  = aa[aa.user_id.isin(test.user_id)
               & (reviews.stars < 3)
               & (reviews.influence < -0.05)]

bb = reviews[reviews.user_id.isin(test.user_id)].groupby(['user_id', 'stars'], as_index=False).agg({
        'review_id' : 'count',
        'influence' : 'mean'
    })
#aa.sort_values(['influence'], inplace=True, ascending=False)
bb

In [None]:
reviews.to_csv('influence_reviews.csv')