In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import operator
import os
print(os.getcwd())
print(os.listdir('/Users/Brandon/Yelp Dataset/dataset'))
import json
import glob
from datetime import datetime
from pandas import HDFStore,DataFrame

/Users/Brandon/Yelp Dataset/Yelp-FA17/local-elites/brandon
['.DS_Store', 'business.json', 'checkin.json', 'photos.json', 'review.json', 'review5000.json', 'tip.json', 'user.json', 'user5000.json']


In [5]:
def load_json_to_df(datapass):
    '''
    Load a JSON Lines file (one JSON object per line) into a pandas dataframe
    
    Input:
        datapass(str) : path to the json file
    Output:
        df(dataframe) : pandas dataframe object, one row per non-blank line
    Raises:
        json.JSONDecodeError : a non-blank line is not valid JSON
    '''
    
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # default encoding, which differs between machines.
    with open(datapass, encoding='utf-8') as data_file:
        # Blank lines (e.g. a trailing newline at end of file) carry no record;
        # skip them instead of letting json.loads fail on an empty string.
        data = [json.loads(line) for line in data_file if line.strip()]
    return pd.DataFrame(data)

LOAD THE DATA

In [6]:
%%time
# Parse the review file into the `review` dataframe (one row per line of the file).
# NOTE(review): the path is absolute and machine-specific; adjust it for your environment.
import_file = "/Users/Brandon/Yelp Dataset/dataset/review.json"
review = load_json_to_df(import_file)

CPU times: user 1min 4s, sys: 20.6 s, total: 1min 24s
Wall time: 1min 33s


In [7]:
%%time
# Parse the business file into the `business` dataframe (one row per line of the file).
# NOTE(review): the path is absolute and machine-specific; adjust it for your environment.
import_file = "/Users/Brandon/Yelp Dataset/dataset/business.json"
business = load_json_to_df(import_file)

CPU times: user 5.15 s, sys: 471 ms, total: 5.62 s
Wall time: 6.01 s


In [8]:
%%time
# Parse the user file into the `user` dataframe (one row per line of the file).
# NOTE(review): the path is absolute and machine-specific; adjust it for your environment.
import_file = "/Users/Brandon/Yelp Dataset/dataset/user.json"
user = load_json_to_df(import_file)

CPU times: user 1min, sys: 26.7 s, total: 1min 26s
Wall time: 1min 41s


Filter businesses by category (keep only Restaurants)

In [10]:
def categoryFind(x,cat):
    '''
    Tell whether a business's categories contain the given category
    
    Input:
        x   : the categories entry of one business (any container supporting `in`)
        cat : category to look for
    Output:
        bool : True when `cat in x`; False when the entry is missing (None / NaN)
    '''
    # A missing entry arrives as None or float NaN; neither supports `in`,
    # so treat it as "category not present" instead of raising TypeError.
    if x is None or isinstance(x, float):
        return False
    return cat in x
# Boolean mask over `business`: True where the row's categories contain 'Restaurants'.
restaurantIndex = business['categories'].apply(categoryFind,cat='Restaurants')
# Keep only the businesses selected by the mask.
restaurants = business[restaurantIndex]

In [11]:
%%time
businessReviews = pd.merge(review, restaurants,on='business_id',right_index=True,sort=False)[['business_id', 'cool', 'funny', 'review_id', 'stars_x', 'text', 'useful', 'user_id', 'categories', 'review_count', 'stars_y']]
bus_rev_users = pd.merge(businessReviews, user,on='user_id',right_index=True,sort=False)[['business_id', 'cool_x', 'funny_x', 'review_id', 'stars_x', 'text', 'useful_x', 'user_id', 'categories', 'stars_y', 'average_stars', 'cool_y', 'elite', 'fans', 'friends', 'name', 'review_count_y', 'yelping_since']]


CPU times: user 38.9 s, sys: 26.4 s, total: 1min 5s
Wall time: 1min 27s


In [25]:
# Row counts before and after attaching user data, then the joined column names.
print(businessReviews.shape[0])
print(bus_rev_users.shape[0])
print(bus_rev_users.columns.tolist())

2927731
2927731
['business_id', 'cool_x', 'funny_x', 'review_id', 'stars_x', 'text', 'useful_x', 'user_id', 'categories', 'stars_y', 'average_stars', 'cool_y', 'elite', 'fans', 'friends', 'name', 'review_count_y', 'yelping_since']


Define the first feature model to be analyzed

In [49]:
def numCategoryReviews(businessReviews):
    '''
    Count the rows each user contributes to the review table
    
    Input:
        businessReviews(dataframe) : needs 'user_id' and 'business_id' columns
    Output:
        dataframe : one row per 'user_id'; 'business_id' holds that user's
                    count of non-null business ids
    '''
    pairs = businessReviews[['business_id', 'user_id']]
    per_user = pairs.groupby('user_id', as_index=False)
    return per_user.count()

def average_star_category_rating(businessReviews):
    '''
    Mean star rating given by each user
    
    Input:
        businessReviews(dataframe) : needs 'user_id' and 'stars_x' columns
    Output:
        dataframe : one row per 'user_id'; 'stars_x' holds that user's mean rating
    '''
    ratings = businessReviews[['user_id', 'stars_x']]
    per_user = ratings.groupby('user_id', as_index=False)
    return per_user.mean()

def std_star_category_rating(businessReviews):
    '''
    Sample standard deviation (ddof=1) of the star ratings given by each user
    
    Input:
        businessReviews(dataframe) : needs 'user_id' and 'stars_x' columns
    Output:
        dataframe : one row per 'user_id'; 'stars_x' holds the standard deviation,
                    NaN for a user with a single rating
    '''
    ratings = businessReviews[['user_id', 'stars_x']]
    # Use the groupby's own std() instead of agg(np.var) followed by a square root.
    # Passing the NumPy function made pandas substitute its ddof=1 variance, a
    # substitution pandas has deprecated; once np.var is called directly it
    # computes the population variance (ddof=0) and the numbers would change.
    return ratings.groupby('user_id', as_index=False).std()

def funny_useful_cool(businessReviews):
    '''
    Total 'funny', 'useful' and 'cool' values per user
    
    Input:
        businessReviews(dataframe) : needs 'user_id', 'funny', 'useful', 'cool' columns
    Output:
        dataframe : one row per 'user_id' with the three columns summed
    '''
    votes = businessReviews[['user_id', 'funny', 'useful', 'cool']]
    per_user = votes.groupby('user_id', as_index=False)
    return per_user.sum()

def months_yelping(businessReviews, curr_date=None):
    '''
    Whole calendar months between each user's 'yelping_since' date and a reference date
    
    Input:
        businessReviews(dataframe) : needs 'user_id' and 'yelping_since' columns,
                                     the latter as 'YYYY-MM-DD' strings
        curr_date(datetime)        : reference date; defaults to 2017-10-17
    Output:
        dataframe : the distinct ('user_id', 'yelping_since') pairs, with
                    'yelping_since' replaced by the month count (int).
                    Only year and month are compared; the day is ignored.
    '''
    if curr_date is None:
        # Reference date the analysis was originally run against.
        curr_date = datetime(2017, 10, 17)
    user_dates = businessReviews[['user_id', 'yelping_since']].drop_duplicates()
    # Parse the whole column at once instead of calling strptime row by row.
    joined = pd.to_datetime(user_dates['yelping_since'], format="%Y-%m-%d")
    elapsed = (curr_date.year - joined.dt.year) * 12 + (curr_date.month - joined.dt.month)
    # assign() returns a new frame, so nothing derived from the caller's data is
    # written to in place (avoids pandas' SettingWithCopy ambiguity).
    return user_dates.assign(yelping_since=elapsed.astype('int64'))

def get_elite_users(df):
    '''
    Select the rows whose 'elite' entry is truthy (e.g. a non-empty list)
    
    Input:
        df(dataframe) : needs an 'elite' column
    Output:
        (list, dataframe) : index labels of the selected rows, and those rows
    '''
    # One pass over the column values; no per-row label lookup, which is slow
    # and ambiguous when the index contains repeated labels.
    is_elite = np.array([bool(entry) for entry in df['elite']], dtype=bool)
    # Select by mask rather than by label so repeated labels cannot pull in
    # rows that did not pass the test.
    return df.index[is_elite].tolist(), df.loc[is_elite]

def feature1(df, df1):
    '''
    Build the per-user feature table for the first model
    
    Input:
        df(dataframe)  : review rows with 'user_id', 'business_id', 'stars_x',
                         'funny', 'useful' and 'cool' columns
        df1(dataframe) : review rows that also carry the user columns
                         'yelping_since' and 'elite'
    Output:
        dataframe : one row per user, indexed 0..n-1, with columns 'user_id',
                    'Num Category Reviews', 'Average Rating in Category',
                    'Std Dev of Ratings in Category', 'funny', 'useful', 'cool',
                    'Months Yelping', 'elite'; missing values replaced by 0
    '''
    catRev = numCategoryReviews(df)
    catRev.columns = ['user_id','Num Category Reviews']
    averageCat = average_star_category_rating(df)
    averageCat.columns = ['user_id','Average Rating in Category']
    stdCat = std_star_category_rating(df)
    stdCat.columns = ['user_id','Std Dev of Ratings in Category']
    fuc = funny_useful_cool(df)
    monthsYelp = months_yelping(df1)
    monthsYelp.columns = ['user_id', 'Months Yelping']
    # df1 has one row per review. Reduce it to one row per user BEFORE merging,
    # otherwise the merge produces a review-sized intermediate that is thrown
    # away immediately afterwards. The first row per user is kept, which is the
    # same row the post-merge de-duplication would have kept.
    is_elite = df1[['user_id','elite']].drop_duplicates('user_id')
    features = catRev
    for part in (averageCat, stdCat, fuc, monthsYelp, is_elite):
        features = features.merge(part, on='user_id')
    # Still guard against several rows per user coming out of monthsYelp.
    features = features.drop_duplicates('user_id').reset_index(drop=True)
    return features.fillna(0)
    
    

In [50]:
%%time
# Per-user feature table: review statistics come from businessReviews,
# 'yelping_since' and 'elite' come from bus_rev_users.
feature_set_1 = feature1(businessReviews, bus_rev_users)

CPU times: user 40.9 s, sys: 2.42 s, total: 43.3 s
Wall time: 44.8 s


In [51]:
# NOTE(review): `temp` is not read by any of the cells visible here — confirm it is still needed.
temp = bus_rev_users[['user_id','elite']]
# elite_ind: index labels of the feature rows with a truthy 'elite' value; elite_users: those rows.
elite_ind, elite_users = get_elite_users(feature_set_1)

In [66]:
#elite_users
# Binary target vector: 1 at every position listed in elite_ind, 0 elsewhere.
# This relies on feature_set_1 carrying a 0..n-1 index (set by feature1), so
# that its index labels double as list positions.
expertClassifier = [0]*len(feature_set_1)
for i in elite_ind:
    expertClassifier[i]=1
# Number of positive labels.
sum(expertClassifier)

50679

Random Forest Classifier

In [67]:
# Integer class labels for the binary target.
# NOTE(review): the cells visible here use literal 0/1 instead of these names — confirm they are still needed.
expert = 1
not_expert = 0
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [75]:
# Feature matrix: every column except the identifier and the raw 'elite' value
# the target was derived from.
X = feature_set_1.drop(['user_id', 'elite'], axis=1)
y = expertClassifier
# Shallow forest with a fixed seed so the fit is repeatable.
clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [99]:
# Spot check on one hand-typed sample. Values follow the column order of X:
# [Num Category Reviews, Average Rating in Category, Std Dev of Ratings in Category,
#  funny, useful, cool, Months Yelping]
clf.predict([[22,4.77,0.68,0,0,0,15]])

array([0])

In [98]:
# Preview rows 1 through 9 (by position) of the feature table.
feature_set_1.iloc[1:10]

Unnamed: 0,user_id,Num Category Reviews,Average Rating in Category,Std Dev of Ratings in Category,funny,useful,cool,Months Yelping,elite
1,---PLwSf5gKdIoVnyRHgBA,1,3.0,0.0,0,0,0,27,[]
2,---cu1hq55BP9DWVXXKHZg,2,2.5,2.12132,0,1,0,102,[]
3,---udAKDsn0yQXmzbWQNSw,1,4.0,0.0,0,0,0,39,[]
4,--0RtXvcOIE4XbErYca6Rw,1,4.0,0.0,0,0,0,53,[]
5,--0sXNBv6IizZXuV-nl0Aw,1,5.0,0.0,0,1,0,57,[]
6,--1av6NdbEbMiuBr7Aup9A,1,5.0,0.0,0,3,0,85,[]
7,--1mPJZdSY9KluaBYAGboQ,1,5.0,0.0,0,0,0,75,[]
8,--26jc8nCJBy4-7r3ZtmiQ,1,5.0,0.0,1,1,1,38,[]
9,--2HUmLkcNHZp0xw6AMBPg,22,4.772727,0.685344,0,0,0,15,[]
