# Topical Expert Model

In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import operator
import os
print(os.getcwd())
print(os.listdir('/Users/Brandon/Yelp Dataset/dataset'))
import json
import glob
from datetime import datetime
from pandas import HDFStore,DataFrame

/Users/Brandon/Yelp Dataset/Yelp-FA17/local-elites/brandon
['.DS_Store', 'business.json', 'checkin.json', 'photos.json', 'review.json', 'review5000.json', 'tip.json', 'user.json', 'user5000.json']


In [5]:
def load_json_to_df(datapass):
    '''
    Load a line-delimited json file and parse it to pandas dataframe format.

    Every non-blank line of the file is parsed as one JSON document and
    becomes one row of the result.

    Input:
        datapass(str) : path to the json file (one JSON document per line)
    Output:
        df(dataframe) : pandas dataframe object, one row per parsed line
    Raises:
        ValueError (json.JSONDecodeError) if a non-blank line is not valid JSON
    '''

    data = []
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which differs between machines.
    with open(datapass, encoding='utf-8') as data_file:
        for line in data_file:
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            if not line.strip():
                continue
            data.append(json.loads(line))
    df = pd.DataFrame(data)
    return df

LOAD THE DATA

In [6]:
%%time
# Absolute, machine-specific path to review.json; adjust it for your environment.
# load_json_to_df parses the file line by line into a DataFrame.
import_file = "/Users/Brandon/Yelp Dataset/dataset/review.json"
review = load_json_to_df(import_file)

CPU times: user 1min 4s, sys: 20.6 s, total: 1min 24s
Wall time: 1min 33s


In [7]:
%%time
# Absolute, machine-specific path to business.json; adjust it for your environment.
# load_json_to_df parses the file line by line into a DataFrame.
import_file = "/Users/Brandon/Yelp Dataset/dataset/business.json"
business = load_json_to_df(import_file)

CPU times: user 5.15 s, sys: 471 ms, total: 5.62 s
Wall time: 6.01 s


In [8]:
%%time
# Absolute, machine-specific path to user.json; adjust it for your environment.
# load_json_to_df parses the file line by line into a DataFrame.
import_file = "/Users/Brandon/Yelp Dataset/dataset/user.json"
user = load_json_to_df(import_file)

CPU times: user 1min, sys: 26.7 s, total: 1min 26s
Wall time: 1min 41s


Filter by category (keep only restaurants)

In [10]:
def categoryFind(x,cat):
    '''
    Tell whether a category appears in one business's categories value.

    Input:
        x : the categories value of a single business; anything that supports
            the `in` operator (e.g. a list of names or a string)
        cat(str) : category to look for
    Output:
        bool : True if cat is found in x; False if x is missing
    '''
    # A business without categories shows up as None (or as a float NaN once it
    # sits in a DataFrame column); neither supports `in`, so treat as no match.
    if x is None or isinstance(x, float):
        return False
    return cat in x
# Boolean mask over businesses whose categories contain 'Restaurants',
# then the matching rows of the business table.
restaurantIndex = business['categories'].apply(lambda cats: categoryFind(cats, cat='Restaurants'))
restaurants = business.loc[restaurantIndex]

In [11]:
%%time
# Join reviews to restaurants, then to their authors, on the shared id columns
# (default inner join). `right_index=True` is intentionally NOT passed:
# combining it with `on=` is rejected by current pandas, and the intended join
# key is the column, not the row index of the right-hand frame.
# Columns present on both sides get the default `_x` (left) / `_y` (right)
# suffixes, which the column selections below rely on.
businessReviews = pd.merge(review, restaurants,on='business_id',sort=False)[['business_id', 'cool', 'funny', 'review_id', 'stars_x', 'text', 'useful', 'user_id', 'categories', 'review_count', 'stars_y']]
bus_rev_users = pd.merge(businessReviews, user,on='user_id',sort=False)[['business_id', 'cool_x', 'funny_x', 'review_id', 'stars_x', 'text', 'useful_x', 'user_id', 'categories', 'stars_y', 'average_stars', 'cool_y', 'elite', 'fans', 'friends', 'name', 'review_count_y', 'yelping_since']]

CPU times: user 38.9 s, sys: 26.4 s, total: 1min 5s
Wall time: 1min 27s


In [302]:
# Sanity check on the two merged frames: row counts first, then column names.
print(len(businessReviews), len(bus_rev_users), sep='\n')
print(list(businessReviews), list(bus_rev_users), sep='\n')

2927731
2927731
['business_id', 'cool', 'funny', 'review_id', 'stars_x', 'text', 'useful', 'user_id', 'categories', 'review_count', 'stars_y']
['business_id', 'cool_x', 'funny_x', 'review_id', 'stars_x', 'text', 'useful_x', 'user_id', 'categories', 'stars_y', 'average_stars', 'cool_y', 'elite', 'fans', 'friends', 'name', 'review_count_y', 'yelping_since']


## Define the first feature model to be analyzed

In [318]:
def numCategoryReviews(businessReviews):
    '''
    Count the reviews each user wrote in the given review table.

    Input:
        businessReviews(dataframe) : needs 'business_id' and 'user_id' columns
    Output:
        dataframe : one row per user_id; the 'business_id' column holds the
                    number of non-null business_id values for that user
    '''
    user_business = businessReviews[['business_id','user_id']]
    per_user = user_business.groupby('user_id',as_index=False)
    return per_user.count()

def average_star_category_rating(businessReviews):
    '''
    Average the star rating each user gave in the given review table.

    Input:
        businessReviews(dataframe) : needs 'user_id' and 'stars_x' columns
    Output:
        dataframe : one row per user_id with the mean of 'stars_x'
    '''
    user_stars = businessReviews[['user_id','stars_x']]
    per_user = user_stars.groupby('user_id',as_index=False)
    return per_user.mean()

def std_star_category_rating(businessReviews):
    '''
    Sample standard deviation of the star ratings each user gave.

    Input:
        businessReviews(dataframe) : needs 'user_id' and 'stars_x' columns
    Output:
        dataframe : one row per user_id; 'stars_x' holds the sample standard
                    deviation (ddof=1) of that user's ratings, which is NaN
                    for a user with a single review
    '''
    # Use the groupby's own std() rather than agg(np.var) followed by a square
    # root: it states the ddof=1 intent directly and does not depend on pandas
    # translating a NumPy callable into its own reducer.
    return businessReviews[['user_id','stars_x']].groupby('user_id',as_index=False).std()

def funny_useful_cool(businessReviews):
    '''
    Total the funny / useful / cool values over each user's reviews.

    Input:
        businessReviews(dataframe) : needs 'user_id', 'funny', 'useful', 'cool'
    Output:
        dataframe : one row per user_id with the summed 'funny', 'useful'
                    and 'cool' columns
    '''
    vote_columns = ['user_id','funny','useful','cool']
    per_user = businessReviews[vote_columns].groupby('user_id',as_index=False)
    return per_user.sum()

def months_yelping(businessReviews, curr_date=None):
    '''
    Number of calendar months between each user's join date and a reference date.

    Input:
        businessReviews(dataframe) : needs 'user_id' and 'yelping_since'
                                     columns; 'yelping_since' values are
                                     strings formatted as YYYY-MM-DD
        curr_date(datetime) : reference date; defaults to 2017-10-17, the date
                              previously hard-coded here
    Output:
        dataframe : distinct ('user_id', 'yelping_since') pairs with
                    'yelping_since' replaced by the whole-month difference
                    (days of the month are ignored)
    '''
    if curr_date is None:
        curr_date = datetime(2017,10,17)
    user_dates = businessReviews[['user_id','yelping_since']].drop_duplicates()
    joined = [datetime.strptime(i, "%Y-%m-%d") for i in user_dates['yelping_since']]
    months = [(curr_date.year - d.year) * 12 + curr_date.month - d.month for d in joined]
    # assign() returns a new frame, so nothing is written into a derived slice.
    return user_dates.assign(yelping_since=months)

def get_elite_users(df):
    '''
    Pick out the rows whose 'elite' value is truthy.

    Input:
        df(dataframe) : needs an 'elite' column
    Output:
        (list, dataframe) : the index labels of the rows with a truthy 'elite'
                            value, and those rows selected with .loc
    '''
    elite_col = df['elite']
    # Walk labels and values side by side instead of one lookup per label.
    temp = [label for label, value in zip(elite_col.index.values, elite_col) if value]
    return temp, df.loc[temp]

def feature1(df, df1):
    '''
    Build the per-user feature table for the first feature model.

    Input:
        df(dataframe)  : review-level table passed to the per-category helpers
                         (numCategoryReviews, average_star_category_rating,
                         std_star_category_rating, funny_useful_cool)
        df1(dataframe) : review-level table that also carries the user columns
                         'review_count_y', 'yelping_since' and 'elite'
    Output:
        dataframe : one row per user_id with the renamed feature columns, the
                    original 'elite' value and a 0/1 'is_elite' label;
                    missing values are replaced by 0
    '''
    # df1 has one row per review, so reduce the user-level slices to one row per
    # user BEFORE merging; otherwise the final merge pairs every review of a
    # user with every other review of that user.
    tot_reviews = df1[['user_id','review_count_y']].drop_duplicates('user_id')
    tot_reviews = tot_reviews.rename(columns={'review_count_y': 'Total Reviews by User'})
    catRev = numCategoryReviews(df)
    catRev.columns = ['user_id','Num Category Reviews']
    averageCat = average_star_category_rating(df)
    averageCat.columns = ['user_id','Average Rating in Category']
    stdCat = std_star_category_rating(df)
    stdCat.columns = ['user_id','Std Dev of Ratings in Category']
    fuc = funny_useful_cool(df)
    monthsYelp = months_yelping(df1)
    monthsYelp.columns = ['user_id', 'Months Yelping']
    is_elite = df1[['user_id','elite']].drop_duplicates('user_id')

    features = tot_reviews
    for part in (catRev, averageCat, stdCat, fuc, monthsYelp, is_elite):
        features = features.merge(part, on='user_id')
    # Positional 0..n-1 index so the labels returned by get_elite_users line up
    # with the rows of this table.
    features = features.drop_duplicates('user_id').reset_index(drop=True)

    elite_ind, _ = get_elite_users(features)
    # Size the label column from the table built here, not from a notebook
    # global that only exists after this function has already run once.
    features['is_elite'] = 0
    features.loc[elite_ind, 'is_elite'] = 1
    return features.fillna(0)
    
    

In [None]:
%%time
# Build the per-user feature table: the restaurant-review frame feeds the
# per-category aggregates, the review+user frame feeds the user-level columns.
feature_set_1 = feature1(businessReviews, bus_rev_users)

In [None]:
# Preview the first five rows of the feature table.
feature_set_1.head(5)

## Classifier Setups (Sklearn Library)

In [277]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import tree

In [278]:
def run_model(clf, features, random_state=None):
    '''Runs the model specified by the clf (classifier), and features.
    Features is assumed to have an 'is_elite' column which gives the classifications.

    Input:
        clf : classifier object providing fit, predict, score and predict_proba
        features(dataframe) : feature table including the 'is_elite' label column
        random_state : optional seed forwarded to train_test_split so a split
                       can be reproduced; None (default) keeps it unseeded
    Output:
        model_perf(dict) : score on the held-out split, the predictions and
                           prediction probabilities, and the counts of tested
                           rows and of predicted / actual / training experts
    '''
    # Split the table that was passed in, not the notebook-level X.
    train, test = train_test_split(features, random_state=random_state)
    train_classifier = train['is_elite'].values
    test_classifier = test['is_elite'].values
    train = train.drop('is_elite',axis=1)
    test = test.drop('is_elite', axis=1)
    clf.fit(train, train_classifier)
    rf_pred = clf.predict(test)
    model_perf= {'Model_Score' : clf.score(test, test_classifier),
                'Predictions' : rf_pred,
                'Prediction Probabilities' : clf.predict_proba(test),
                'Total_Tested' : len(rf_pred),
                'Num_Experts_Predicted' : sum(rf_pred),
                'Num_Experts_Actual' : sum(test_classifier),
                'Num_Experts_Training' : sum(train_classifier)}
    return model_perf

def bootstrap_model(clf, features, n):
    '''Runs the model n times and returns the list of result dictionaries.

    Input:
        clf : classifier passed through to run_model
        features(dataframe) : feature table passed through to run_model
        n(int) : number of runs
    Output:
        models(list) : the n dictionaries returned by run_model, in run order
    '''
    # range(n), not range(1, n): the latter performs only n - 1 runs.
    return [run_model(clf, features) for _ in range(n)]

def boot_statistics(models):
    '''Gets statistics from the bootstrap list of dictionaries.

    Input:
        models(list) : dictionaries holding 'Num_Experts_Predicted' and
                       'Num_Experts_Actual' entries, e.g. as returned by
                       bootstrap_model
    Output:
        stats(list) : for each run, predicted experts divided by actual experts
    '''
    # Read the runs that were passed in, not the notebook-level RF_bootstrap.
    stats = [m['Num_Experts_Predicted']/m['Num_Experts_Actual'] for m in models]
    return stats

#### Drop unnecessary columns from the features table (call it X).

In [279]:
# Keep only the model inputs and the 'is_elite' label: the identifier and the
# raw 'elite' value are removed in one call.
X = feature_set_1.drop(['user_id','elite'],axis=1)

## Run a random forest classifier

In [280]:
# Random forest limited to depth 3. run_model returns a metrics dict; the
# trailing semicolon keeps the notebook from echoing it.
clf_RF = RandomForestClassifier(max_depth=3)
run_model(clf_RF,X);

In [281]:
# Repeat the random-forest evaluation and keep the list of result dicts.
RF_bootstrap = bootstrap_model(clf_RF, X, 10)

## Run a Gaussian naive Bayes classifier

In [282]:
# NOTE(review): this import sits here rather than with the other sklearn imports.
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings; the semicolon suppresses the returned dict.
clf_NB = GaussianNB()
run_model(clf_NB,X);

## Run a decision tree classifier

In [285]:
# Decision tree with default settings. There is no trailing semicolon, so the
# metrics dict returned by run_model is displayed as the cell output.
clf_DT = tree.DecisionTreeClassifier()
run_model(clf_DT, X)

{'Model_Score': 0.92866929019093425,
 'Num_Experts_Actual': 12590,
 'Num_Experts_Predicted': 9782,
 'Num_Experts_Training': 38089,
 'Prediction Probabilities': array([[ 1.        ,  0.        ],
        [ 1.        ,  0.        ],
        [ 0.99676898,  0.00323102],
        ..., 
        [ 0.        ,  1.        ],
        [ 0.98324022,  0.01675978],
        [ 0.9787234 ,  0.0212766 ]]),
 'Predictions': array([0, 0, 0, ..., 1, 0, 0]),
 'Total_Tested': 205830}