# Topical Expert Model

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import operator
import os
print(os.getcwd())
print(os.listdir('/Users/Brandon/Yelp Dataset/dataset'))
import json
import glob
from datetime import datetime
from pandas import HDFStore,DataFrame
import operator

/Users/Brandon/Yelp Dataset/Yelp-FA17/local-elites/brandon
['.DS_Store', 'business.json', 'checkin.json', 'photos.json', 'review.json', 'review5000.json', 'tip.json', 'user.json', 'user5000.json']


In [2]:
def load_json_to_df(datapass):
    '''
    Load a JSON Lines file (one JSON object per line) into a pandas dataframe.
    
    Input:
        datapass(str) : path to the JSON Lines file
    Output:
        df(dataframe) : pandas dataframe built from the parsed lines,
                        one row per non-blank line
    '''
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's default text encoding.
    with open(datapass, encoding='utf-8') as data_file:
        # Blank lines (e.g. a trailing empty line) are skipped; json.loads
        # would otherwise raise on them.
        records = [json.loads(line) for line in data_file if line.strip()]
    return pd.DataFrame(records)

LOAD THE DATA

In [3]:
%%time
# Absolute, machine-specific path to the review file; edit before running elsewhere.
import_file = "/Users/Brandon/Yelp Dataset/dataset/review.json"
# Parse the file line by line into the `review` dataframe.
review = load_json_to_df(import_file)

CPU times: user 1min 3s, sys: 23.3 s, total: 1min 26s
Wall time: 1min 38s


In [4]:
%%time
# Absolute, machine-specific path to the business file; edit before running elsewhere.
import_file = "/Users/Brandon/Yelp Dataset/dataset/business.json"
# Parse the file line by line into the `business` dataframe.
business = load_json_to_df(import_file)

CPU times: user 5.25 s, sys: 453 ms, total: 5.7 s
Wall time: 6.06 s


In [5]:
%%time
# Absolute, machine-specific path to the user file; edit before running elsewhere.
import_file = "/Users/Brandon/Yelp Dataset/dataset/user.json"
# Parse the file line by line into the `user` dataframe.
user = load_json_to_df(import_file)

CPU times: user 53.3 s, sys: 17.5 s, total: 1min 10s
Wall time: 1min 21s


Sort by category

In [7]:
def categoryFind(df, cat):
    '''
    Flag the rows of df whose 'categories' entry contains cat.

    Input:
        df(dataframe) : frame with a 'categories' column
        cat           : value tested with `in` against each 'categories' entry
    Output:
        mask(list)    : one boolean per row, in row order
    '''
    mask = []
    for row_categories in df['categories']:
        mask.append(cat in row_categories)
    return mask

def category_counts(df):
    '''
    Count how often each category occurs across all rows of df.

    Input:
        df(dataframe) : frame whose 'categories' column holds an iterable
                        of category values per row
    Output:
        (int, dict)   : number of distinct categories, and a dict mapping
                        each category to its total number of occurrences
    '''
    # Tally in a single pass. The previous version called list.count() once
    # per distinct category, rescanning the full flattened list every time.
    counts = {}
    for row_categories in df['categories']:
        for cat in row_categories:
            counts[cat] = counts.get(cat, 0) + 1
    return len(counts), counts

def top_categories(counts, n):
    '''
    Return the n keys of counts with the largest values.

    Input:
        counts(dict) : mapping from key to a sortable value
        n(int)       : slice bound applied to the ranked keys
    Output:
        list         : keys ordered by descending value, cut to [:n]
    '''
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:n]

CPU times: user 54 s, sys: 691 ms, total: 54.7 s
Wall time: 57.3 s


In [8]:
# Tally category occurrences over every row of `business`.
num_cats, cat_counts = category_counts(business)
# Keep the 100 most frequent category names.
top_cats = top_categories(cat_counts, 100)
# Bare last expression so the notebook displays the list.
top_cats

['Restaurants',
 'Shopping',
 'Food',
 'Beauty & Spas',
 'Home Services',
 'Health & Medical',
 'Nightlife',
 'Bars',
 'Automotive',
 'Local Services',
 'Event Planning & Services',
 'Active Life',
 'Fashion',
 'Sandwiches',
 'Fast Food',
 'American (Traditional)',
 'Pizza',
 'Coffee & Tea',
 'Hair Salons',
 'Hotels & Travel',
 'Arts & Entertainment',
 'Home & Garden',
 'Auto Repair',
 'Italian',
 'Burgers',
 'Doctors',
 'Breakfast & Brunch',
 'Mexican',
 'Nail Salons',
 'Professional Services',
 'American (New)',
 'Chinese',
 'Real Estate',
 'Specialty Food',
 'Fitness & Instruction',
 'Pets',
 'Grocery',
 'Bakeries',
 'Cafes',
 'Hair Removal',
 'Dentists',
 'Hotels',
 'Desserts',
 'Skin Care',
 "Women's Clothing",
 'Education',
 'Japanese',
 'Ice Cream & Frozen Yogurt',
 'Pet Services',
 'Day Spas',
 'Massage',
 'General Dentistry',
 'Financial Services',
 'Pubs',
 'Chicken Wings',
 'Seafood',
 'Contractors',
 'Salad',
 'Gyms',
 'Sushi Bars',
 'Sports Bars',
 'Apartments',
 'Caterers

In [9]:
%%time
# NOTE(review): `restaurants` is not defined in any cell shown in this notebook
# (there is no In [6]) — presumably a category-filtered subset of `business`;
# confirm it is created before this cell when running on a fresh kernel.
#
# Attach business attributes to every review. The join key is the
# 'business_id' column on both sides, so `right_index=True` has been removed:
# current pandas raises MergeError when `on` is combined with
# `left_index`/`right_index`.
businessReviews = pd.merge(review, restaurants, on='business_id', sort=False)[
    ['business_id', 'cool', 'funny', 'review_id', 'stars_x', 'text', 'useful',
     'user_id', 'categories', 'review_count', 'stars_y']
]
# Attach the reviewing user's attributes, joining on the 'user_id' column.
# Columns present on both sides receive the default _x (left) / _y (right) suffixes.
bus_rev_users = pd.merge(businessReviews, user, on='user_id', sort=False)[
    ['business_id', 'cool_x', 'funny_x', 'review_id', 'stars_x', 'text',
     'useful_x', 'user_id', 'categories', 'stars_y', 'average_stars', 'cool_y',
     'elite', 'fans', 'friends', 'name', 'review_count_y', 'yelping_since']
]

CPU times: user 38 s, sys: 28.2 s, total: 1min 6s
Wall time: 1min 37s


In [10]:
# Row counts of the two merged frames.
print(len(businessReviews))
print(len(bus_rev_users))
# Column labels of each frame (list() over a dataframe yields its column labels).
print(list(businessReviews))
print(list(bus_rev_users))

2927731
2927731
['business_id', 'cool', 'funny', 'review_id', 'stars_x', 'text', 'useful', 'user_id', 'categories', 'review_count', 'stars_y']
['business_id', 'cool_x', 'funny_x', 'review_id', 'stars_x', 'text', 'useful_x', 'user_id', 'categories', 'stars_y', 'average_stars', 'cool_y', 'elite', 'fans', 'friends', 'name', 'review_count_y', 'yelping_since']


## Feature Extraction From Yelp Data

In [11]:
def numCategoryReviews(businessReviews):
    '''
    Count review rows per user.

    Input:
        businessReviews(dataframe) : frame with 'business_id' and 'user_id' columns
    Output:
        dataframe : one row per 'user_id'; the 'business_id' column holds the
                    number of non-null business_id values for that user
    '''
    user_business = businessReviews[['business_id', 'user_id']]
    per_user = user_business.groupby('user_id', as_index=False)
    return per_user.count()

def average_star_category_rating(businessReviews):
    '''
    Average the 'stars_x' rating per user.

    Input:
        businessReviews(dataframe) : frame with 'user_id' and 'stars_x' columns
    Output:
        dataframe : one row per 'user_id'; 'stars_x' holds that user's mean rating
    '''
    user_stars = businessReviews[['user_id', 'stars_x']]
    per_user = user_stars.groupby('user_id', as_index=False)
    return per_user.mean()

def std_star_category_rating(businessReviews):
    '''
    Sample standard deviation of the 'stars_x' rating per user.

    Input:
        businessReviews(dataframe) : frame with 'user_id' and 'stars_x' columns
    Output:
        dataframe : one row per 'user_id'; 'stars_x' holds the standard
                    deviation (ddof=1) of that user's ratings, which is NaN
                    for a user with a single rating
    '''
    # Use the groupby's own std() rather than agg(np.var) followed by a manual
    # square root: the old form only produced the ddof=1 result because pandas
    # substitutes its own variance for np.var inside agg.
    user_stars = businessReviews[['user_id', 'stars_x']]
    return user_stars.groupby('user_id', as_index=False).std()

def funny_useful_cool(businessReviews):
    '''
    Total the 'funny', 'useful' and 'cool' columns per user.

    Input:
        businessReviews(dataframe) : frame with 'user_id', 'funny', 'useful'
                                     and 'cool' columns
    Output:
        dataframe : one row per 'user_id' with the per-user sum of each of
                    the three columns
    '''
    vote_columns = ['user_id', 'funny', 'useful', 'cool']
    grouped_votes = businessReviews[vote_columns].groupby('user_id', as_index=False)
    return grouped_votes.sum()

def months_yelping(businessReviews, curr_date=None):
    '''
    Convert each user's 'yelping_since' date into whole calendar months
    elapsed up to a reference date.

    Input:
        businessReviews(dataframe) : frame with 'user_id' and 'yelping_since'
                                     columns; dates are "%Y-%m-%d" strings
        curr_date(datetime)        : reference date; defaults to
                                     datetime(2017, 10, 17), the value that
                                     was previously hard-coded
    Output:
        dataframe : distinct ('user_id', 'yelping_since') rows, with
                    'yelping_since' replaced by the month difference
                    (year difference * 12 + month difference; days ignored)
    '''
    if curr_date is None:
        curr_date = datetime(2017, 10, 17)
    # Work on an explicit copy so the column assignment below cannot touch
    # (or warn about) the caller's frame.
    user_dates = businessReviews[['user_id', 'yelping_since']].drop_duplicates().copy()
    joined = [datetime.strptime(d, "%Y-%m-%d") for d in user_dates['yelping_since']]
    user_dates['yelping_since'] = [
        (curr_date.year - j.year) * 12 + curr_date.month - j.month for j in joined
    ]
    return user_dates

def get_elite_users(df):
    '''
    Select the rows of df whose 'elite' value is truthy.

    Input:
        df(dataframe) : frame with an 'elite' column
    Output:
        (list, dataframe) : index labels of the selected rows, and df.loc
                            restricted to those labels
    '''
    elite_col = df['elite']
    # Walk labels and values together in one pass instead of performing a
    # separate df['elite'][label] lookup for every row.
    labels = [label
              for label, value in zip(elite_col.index.values, elite_col)
              if value]
    return labels, df.loc[labels]

## Define the first feature model to be analyzed
Features: Total Reviews by User, Number of Category Reviews, Average Rating in Category, Std Dev of Ratings in Category, Funny, Useful, and Cool Votes, Number of Months since joining Yelp.

In [16]:
def feature1(df, df1):
    '''
    Build the first per-user feature table and label elite users as experts.

    Input:
        df(dataframe)  : review rows, passed to numCategoryReviews,
                         average_star_category_rating,
                         std_star_category_rating and funny_useful_cool
        df1(dataframe) : rows carrying user attributes; must provide
                         'user_id', 'review_count_y', 'elite' and
                         'yelping_since' (may hold several rows per user)
    Output:
        features(dataframe) : one row per user_id with the columns
            'user_id', 'Total Reviews by User', 'Num Category Reviews',
            'Average Rating in Category', 'Std Dev of Ratings in Category',
            'funny', 'useful', 'cool', 'Months Yelping', 'elite', 'is_expert';
            NaN values are replaced by 0
    '''
    # Collapse the user-level columns to one row per user BEFORE merging.
    # Merging two frames that each hold k rows for a user yields k*k rows for
    # that user, which made the intermediate result explode.
    tot_reviews = (df1[['user_id', 'review_count_y']]
                   .drop_duplicates('user_id')
                   .rename(columns={'review_count_y': 'Total Reviews by User'}))
    is_elite = df1[['user_id', 'elite']].drop_duplicates('user_id')

    catRev = numCategoryReviews(df)
    catRev.columns = ['user_id', 'Num Category Reviews']
    averageCat = average_star_category_rating(df)
    averageCat.columns = ['user_id', 'Average Rating in Category']
    stdCat = std_star_category_rating(df)
    stdCat.columns = ['user_id', 'Std Dev of Ratings in Category']
    fuc = funny_useful_cool(df)
    monthsYelp = months_yelping(df1)
    monthsYelp.columns = ['user_id', 'Months Yelping']

    # Inner-join every per-user table on user_id, in the original order.
    features = tot_reviews
    for part in (catRev, averageCat, stdCat, fuc, monthsYelp, is_elite):
        features = features.merge(part, on='user_id')
    # Still guard against any remaining duplicate users, then renumber rows 0..n-1
    # so index labels can be used as positions below.
    features = features.drop_duplicates('user_id').reset_index(drop=True)

    # is_expert is 1 for rows whose 'elite' value is truthy, else 0.
    elite_ind, _ = get_elite_users(features)
    features['is_expert'] = features.index.isin(elite_ind).astype(int)
    return features.fillna(0)

In [13]:
# Preview the per-user review counts through the shared helper instead of
# repeating its body; head(10) keeps the cell from rendering the whole frame.
numCategoryReviews(businessReviews).head(10)

Unnamed: 0,user_id,business_id
0,---1lKK3aKOuomHnwAkAow,53
1,---PLwSf5gKdIoVnyRHgBA,1
2,---cu1hq55BP9DWVXXKHZg,2
3,---udAKDsn0yQXmzbWQNSw,1
4,--0RtXvcOIE4XbErYca6Rw,1
5,--0sXNBv6IizZXuV-nl0Aw,1
6,--1av6NdbEbMiuBr7Aup9A,1
7,--1mPJZdSY9KluaBYAGboQ,1
8,--26jc8nCJBy4-7r3ZtmiQ,1
9,--2HUmLkcNHZp0xw6AMBPg,22


In [None]:
%%time
# Build the per-user feature table: review-level stats come from businessReviews,
# user-level attributes (review count, elite, yelping_since) from bus_rev_users.
feature_set_1 = feature1(businessReviews, bus_rev_users)

In [None]:
# Number of rows in the feature table (feature1 keeps one row per user_id).
print(len(feature_set_1))
# Display the first five rows.
feature_set_1.head(5)

## Classifier Setup (scikit-learn)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import tree

In [None]:
def run_model(clf, features):
    '''
    Fit clf on a random training split of features and evaluate it on the
    held-out split.

    Input:
        clf                 : classifier providing fit, predict,
                              predict_proba and score
        features(dataframe) : feature table; its 'is_expert' column gives
                              the class labels
    Output:
        model_perf(dict) : 'Model_Score', 'Predictions',
                           'Prediction Probabilities', 'Total_Tested',
                           'Num_Experts_Predicted', 'Num_Experts_Actual',
                           'Num_Experts_Training'
    '''
    # Split the frame that was passed in. The previous version split the
    # notebook-global X and silently ignored the `features` argument.
    train, test = train_test_split(features)
    # Separate the labels from the predictors in each split.
    train_classifier = train['is_expert'].values
    test_classifier = test['is_expert'].values
    train = train.drop('is_expert', axis=1)
    test = test.drop('is_expert', axis=1)
    clf.fit(train, train_classifier)
    rf_pred = clf.predict(test)
    model_perf = {'Model_Score': clf.score(test, test_classifier),
                  'Predictions': rf_pred,
                  'Prediction Probabilities': clf.predict_proba(test),
                  'Total_Tested': len(rf_pred),
                  'Num_Experts_Predicted': sum(rf_pred),
                  'Num_Experts_Actual': sum(test_classifier),
                  'Num_Experts_Training': sum(train_classifier)}
    return model_perf

def bootstrap_model(clf, features, n):
    '''
    Run run_model(clf, features) n times and collect the results.

    Input:
        clf                 : classifier passed through to run_model
        features(dataframe) : feature table passed through to run_model
        n(int)              : number of runs
    Output:
        models(list) : the n performance dicts returned by run_model, in run order
    '''
    # range(n) gives exactly n runs; the previous range(1, n) ran only n - 1 times.
    return [run_model(clf, features) for _ in range(n)]

def boot_statistics(models):
    '''
    Ratio of predicted to actual experts for each bootstrap run.

    Input:
        models(list) : performance dicts, each with 'Num_Experts_Predicted'
                       and 'Num_Experts_Actual' entries
    Output:
        stats(list)  : Num_Experts_Predicted / Num_Experts_Actual per dict,
                       in input order
    '''
    # Iterate the `models` argument; the previous version read the
    # notebook-global RF_bootstrap and ignored what the caller passed in.
    return [m['Num_Experts_Predicted'] / m['Num_Experts_Actual'] for m in models]

#### Drop unnecessary columns from the features table (call it X).

In [None]:
# Remove the identifier column and the raw 'elite' column in a single drop call.
X = feature_set_1.drop(['user_id', 'elite'], axis=1)

## Run a random forest classifier

In [None]:
# Random forest whose trees are limited to depth 3.
clf_RF = RandomForestClassifier(max_depth=3)
# NOTE(review): the trailing ';' suppresses the returned performance dict and it
# is not assigned, so this run's metrics are discarded — confirm that is intended.
run_model(clf_RF,X);

In [None]:
# Repeat the random-forest run via bootstrap_model and keep the list of per-run performance dicts.
RF_bootstrap = bootstrap_model(clf_RF, X, 10)

## Run a Gaussian naive Bayes classifier

In [None]:
# NOTE(review): this import lives outside the notebook's sklearn import cell; consider moving it there.
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings.
clf_NB = GaussianNB()
# The trailing ';' suppresses the returned performance dict, so nothing is displayed or stored.
run_model(clf_NB,X);

## Run a decision tree classifier

In [None]:
# Decision tree with default settings.
clf_DT = tree.DecisionTreeClassifier()
# No trailing ';' here, so the returned performance dict is displayed as the cell output.
run_model(clf_DT, X)