## Building a Classifier for the Elite Types

#### Import libraries

In [1]:
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#### Load files (might take some time)

In [2]:
review=[]
with open('../../hduser1/Yelp/review.json') as json_file:
    for line in json_file:
        review.append(json.loads(line))
review_df = pd.DataFrame(review)

users=[]
with open('../../hduser1/Yelp/user.json') as json_file:
    for line in json_file:
        users.append(json.loads(line))
user_df = pd.DataFrame(users)

tips=[]
with open('../../hduser1/Yelp/tip.json') as json_file:
    for line in json_file:
        tips.append(json.loads(line))
tip_df = pd.DataFrame(tips)

business=[]
with open('../../hduser1/Yelp/business.json') as json_file:
    for line in json_file:
        business.append(json.loads(line))
business_df = pd.DataFrame(business)

In [3]:
review_df.head(3)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,type,useful,user_id
0,2aFiy99vNLklCx3T_tGS9A,0,2011-10-10,0,NxL8SIC5yqOdnlXCg18IBg,5,If you enjoy service by someone who is as comp...,review,0,KpkOkG6RIf4Ra25Lhhxf1A
1,2aFiy99vNLklCx3T_tGS9A,0,2010-12-29,0,pXbbIgOXvLuTi_SPs1hQEQ,5,After being on the phone with Verizon Wireless...,review,1,bQ7fQq1otn9hKX-gXRsrgA
2,2aFiy99vNLklCx3T_tGS9A,0,2011-04-29,0,wslW2Lu4NYylb1jEapAGsw,5,Great service! Corey is very service oriented....,review,0,r1NUhdNmL6yU9Bn-Yx6FTw


#### Look up user from user.json and define the elite status

In [4]:
# dictionary that maps user_id to the index in array
user_dict = {}
count = 0
for u in users:
    user_dict[u['user_id']] = count
    count += 1

In [7]:
# helper function for apply
def is_elite(x):
    # year = pd.to_datetime(x['date']).date().year
    year = x['date'].split('-')[0]
    user_idx = user_dict[x['user_id']]
    return year in users[user_idx]['elite']

# helper function for apply
def is_future_elite(x):
    # year = pd.to_datetime(x['date']).date().year
    year = x['date'].split('-')[0]
    user_idx = user_dict[x['user_id']]
    return str(int(year)+1) in users[user_idx]['elite']

Avoid using pd.timestamp!

It takes forever. (The cell below still takes long time)

Let me know if someone knows better way to code this

In [8]:
tip_df['by_elite'] = tip_df.apply(is_elite, axis = 1)
tip_df['by_future_elite'] = tip_df.apply(is_future_elite, axis = 1)
review_df['by_elite'] = review_df.apply(is_elite, axis = 1)
review_df['by_future_elite'] = review_df.apply(is_future_elite, axis = 1)

In [9]:
# user0 : non-elite
# user1 : real-elite
# user2 : potential-elite
# user3 : slack-elite

review_type0 = review_df.ix[~((review_df['by_elite'])) & ~((review_df['by_future_elite']))]['business_id'] # non-elites
review_type1 = review_df.ix[ (review_df['by_elite']) &  (review_df['by_future_elite'])]['business_id'] # real elites
review_type2 = review_df.ix[~(review_df['by_elite']) &  (review_df['by_future_elite'])]['business_id'] # potential
review_type3 = review_df.ix[ (review_df['by_elite']) & ~(review_df['by_future_elite'])]['business_id'] # slack

type0_r = business_df.ix[business_df.business_id.isin(review_type0)]
type1_r = business_df.ix[business_df.business_id.isin(review_type1)]
type2_r = business_df.ix[business_df.business_id.isin(review_type2)]
type3_r = business_df.ix[business_df.business_id.isin(review_type3)]

In [10]:
review_df['category'] = 'Non-elite'

In [11]:
review_df.loc[review_df.ix[((review_df['by_elite'])) & ((review_df['by_future_elite']))].index,['category'] ] = 'Real-elite'
review_df.loc[review_df.ix[(~(review_df['by_elite'])) & ((review_df['by_future_elite']))].index,['category'] ] = 'Potential-elite'
review_df.loc[review_df.ix[((review_df['by_elite'])) & (~(review_df['by_future_elite']))].index,['category'] ] = 'Slack-elite'

In [12]:
non_elite_review_id = review_df.ix[review_df['category'] == 'Non-elite']['review_id']
real_elite_review_id = review_df.ix[review_df['category'] == 'Real-elite']['review_id']
slack_elite_review_id = review_df.ix[review_df['category'] == 'Slack-elite']['review_id']
potential_elite_review_id = review_df.ix[review_df['category'] == 'Potential-elite']['review_id']

In [13]:
print(len(non_elite_review_id))
print(len(real_elite_review_id))
print(len(slack_elite_review_id))
print(len(potential_elite_review_id))

3232165
725732
79660
115593


In [18]:
# creating pickle for the later use
import pickle
pickle.dump(non_elite_review_id, open('non_elite_review_id.p','wb'))
pickle.dump(real_elite_review_id, open('real_elite_review_id.p','wb'))
pickle.dump(slack_elite_review_id, open('slack_elite_review_id.p','wb'))
pickle.dump(potential_elite_review_id, open('potential_elite_review_id.p','wb'))

In [26]:
# example loading them into your local environment
test= pickle.load(open('slack_elite_review_id.p','rb'))

In [23]:
# writing csv for the future use
review_df.to_csv('review_w_category.csv', encoding = 'utf-8')

In [24]:
# eample loading csv file as a pd.DataFrame
review_test = pd.read_csv('review_w_category.csv', encoding = 'utf8')

  interactivity=interactivity, compiler=compiler, result=result)


#### Tried naive classification (won't work)

In [39]:
from sklearn.model_selection import train_test_split
train_id, test_id, train_label, test_label = train_test_split(review_df['review_id'], review_df['category'], test_size=0.2, random_state=42)

In [47]:
train = review_df.ix[review_df['review_id'].isin(train_id)]
test = review_df.ix[review_df['review_id'].isin(test_id)]

In [54]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(pd.concat([train['cool'], train['funny'], train['stars'], train['useful']], axis=1), train_label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [56]:
prediction = model.predict(pd.concat([test['cool'], test['funny'], test['stars'], test['useful']], axis=1))

In [57]:
model.score(pd.concat([train['cool'], train['funny'], train['stars'], train['useful']], axis=1), train_label)

0.77824753500355148

In [60]:
from sklearn.metrics import accuracy_score
accuracy_score(test_label, prediction)

0.77823098130334811

In [61]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_label, prediction)

array([[646422,      0,      0,      0],
       [ 23129,      0,      0,      0],
       [145019,      0,      0,      0],
       [ 16060,      0,      0,      0]])

#### To-Do
* Over/under sampling
* Adding more features