## Automatically Categorizing Yelp Businesses

Build a baseline based on the article from Yelp Software Team, [Automatically Categorizing Yelp Businesses](https://engineeringblog.yelp.com/2015/09/automatically-categorizing-yelp-businesses.html)  
Not using text information, guess the multi-label assignment of business. Here, all sample set contains 'Chinese' tag and try to see if the model can detect more subtle labeling.

In [113]:
from utils import * 
import pickle
import numpy as np
import random
import matplotlib.pyplot as plt
from co_occurrence_net.category_map import CategoryMap
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [2]:
# load data
chinese_business = pd.read_csv('chinese_business_clean.csv', index_col = False)
chinese_reviews = pd.read_csv('chinese_review_clean.csv', index_col = False)
chinese_business['categories'] =  [eval(i) for i in chinese_business['categories']]

In [3]:
G = CategoryMap()
G.build_graph(chinese_business['categories'])

In [4]:
G.get_subcategories('Chinese')[:10]

[('Asian Fusion', 449),
 ('Food', 398),
 ('Fast Food', 260),
 ('Thai', 237),
 ('Dim Sum', 230),
 ('Buffets', 211),
 ('Japanese', 179),
 ('Seafood', 163),
 ('Sushi Bars', 158),
 ('Specialty Food', 134)]

## A Series of Binary Classifiers: One for Each Category

is a business of a given category 

We extract terms from names and reviews, using standard lexical analysis techniques of tokenization, normalization (e.g. lowercasing), and stop word filtering. If the business has been categorized as part of a chain (which we’ll describe in an upcoming blog post!) we’ll include that chain’s URL as a feature, and if the business has NAICS codes from one of our data partners, we’ll include those as well.

In [5]:
def data_split(business_df, review_df, topic):
    '''
    Split the original data into 2 classes, ones that includes topic label and don't
    '''
    print ('topic: {}'.format(topic))
    includes = set()
    not_includes = set()
    for i, topics in enumerate(business_df['categories']):
        if topic in topics:
            includes.add(business_df.iloc[i]['business_id'])
        else:
            not_includes.add(business_df.iloc[i]['business_id'])
    review_included = review_df.loc[review_df['business_id'].isin(includes)]
    review_not_included = review_df.loc[review_df['business_id'].isin(not_includes)]
    print ('include topic:     {} business, {} reviews'.format(len(includes), len(review_included)))
    print ('not include topic: {} business, {} reviews'.format(len(not_includes), len(review_not_included)))
    
    return review_included, review_not_included

In [6]:
t, f = data_split(chinese_business, chinese_reviews, G.get_subcategories('Chinese')[4][0])

topic: Dim Sum
include topic:     230 business, 21960 reviews
not include topic: 3545 business, 156189 reviews


Yelp used following features:
- Tokenized Name
- Tokenized Review
- NAICS(we do not have an access)
- country (we disregard)
- Last Term in Name 

In [52]:
def genereate_feature(review_counter, name_counter, review_df, business_df, business_id):
    '''
    '''    
    # filter the restaurant name
    name = business_df.loc[business_df['business_id'] == business_id]['name']

    # filter the reviews for the specified business
    review = review_df.loc[review_df['business_id'] == business_id]['text']
    if (len(name_counter.build_analyzer()(name.values[0])) != 0):
        # extract the last word of the restaurant
        last_name = pd.Series(name_counter.build_analyzer()(name.values[0])[-1])

        # feature length 
        name_length = len(name_counter.get_feature_names()) 
        review_length = len(review_counter.get_feature_names())

        # NAME + LAST NAME + REVIEW
        name_feature = np.zeros(name_length)
        last_name_feature = np.zeros(name_length)
        review_feature = np.zeros(review_length)

        for r, d in zip(name_counter.transform(name).indices, name_counter.transform(name).data):
            name_feature[r] = d

        for r, d in zip(name_counter.transform(last_name).indices, name_counter.transform(last_name).data):
            last_name_feature[r] = d

        for r, d in zip(review_counter.transform(review).indices, review_counter.transform(review).data):
            review_feature[r] = d

        feature = np.array(np.concatenate((name_feature, last_name_feature, review_feature),axis = 0))
        return feature
    else:
        return None

In [53]:
def create_data_set(review_counter, name_counter, review_df, business_df, topic):
    '''
    '''
    # select business with topic tag
    t, f = data_split(business_df, review_df, topic)
    t_in =  (set(t['business_id']))
    t_not_in =  (set(f['business_id']))
    
    # output dimension
    name_length = len(name_counter.get_feature_names()) 
    review_length = len(review_counter.get_feature_names())
    feature_length = 2*name_length + review_length
    
    X_in = np.array([])
    Y_in = np.array([])
    
    for b in t_in:
        feature = genereate_feature(review_counter, name_counter, review_df, business_df, b)
        if feature:
            X_in = np.append(X_in, feature)
            Y_in = np.append(Y_in, 1)
    print ('positive set done')
    
    X_out = np.array([])
    Y_out = np.array([])
    
#     for b in t_not_in:
#         feature = genereate_feature(review_counter, name_counter, review_df, business_df, b)
#         X_out = np.append(X_out, feature)
#         Y_out = np.append(Y_out, 0)
#     print ('negative set done')
    
    
    return X_in.reshape(len(t_in), feature_length), Y_in.reshape(-1,1), t_not_in

In [9]:
# build CountVectorizer for review and name
review_counter = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
name_counter = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

# fit the CountVectorizer
reviews = review_counter.fit_transform(chinese_reviews['text'])
names = name_counter.fit_transform(chinese_business['name'])

In [10]:
genereate_feature(review_counter, name_counter, chinese_reviews, chinese_business, 'OygJyqypKFZJIZ6r9dML7w')

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [11]:
X, Y, not_list = create_data_set(review_counter, name_counter, chinese_reviews, chinese_business,'Dim Sum')

topic: Dim Sum
include topic:     230 business, 21960 reviews
not include topic: 3545 business, 156189 reviews
positive set done


Again, some business is breaking my code ...

In [12]:
print ('investigation begins')

bugs = []
for b in not_list:
    try:
        feature = genereate_feature(review_counter, name_counter, chinese_reviews, chinese_business, b)
#         X_in = np.append(X_in, feature)
#         Y_in = np.append(Y_in, 1)
    except:
        print (b)
        bugs.append(b)
        
print ('investigation done')

investigation begins
dAtT3iwh3Os5lkA7TySvNg
investigation done


In [13]:
chinese_business.loc[chinese_business['business_id'] != 'dAtT3iwh3Os5lkA7TySvNg']

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
64,165 E Beaver Creek Road,"{'RestaurantsTableService': True, 'GoodForMeal...",dAtT3iwh3Os5lkA7TySvNg,"[Seafood, Chinese, Restaurants, Barbeque]",Richmond Hill,"{'Monday': '11:00-23:00', 'Tuesday': '11:00-23...",1,43.847147,-79.378954,Top 1,,L4B 3P4,19,3.0,ON


Both 'Top' and '1' appear only once in the dataset. This is actually a special snowflake. But this business has to go.

In [39]:
chinese_business = chinese_business.loc[chinese_business['business_id'] != 'dAtT3iwh3Os5lkA7TySvNg']
chinese_business.to_csv('chinese_business_clean.csv', index = False)

Bootstrap from not list

In [46]:
boots = random.sample((not_list), int(len(not_list)*0.2))

In [51]:
# output dimension
name_length = len(name_counter.get_feature_names()) 
review_length = len(review_counter.get_feature_names())
feature_length = 2*name_length + review_length

X_out = np.array([])
Y_out = np.array([])
for b in boots:
    feature = genereate_feature(review_counter, name_counter, chinese_reviews, chinese_business, b)
    X_out = np.append(X_out, feature)
    Y_out = np.append(Y_out, 0)
X_out = X_out.reshape(len(boots), feature_length)
Y_out = Y_out.reshape(-1,1)
print ('negative set done')

negative set done


Original article builds Logistic Regression for each categories

In [57]:
lr = LogisticRegression(class_weight = 'balanced')

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
X_train_o, X_test_o, y_train_o, y_test_o = train_test_split(X_out, Y_out, test_size=0.2)

In [92]:
X_train = np.concatenate((X_train, X_train_o))
X_test = np.concatenate((X_test, X_test_o))

Y_train = np.concatenate((y_train, y_train_o))
Y_test = np.concatenate((y_test, y_test_o))

In [115]:
lr = LogisticRegression()
lr.fit(X_train, Y_train.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [116]:
confusion_matrix(Y_test.ravel(), lr.predict(X_test))

array([[136,   6],
       [ 14,  32]])

In [117]:
f1_score(Y_test.ravel(), lr.predict(X_test))

0.76190476190476197

In [121]:
np.argsort(lr.coef_)

array([[16672, 18463, 43749, ..., 13605, 38795, 12544]])

In [125]:
lr.coef_[0,16672], lr.coef_[0,12544]

(-0.28791993010302286, 0.72338751229731657)

In [223]:
def get_best_feature(coef, name_counter, review_counter, n):
    '''
    coef has a sequence of NAME, LAST NAME, REVIEW VOCAB 
    extract the top n features from coef
    '''
    name_v = name_counter.get_feature_names()
    review_v = review_counter.get_feature_names()
    
    name = coef[0][:len(name_v)]
    last_name = coef[0][len(name_v):2*len(name_v)]
    review = coef[0][2*len(name_v):]
    
    print ('name coefficient negative')
    for u in np.argsort(name)[:n]:
        print(name_v[u], name[u])
    print ('\n')
    print ('name coefficient positive')
    for u in np.argsort(name)[::-1][:n]:
        print(name_v[u], name[u])
    print ('\n')
    print ('last name coefficient negative')
    for u in np.argsort(last_name)[:n]:
        print(name_v[u], last_name[u])
    print ('\n')
    print ('last name coefficient positive')
    for u in np.argsort(last_name)[::-1][:n]:
        print(name_v[u], last_name[u])
    print ('\n')
    print ('review coefficient negative')
    for u in np.argsort(review)[:n]:
        print(review_v[u], review[u])
    print ('\n')
    print ('review coefficient positive')
    for u in np.argsort(review)[::-1][:n]:
        print(review_v[u], review[u])

In [224]:
get_best_feature(lr.coef_, name_counter, review_counter, 5)

name coefficient negative
china -0.157094805763
buffet -0.101678068884
garden -0.0734928673052
chinese -0.072400178135
food -0.051678281704


name coefficient positive
cuisine 0.0870469949447
shanghai 0.0861585781651
sum 0.0824983997347
dim 0.0824983997347
dumpling 0.0792403479249


last name coefficient negative
buffet -0.069676202635
garden -0.0557880695568
restaurant -0.0489154609091
food -0.0487191831454
dumplings -0.037461312508


last name coefficient positive
sum 0.0909825487412
cuisine 0.0798585846448
bistro 0.0657587489797
360 0.0649416671561
ring 0.0422551026267


review coefficient negative
food -0.287919930103
good -0.277915695372
went -0.264869671523
pork -0.264361788662
like -0.261832407678


review coefficient positive
dim 0.723387512297
sum 0.63375841905
dumpling 0.315769062247
dimsum 0.302306118397
dumplings 0.290983158503
