# Machine Learning For Local Restaurant Price Assessment And Prediction

## 1. Fetch original data from json

In this section, we generate dataframe from the data we collected using Yelp API. Explanation of each features can be found at [https://www.yelp.com/developers/documentation/v3/business_search].

In [12]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
import pandas as pd
import json
import numpy as np

In [1]:
# Read data
with open('./Dataset/restaurants_VA.json', encoding = 'utf8') as inFile:
    data = json.load(inFile)
    
data = pd.DataFrame.from_dict(data)
display(data.head(5))

print("Total {0} samples".format(len(data)))

Unnamed: 0,alias,categories,coordinates,display_phone,distance,id,image_url,is_closed,location,name,phone,price,rating,review_count,transactions,url
0,the-dutch-treat-rose-hill,"[{'alias': 'delis', 'title': 'Delis'}, {'alias...","{'latitude': 36.6903325, 'longitude': -83.3104...",(276) 445-4024,19538.084959,rd8oYQOtyc4LxjPQp8Muvw,,False,"{'address1': '21332 Wilderness Rd', 'address2'...",The Dutch Treat,12764454024,,5.0,2,[],https://www.yelp.com/biz/the-dutch-treat-rose-...
1,a-better-burger-jonesville,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...","{'latitude': 36.689639, 'longitude': -83.108766}",(276) 346-6768,34871.790634,bwCj2AcoOroZfCTxb6rCcg,https://s3-media2.fl.yelpcdn.com/bphoto/3KS3Xs...,False,"{'address1': '33739 Main St', 'address2': 'Ste...",A Better Burger,12763466768,$$,3.5,6,[],https://www.yelp.com/biz/a-better-burger-jones...
2,el-castillo-jonesville,"[{'alias': 'mexican', 'title': 'Mexican'}]","{'latitude': 36.7263373464484, 'longitude': -8...",(276) 346-4000,37317.843873,S9S9kFJSkmfpbjFForCWLQ,https://s3-media1.fl.yelpcdn.com/bphoto/NGC_GJ...,False,"{'address1': '236 Trade Center Ln', 'address2'...",El Castillo,12763464000,$,4.0,2,[],https://www.yelp.com/biz/el-castillo-jonesvill...
3,el-centenario-pennington-gap,"[{'alias': 'mexican', 'title': 'Mexican'}]","{'latitude': 36.7602500915527, 'longitude': -8...",(276) 546-0044,36034.813254,XFksdPFZhPHk458C0pl0Cg,https://s3-media1.fl.yelpcdn.com/bphoto/G5XFTv...,False,"{'address1': '930 E Morgan Ave', 'address2': N...",El Centenario,12765460044,,5.0,3,[],https://www.yelp.com/biz/el-centenario-penning...
4,rubys-country-steak-house-pennington-gap,"[{'alias': 'restaurants', 'title': 'Restaurant...","{'latitude': 36.7624955624342, 'longitude': -8...",(276) 546-6900,38271.262707,AsZk7i1UyQSElluN_ixSPQ,https://s3-media3.fl.yelpcdn.com/bphoto/n6JFx4...,False,"{'address1': '131 Industrial Dr', 'address2': ...",Rubys Country Steak House,12765466900,,3.5,2,[],https://www.yelp.com/biz/rubys-country-steak-h...


Total 6413 samples


## 2. Data preprocessing
In this step, we drop the not very useful features and extract some important features from a json format column.

In [2]:
# drop columns: 'alias', 'is_closed', 'url', 'transactions', 'phone', 'display_phone', 'distance' 
data = data.drop(columns=['alias', 'is_closed', 'url', 'transactions', 'phone', 'display_phone', 'distance'])

# extract latitude and longitude values
coords = data['coordinates'].tolist()
coords_json = str(coords).replace("'", "\"")
df = pd.DataFrame(json.loads(coords_json))
data['latitude'] = df['latitude']
data['longitude'] = df['longitude']

# extract city info
locations = data['location'].tolist()
cities = []
for loc in locations:
    cities.append(loc['city'])
data['city'] = cities

# drop columns: 'coordinates', 'location'
data = data.drop(columns=['coordinates', 'location'])
data.head()

Unnamed: 0,categories,id,image_url,name,price,rating,review_count,latitude,longitude,city
0,"[{'alias': 'delis', 'title': 'Delis'}, {'alias...",rd8oYQOtyc4LxjPQp8Muvw,,The Dutch Treat,,5.0,2,36.690332,-83.310449,Rose Hill
1,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",bwCj2AcoOroZfCTxb6rCcg,https://s3-media2.fl.yelpcdn.com/bphoto/3KS3Xs...,A Better Burger,$$,3.5,6,36.689639,-83.108766,Jonesville
2,"[{'alias': 'mexican', 'title': 'Mexican'}]",S9S9kFJSkmfpbjFForCWLQ,https://s3-media1.fl.yelpcdn.com/bphoto/NGC_GJ...,El Castillo,$,4.0,2,36.726337,-83.099858,Jonesville
3,"[{'alias': 'mexican', 'title': 'Mexican'}]",XFksdPFZhPHk458C0pl0Cg,https://s3-media1.fl.yelpcdn.com/bphoto/G5XFTv...,El Centenario,,5.0,3,36.76025,-83.023682,Pennington Gap
4,"[{'alias': 'restaurants', 'title': 'Restaurant...",AsZk7i1UyQSElluN_ixSPQ,https://s3-media3.fl.yelpcdn.com/bphoto/n6JFx4...,Rubys Country Steak House,,3.5,2,36.762496,-83.017144,Pennington Gap


In [3]:
# filter out NaN price
data['price'] = data['price'].fillna('99999')
data = data[data['price'] != '99999']
data.reset_index(drop=True, inplace=True)

# transform $ sign into values
for i in range(len(data)):
    data.iloc[i,4] = len(data.iloc[i,4])
data.head()

Unnamed: 0,categories,id,image_url,name,price,rating,review_count,latitude,longitude,city
0,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",bwCj2AcoOroZfCTxb6rCcg,https://s3-media2.fl.yelpcdn.com/bphoto/3KS3Xs...,A Better Burger,2,3.5,6,36.689639,-83.108766,Jonesville
1,"[{'alias': 'mexican', 'title': 'Mexican'}]",S9S9kFJSkmfpbjFForCWLQ,https://s3-media1.fl.yelpcdn.com/bphoto/NGC_GJ...,El Castillo,1,4.0,2,36.726337,-83.099858,Jonesville
2,"[{'alias': 'restaurants', 'title': 'Restaurant...",np8uV1xll22Yr-Q-B-ImkA,,Rooster's Pub,1,4.5,4,36.758436,-83.027057,Penningtn Gap
3,"[{'alias': 'newamerican', 'title': 'American (...",HGY1ojoLu07P_ky2LeRguQ,https://s3-media4.fl.yelpcdn.com/bphoto/BcdFv7...,Redstone Restaurant,1,4.5,3,36.689259,-82.75304,Duffield
4,"[{'alias': 'mexican', 'title': 'Mexican'}]",J5XS3VmxnLKhNlpiwDJ-3A,https://s3-media4.fl.yelpcdn.com/bphoto/P_X-58...,Little Mexico,1,4.0,5,36.859367,-82.756744,Big Stone Gap


We have a 'categories' column which contains the feautres of the restaurant.

In [4]:
alias={}
titles={}
for cates in data.categories:
    for cate in cates:
        if cate['alias'] in alias.keys():
            alias[cate['alias']]+=1
        else:
            alias[cate['alias']]=1
        if cate['title'] in titles.keys():
            titles[cate['title']]+=1
        else:
            titles[cate['title']]=1
alias_sorted=sorted(alias.items(), key=lambda d:d[1], reverse=True)
titles_sorted=sorted(titles.items(), key=lambda d:d[1], reverse=True)
print('Total number of categories:',len(alias_sorted))
print('\nTop 10 ranked categories: ')
for i in range(10):
    print('  '+alias_sorted[i][0]+':'+str(alias_sorted[i][1])+'; '+titles_sorted[i][0]+':'+str(titles_sorted[i][1]))

Total number of categories: 213

Top 10 ranked categories: 
  tradamerican:898; American (Traditional):898
  pizza:675; Pizza:675
  burgers:522; Burgers:522
  hotdogs:519; Fast Food:519
  italian:496; Italian:496
  sandwiches:479; Sandwiches:479
  seafood:457; Seafood:457
  mexican:422; Mexican:422
  newamerican:417; American (New):417
  breakfast_brunch:376; Breakfast & Brunch:376


## * Save data for specific feature processing
**Coordinates(latitude, longitude):** Save Coordinates(latitude, longitude) data in another data file.

In [5]:
coordinates_related_data = data[['id', 'name', 'latitude','longitude', 'rating','review_count','price']]
coordinates_data_file = './Dataset/coordinates_data'

with open(coordinates_data_file, 'wb') as outFile:
    pickle.dump(coordinates_related_data, outFile)

** Categories: ** Save categories data in another data file.

In [6]:
category_data = data[['id', 'name', 'categories', 'rating','review_count','price']]
category_data_file = './Dataset/categories_data'

with open(category_data_file, 'wb') as outFile:
    pickle.dump(category_data, outFile)

## 3. Extract the numerical data and build the classification models

In [5]:
X = data[['rating','review_count','latitude','longitude']]
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# do feature scaling
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print('Train set size:',len(X_train))
print('Test set size:',len(X_test))

Train set size: 4095
Test set size: 1024


  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
  """


In [7]:
nb = BernoulliNB()
nb.fit(X_train, y_train)
nb_scores = cross_val_score(nb, X_train, y_train, cv=5)
print('BernoulliNB Avg_Acc:',np.mean(nb_scores))

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_scores = cross_val_score(dt, X_train, y_train, cv=5)
print('DecisionTreeClassifier Avg_Acc:',np.mean(dt_scores))

svc = LinearSVC(multi_class='ovr', max_iter=10000)
svc.fit(X_train, y_train)
svc_scores = cross_val_score(svc, X_train, y_train, cv=5)
print('LinearSVC Avg_Acc:',np.mean(svc_scores))

lr = LogisticRegression(solver='newton-cg', multi_class='multinomial')
lr.fit(X_train, y_train)
lr_scores = cross_val_score(lr, X_train, y_train, cv=5)
print('LogisticRegression Avg_Acc:',np.mean(lr_scores))

rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train, y_train)
rf_scores = cross_val_score(rf, X_train, y_train, cv=5)
print('RandomForestClassifier Avg_Acc:', np.mean(rf_scores))

nn = MLPClassifier(max_iter=10000)
nn.fit(X_train, y_train)
nn_scores = cross_val_score(nn, X_train, y_train, cv=5)
print('MLPClassifier Avg_Acc:', np.mean(nn_scores))

BernoulliNB Avg_Acc: 0.6124894365983788
DecisionTreeClassifier Avg_Acc: 0.594144551016782




LinearSVC Avg_Acc: 0.6359291460623637
LogisticRegression Avg_Acc: 0.6718076046329469
RandomForestClassifier Avg_Acc: 0.6725386996032554
MLPClassifier Avg_Acc: 0.6918138898087383


In [9]:
from sklearn.metrics import classification_report

target_names = ['$', '$$', '$$$', '$$$$']
print('BernoulliNB:')
print(classification_report(y_test, nb.predict(X_test), target_names=target_names))
print('DecisionTreeClassifier:')
print(classification_report(y_test, dt.predict(X_test), target_names=target_names))
print('LinearSVC:')
print(classification_report(y_test, svc.predict(X_test), target_names=target_names))
print('LogisticRegression:')
print(classification_report(y_test, lr.predict(X_test), target_names=target_names))
print('RandomForestClassifier:')
print(classification_report(y_test, rf.predict(X_test), target_names=target_names))
print('MLPClassifier:')
print(classification_report(y_test, nn.predict(X_test), target_names=target_names))


BernoulliNB:
              precision    recall  f1-score   support

           $       0.54      0.51      0.52       392
          $$       0.66      0.73      0.69       596
         $$$       0.00      0.00      0.00        34
        $$$$       0.00      0.00      0.00         2

   micro avg       0.62      0.62      0.62      1024
   macro avg       0.30      0.31      0.30      1024
weighted avg       0.59      0.62      0.60      1024

DecisionTreeClassifier:
              precision    recall  f1-score   support

           $       0.53      0.56      0.54       392
          $$       0.68      0.64      0.66       596
         $$$       0.03      0.03      0.03        34
        $$$$       0.00      0.00      0.00         2

   micro avg       0.59      0.59      0.59      1024
   macro avg       0.31      0.31      0.31      1024
weighted avg       0.60      0.59      0.59      1024

LinearSVC:
              precision    recall  f1-score   support

           $       0.59    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [11]:
print('BernoulliNB:', nb.score(X_test, y_test))
print('DecisionTreeClassifier:', dt.score(X_test, y_test))
print('LinearSVC:', svc.score(X_test, y_test))
print('LogisticRegression:', lr.score(X_test, y_test))
print('RandomForestClassifier:', rf.score(X_test, y_test))
print('MLPClassifier:', nn.score(X_test, y_test))

BernoulliNB: 0.6171875
DecisionTreeClassifier: 0.58984375
LinearSVC: 0.6455078125
LogisticRegression: 0.6689453125
RandomForestClassifier: 0.64453125
MLPClassifier: 0.6728515625


In [8]:
from sklearn.metrics import classification_report

target_names = ['$', '$$', '$$$', '$$$$']
print(classification_report(y_train, lr.predict(X_train), target_names=target_names))

              precision    recall  f1-score   support

           $       0.62      0.64      0.63      1656
          $$       0.71      0.74      0.72      2307
         $$$       0.00      0.00      0.00       114
        $$$$       0.00      0.00      0.00        18

   micro avg       0.67      0.67      0.67      4095
   macro avg       0.33      0.34      0.34      4095
weighted avg       0.65      0.67      0.66      4095



  'precision', 'predicted', average, warn_for)


## 4. Generate Voting Classifier

**Geographic Data Classifier**

In [3]:
# Load geographic data
with open('./Dataset/geo_biz_data', 'rb') as infile:
    _, geo_biz_data = pickle.load(infile)

In [5]:
geo_biz_data.head()

Unnamed: 0,id,name,latitude,longitude,rating,review_count,price,geoid,usps,pop10,...,oceania,south_america,south_central_asia,south_eastern_asia,southern_africa,southern_europe,united_kingdom_(inc._crown_dependencies),western_africa,western_asia,western_europe
0,bwCj2AcoOroZfCTxb6rCcg,A Better Burger,36.689639,-83.108766,3.5,6,2,51105950500,VA,4914,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,S9S9kFJSkmfpbjFForCWLQ,El Castillo,36.726337,-83.099858,4.0,2,1,51105950500,VA,4914,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,np8uV1xll22Yr-Q-B-ImkA,Rooster's Pub,36.758436,-83.027057,4.5,4,1,51105950300,VA,5532,...,0.0,41.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0
3,HGY1ojoLu07P_ky2LeRguQ,Redstone Restaurant,36.689259,-82.75304,4.5,3,1,51169030300,VA,3674,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,J5XS3VmxnLKhNlpiwDJ-3A,Little Mexico,36.859367,-82.756744,4.0,5,1,51195931200,VA,6649,...,0.0,0.0,0.0,0.0,0.0,55.0,0.0,0.0,0.0,7.0


In [8]:
geo_X = geo_biz_data.drop(columns=['id', 'name', 'usps', 'geoid', 'price', 'latitude', 'longitude']).values
geo_y = geo_biz_data['price'].values

# Devide train/test data set
geo_X_train, geo_X_test, geo_y_train, geo_y_test = train_test_split(geo_X, geo_y, test_size=0.2, random_state=42)
print('{0} train + {1} test'.format(len(geo_X_train), len(geo_X_test)))

# Feature scaling
scaler = StandardScaler()
scaler.fit(geo_X_train)
geo_X_train = scaler.transform(geo_X_train)
geo_X_test = scaler.transform(geo_X_test)

4092 train + 1024 test


In [15]:
# Geo classifier
geo_clf = SVC(kernel="linear", C=0.01, decision_function_shape='ovo')

cross_scores = cross_val_score(geo_clf, geo_X_train, geo_y_train, cv=5)
    
geo_clf.fit(geo_X_train, geo_y_train)
geo_y_pred = geo_clf.predict(geo_X_test)
    
score = geo_clf.score(geo_X_test, geo_y_test)
    
print("Geo Classifer:")
print('Cross Val Avg. Score:',np.mean(cross_scores))
print("Accuracy Score in test set: {0}".format(score))

Geo Classifer:
Cross Val Avg. Score: 0.694048362177379
Accuracy Score in test set: 0.7001953125
