# Machine Learning For Local Restaurant Price Assessment And Prediction

## 1. Fetch original data from json

In this section, we generate dataframe from the data we collected using Yelp API. Explanation of each features can be found at [https://www.yelp.com/developers/documentation/v3/business_search].

In [1]:
import pandas as pd
import json

# Read data
with open('./Dataset/restaurants_VA.json', encoding = 'utf8') as inFile:
    data = json.load(inFile)
    
data = pd.DataFrame.from_dict(data)
display(data.head(5))

print("Total {0} samples".format(len(data)))

Unnamed: 0,alias,categories,coordinates,display_phone,distance,id,image_url,is_closed,location,name,phone,price,rating,review_count,transactions,url
0,the-dutch-treat-rose-hill,"[{'alias': 'delis', 'title': 'Delis'}, {'alias...","{'latitude': 36.6903325, 'longitude': -83.3104...",(276) 445-4024,19538.084959,rd8oYQOtyc4LxjPQp8Muvw,,False,"{'address1': '21332 Wilderness Rd', 'address2'...",The Dutch Treat,12764454024,,5.0,2,[],https://www.yelp.com/biz/the-dutch-treat-rose-...
1,a-better-burger-jonesville,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...","{'latitude': 36.689639, 'longitude': -83.108766}",(276) 346-6768,34871.790634,bwCj2AcoOroZfCTxb6rCcg,https://s3-media2.fl.yelpcdn.com/bphoto/3KS3Xs...,False,"{'address1': '33739 Main St', 'address2': 'Ste...",A Better Burger,12763466768,$$,3.5,6,[],https://www.yelp.com/biz/a-better-burger-jones...
2,el-castillo-jonesville,"[{'alias': 'mexican', 'title': 'Mexican'}]","{'latitude': 36.7263373464484, 'longitude': -8...",(276) 346-4000,37317.843873,S9S9kFJSkmfpbjFForCWLQ,https://s3-media1.fl.yelpcdn.com/bphoto/NGC_GJ...,False,"{'address1': '236 Trade Center Ln', 'address2'...",El Castillo,12763464000,$,4.0,2,[],https://www.yelp.com/biz/el-castillo-jonesvill...
3,el-centenario-pennington-gap,"[{'alias': 'mexican', 'title': 'Mexican'}]","{'latitude': 36.7602500915527, 'longitude': -8...",(276) 546-0044,36034.813254,XFksdPFZhPHk458C0pl0Cg,https://s3-media1.fl.yelpcdn.com/bphoto/G5XFTv...,False,"{'address1': '930 E Morgan Ave', 'address2': N...",El Centenario,12765460044,,5.0,3,[],https://www.yelp.com/biz/el-centenario-penning...
4,rubys-country-steak-house-pennington-gap,"[{'alias': 'restaurants', 'title': 'Restaurant...","{'latitude': 36.7624955624342, 'longitude': -8...",(276) 546-6900,38271.262707,AsZk7i1UyQSElluN_ixSPQ,https://s3-media3.fl.yelpcdn.com/bphoto/n6JFx4...,False,"{'address1': '131 Industrial Dr', 'address2': ...",Rubys Country Steak House,12765466900,,3.5,2,[],https://www.yelp.com/biz/rubys-country-steak-h...


Total 6413 samples


## 2. Data preprocessing
In this step, we drop the not very useful features and extract some important features from a json format column.

In [2]:
# drop columns: 'alias', 'is_closed', 'url', 'transactions', 'phone', 'display_phone', 'distance' 
data = data.drop(columns=['alias', 'is_closed', 'url', 'transactions', 'phone', 'display_phone', 'distance'])

# extract latitude and longitude values
coords = data['coordinates'].tolist()
coords_json = str(coords).replace("'", "\"")
df = pd.DataFrame(json.loads(coords_json))
data['latitude'] = df['latitude']
data['longitude'] = df['longitude']

# extract city info
locations = data['location'].tolist()
cities = []
for loc in locations:
    cities.append(loc['city'])
data['city'] = cities

# drop columns: 'coordinates', 'location'
data = data.drop(columns=['coordinates', 'location'])
data.head()

Unnamed: 0,categories,id,image_url,name,price,rating,review_count,latitude,longitude,city
0,"[{'alias': 'delis', 'title': 'Delis'}, {'alias...",rd8oYQOtyc4LxjPQp8Muvw,,The Dutch Treat,,5.0,2,36.690332,-83.310449,Rose Hill
1,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",bwCj2AcoOroZfCTxb6rCcg,https://s3-media2.fl.yelpcdn.com/bphoto/3KS3Xs...,A Better Burger,$$,3.5,6,36.689639,-83.108766,Jonesville
2,"[{'alias': 'mexican', 'title': 'Mexican'}]",S9S9kFJSkmfpbjFForCWLQ,https://s3-media1.fl.yelpcdn.com/bphoto/NGC_GJ...,El Castillo,$,4.0,2,36.726337,-83.099858,Jonesville
3,"[{'alias': 'mexican', 'title': 'Mexican'}]",XFksdPFZhPHk458C0pl0Cg,https://s3-media1.fl.yelpcdn.com/bphoto/G5XFTv...,El Centenario,,5.0,3,36.76025,-83.023682,Pennington Gap
4,"[{'alias': 'restaurants', 'title': 'Restaurant...",AsZk7i1UyQSElluN_ixSPQ,https://s3-media3.fl.yelpcdn.com/bphoto/n6JFx4...,Rubys Country Steak House,,3.5,2,36.762496,-83.017144,Pennington Gap


In [4]:
# filter out NaN price
data['price'] = data['price'].fillna('99999')
data = data[data['price'] != '99999']

# transform $ sign into values
for i in range(len(data)):
    data.iloc[i,4] = len(data.iloc[i,4])
data.head()

Unnamed: 0,categories,id,image_url,name,price,rating,review_count,latitude,longitude,city
1,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",bwCj2AcoOroZfCTxb6rCcg,https://s3-media2.fl.yelpcdn.com/bphoto/3KS3Xs...,A Better Burger,2,3.5,6,36.689639,-83.108766,Jonesville
2,"[{'alias': 'mexican', 'title': 'Mexican'}]",S9S9kFJSkmfpbjFForCWLQ,https://s3-media1.fl.yelpcdn.com/bphoto/NGC_GJ...,El Castillo,1,4.0,2,36.726337,-83.099858,Jonesville
6,"[{'alias': 'restaurants', 'title': 'Restaurant...",np8uV1xll22Yr-Q-B-ImkA,,Rooster's Pub,1,4.5,4,36.758436,-83.027057,Penningtn Gap
12,"[{'alias': 'newamerican', 'title': 'American (...",HGY1ojoLu07P_ky2LeRguQ,https://s3-media4.fl.yelpcdn.com/bphoto/BcdFv7...,Redstone Restaurant,1,4.5,3,36.689259,-82.75304,Duffield
13,"[{'alias': 'mexican', 'title': 'Mexican'}]",J5XS3VmxnLKhNlpiwDJ-3A,https://s3-media4.fl.yelpcdn.com/bphoto/P_X-58...,Little Mexico,1,4.0,5,36.859367,-82.756744,Big Stone Gap


We have a 'categories' column which contains the feautres of the restaurant.

In [3]:
alias={}
titles={}
for cates in data.categories:
    for cate in cates:
        if cate['alias'] in alias.keys():
            alias[cate['alias']]+=1
        else:
            alias[cate['alias']]=1
        if cate['title'] in titles.keys():
            titles[cate['title']]+=1
        else:
            titles[cate['title']]=1
alias_sorted=sorted(alias.items(), key=lambda d:d[1], reverse=True)
titles_sorted=sorted(titles.items(), key=lambda d:d[1], reverse=True)
print('Number of categories:',len(alias_sorted))
print('\nTop 10 ranked categories: ')
for i in range(10):
    print('  '+alias_sorted[i][0]+':'+str(alias_sorted[i][1])+'; '+titles_sorted[i][0]+':'+str(titles_sorted[i][1]))

Number of categories: 231

Top 10 ranked categories: 
  tradamerican:1039; American (Traditional):1039
  pizza:861; Pizza:861
  hotdogs:779; Fast Food:779
  sandwiches:680; Sandwiches:680
  burgers:636; Burgers:636
  italian:574; Italian:574
  seafood:529; Seafood:529
  mexican:504; Mexican:504
  newamerican:470; American (New):470
  breakfast_brunch:451; Breakfast & Brunch:451


## 3. Extract the numerical data and build the classification models

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X = data[['rating','review_count','latitude','longitude']]
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# do feature scaling
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print('Train set size:',len(X_train))
print('Test set size:',len(X_test))

Train set size: 4095
Test set size: 1024


  return self.partial_fit(X, y)
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [6]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

nb = BernoulliNB()
nb.fit(X_train, y_train)
nb_scores = cross_val_score(nb, X_train, y_train, cv=5)
print('BernoulliNB Avg_Acc:',np.mean(nb_scores))

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_scores = cross_val_score(dt, X_train, y_train, cv=5)
print('DecisionTreeClassifier Avg_Acc:',np.mean(dt_scores))

svc = LinearSVC(multi_class='ovr', max_iter=10000)
svc.fit(X_train, y_train)
svc_scores = cross_val_score(svc, X_train, y_train, cv=5)
print('LinearSVC Avg_Acc:',np.mean(svc_scores))

lr = LogisticRegression(solver='newton-cg', multi_class='multinomial')
lr.fit(X_train, y_train)
lr_scores = cross_val_score(lr, X_train, y_train, cv=5)
print('LogisticRegression Avg_Acc:',np.mean(lr_scores))

rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train, y_train)
rf_scores = cross_val_score(rf, X_train, y_train, cv=5)
print('RandomForestClassifier Avg_Acc:', np.mean(rf_scores))

nn = MLPClassifier(max_iter=10000)
nn.fit(X_train, y_train)
nn_scores = cross_val_score(nn, X_train, y_train, cv=5)
print('MLPClassifier Avg_Acc:', np.mean(nn_scores))

BernoulliNB Avg_Acc: 0.6124894365983788
DecisionTreeClassifier Avg_Acc: 0.590481532070991




LinearSVC Avg_Acc: 0.6359291460623637
LogisticRegression Avg_Acc: 0.6718076046329469
RandomForestClassifier Avg_Acc: 0.6761981426785614
MLPClassifier Avg_Acc: 0.692056003221649


In [7]:
print('Test accuracy:', nn.score(X_test, y_test))

Test accuracy: 0.67578125
