In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Open both Yelp files as chunked readers (100000 JSON lines per chunk)
# rather than loading each file into memory in one go.
chunked_json_options = dict(lines=True, orient='columns', chunksize=100000)
businesses = pd.read_json(
    "/kaggle/input/yelp-dataset/yelp_academic_dataset_business.json",
    **chunked_json_options,
)
reviews = pd.read_json(
    "/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json",
    **chunked_json_options,
)

In [3]:
# Keep only the first chunk yielded by each reader: the loop body stores the
# chunk and breaks immediately, so the rest of each file is never read here.
# NOTE(review): the readers are iterated in place, so re-running this cell
# without re-creating `businesses` / `reviews` presumably returns the *next*
# chunk rather than the first one - confirm before relying on re-runs.
for business in businesses:
    subset_business = business
    break
    
for review in reviews:
    subset_review = review
    break

In [4]:
# (rows, columns) of the business chunk kept above
subset_business.shape

In [5]:
# Preview the first rows of the business chunk
subset_business.head()

In [6]:
# (rows, columns) of the review chunk kept above
subset_review.shape

In [7]:
# Preview the first rows of the review chunk
subset_review.head()

In [8]:
# Total review_count per city within the business chunk, sorted ascending
reviews_per_city = subset_business.groupby(['city'])['review_count'].sum()
reviews_per_city.sort_values()

In [9]:
# Restrict the business chunk to rows whose city is exactly 'Austin',
# then keep only the columns used by the rest of the notebook.
is_austin = subset_business['city'] == 'Austin'
city = subset_business[is_austin]

austin_columns = ['business_id', 'name', 'address', 'categories', 'attributes', 'stars']
austin = city[austin_columns]
austin

# Data Preprocessing

In [10]:
# Keep the Austin businesses whose category string matches 'Restaurant.*'.
# na=False makes rows with missing categories count as non-matches.
# reset_index() renumbers the rows from 0 and keeps the old labels in an
# `index` column.
is_restaurant = austin['categories'].str.contains('Restaurant.*', na=False)
rest = austin[is_restaurant].reset_index()
rest

In [11]:
# Inspect the raw attributes value of the first restaurant (row label 0)
rest.loc[0, "attributes"]

In [12]:
# Function that extracts one key from the nested attributes dictionary
def extract_keys(attr, key):
    """Remove ``key`` from ``attr`` and return the value that was stored there.

    Parameters
    ----------
    attr : dict or None
        Attributes mapping of one business. It is modified in place: the
        entry for ``key`` is popped, so a second call with the same arguments
        finds nothing. Any non-dict value (``None``, or a float NaN
        placeholder) is treated as "no attributes".
    key : hashable
        Name of the attribute to pull out.

    Returns
    -------
    object
        The popped value when ``key`` is present; ``None`` when ``attr`` is a
        dict without ``key``; the string ``"{}"`` when there are no
        attributes at all.
    """
    # Guard against None and against non-dict placeholders, which would
    # otherwise fail on the membership test / pop below.
    if not isinstance(attr, dict):
        return "{}"
    # pop with a default covers both the "present" and "missing key" cases.
    return attr.pop(key, None)

# convert string to dictionary
import ast
def str_to_dict(attr):
    """Parse a Python-literal string into a dictionary.

    Parameters
    ----------
    attr : str, dict or None
        Text such as ``"{'garage': False}"``. ``None`` means "nothing to
        parse". A value that is already a dict is returned unchanged.

    Returns
    -------
    dict
        The parsed dictionary, or an empty dict when ``attr`` is ``None`` or
        when the text evaluates to ``None`` (for example the string
        ``'None'``).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is not a valid
        Python literal.
    """
    if attr is None:
        return {}
    if isinstance(attr, dict):
        return attr
    parsed = ast.literal_eval(attr)
    # A literal such as 'None' parses to None; normalise it to an empty dict
    # so that callers always receive a dictionary.
    return {} if parsed is None else parsed


In [13]:
# Worked example: pull the nested 'BusinessParking' entry out of one
# attributes dictionary and parse it into a real dict.
sample_attributes = {
    'RestaurantsGoodForGroups': 'True',
    'RestaurantsTakeOut': 'True',
    'HasTV': 'False',
    'RestaurantsReservations': 'False',
    'OutdoorSeating': 'True',
    'RestaurantsDelivery': 'False',
    'NoiseLevel': "'loud'",
    'WiFi': "'no'",
    'Ambience': "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}",
    'BikeParking': 'True',
    'GoodForKids': 'True',
    'RestaurantsPriceRange2': '1',
    'RestaurantsAttire': "u'casual'",
    'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}",
    'BusinessAcceptsCreditCards': 'True',
}
parking_text = extract_keys(sample_attributes, "BusinessParking")
str_to_dict(parking_text)

In [14]:
# Move each nested attribute group out of `attributes` into its own column
# of parsed dictionaries.
# extract_keys pops the key from the attributes dict in place, so after this
# cell the `attributes` column no longer contains these groups, and
# re-running the cell without rebuilding `rest` produces empty dicts.
nested_attribute_keys = ['BusinessParking', 'Ambience', 'GoodForMeal', 'Dietary', 'Music']
for nested_key in nested_attribute_keys:
    # Column-wise apply: only the attributes value is needed, not the whole row.
    # `key=nested_key` binds the current key at lambda definition time.
    rest[nested_key] = rest['attributes'].apply(
        lambda attr, key=nested_key: str_to_dict(extract_keys(attr, key))
    )

In [15]:
# Show the restaurant table
rest

In [16]:
# Build the attribute feature table: expand the attributes dict and each
# nested group into one column per key, place them side by side, then
# one-hot encode the result.
# `Music` is deliberately left out here, exactly as in the column list below.
expanded_sources = ['attributes', 'BusinessParking', 'Ambience', 'GoodForMeal', 'Dietary']
expanded_frames = [rest[source].apply(pd.Series) for source in expanded_sources]
df_attr = pd.concat(expanded_frames, axis=1)
df_attr_dummies = pd.get_dummies(df_attr)
df_attr_dummies

In [17]:
# get dummies from categories
# Each row holds one comma-delimited string of categories. Whitespace around
# the commas is removed before splitting: splitting on a bare ',' would keep
# a leading space on every category after the first whenever the data uses
# ', ' as separator, producing duplicate columns such as 'Bars' and ' Bars'.
df_categories_dummies = (
    rest['categories']
    .str.strip()
    .str.replace(r'\s*,\s*', ',', regex=True)
    .str.get_dummies(',')
)
df_categories_dummies

In [18]:
# Keep only the restaurant name and its star rating from the rest table
name_and_stars = ['name', 'stars']
result = rest[name_and_stars]
result

In [19]:
# Assemble the modelling table: attribute dummies, category dummies, then
# name and stars as the last two columns. The 'Restaurants' category column
# is dropped from the result.
df_final = pd.concat(
    [df_attr_dummies, df_categories_dummies, result],
    axis=1,
).drop(columns='Restaurants')
df_final

In [20]:
# Collapse the half-star ratings into five integer classes;
# every half star is rounded up to the next whole star.
mapper = {
    1.0: 1,
    1.5: 2, 2.0: 2,
    2.5: 3, 3.0: 3,
    3.5: 4, 4.0: 4,
    4.5: 5, 5.0: 5,
}
star_classes = df_final['stars'].map(mapper)
df_final['stars'] = star_classes

In [21]:
# Show the modelling table after the star ratings were mapped to classes
df_final

In [22]:
# Features: every column except the trailing `name` and `stars`.
# Target: the star class.
X = df_final.drop(columns=['name', 'stars'])
y = df_final.loc[:, 'stars']

In [23]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fit a k-nearest-neighbours classifier (k = 20) on the training split.
# fit() returns the estimator, so construction and fitting can be chained.
knn = KNeighborsClassifier(n_neighbors=20).fit(X_train_knn, y_train_knn)

# Predicted star classes for the held-out rows
y_pred = knn.predict(X_test_knn)

# Mean accuracy on each split
accuracy_train = knn.score(X_train_knn, y_train_knn)
accuracy_test = knn.score(X_test_knn, y_test_knn)

print(f"Score on training set: {accuracy_train}")
print(f"Score on test set: {accuracy_test}")

In [25]:
# Show the last row of the modelling table
display(df_final.tail(1))

In [26]:
# Name of the restaurant stored in the last row
last_restaurant_name = df_final['name'].iloc[-1]
print("Validation set (Restaurant name): ", last_restaurant_name)

In [27]:
# Query row: the last row of df_final without its two trailing columns
# (name, stars)
test_set = df_final.tail(1).iloc[:, :-2]

test_set

In [28]:
# Feature columns of every row except the last one
all_but_last = df_final.iloc[:-1]
X_val = all_but_last.iloc[:, :-2]
X_val

In [29]:
# Star class of every row except the last one
y_val = df_final.iloc[:-1]['stars']
y_val

In [30]:
# Query row ("test set"): only the last row of df_final, features only.
# Restaurant name: "Chi Chinese Buffet"
query_row = df_final.tail(1)
test_set = query_row.iloc[:, :-2]

# Reference rows ("validation set"): everything except the last row,
# split into features and star class.
reference_rows = df_final.iloc[:-1]
X_val = reference_rows.iloc[:, :-2]
y_val = reference_rows['stars']

In [31]:
# Refit the existing classifier on every row except the held-out last one.
# fit() returns the estimator itself, so n_knn and knn are the same object
# and the earlier train/test fit is overwritten.
knn.fit(X_val, y_val)
n_knn = knn

In [32]:
# Query the neighbours of the held-out row once and reuse the result
# instead of repeating the same search for every display.
distances, indices = n_knn.kneighbors(test_set)

# A bare expression only renders when it is the last line of a cell, so the
# distances are shown explicitly; the indices remain the cell's output value.
display(distances[0])
indices[0]

In [33]:
# Distances and indices of the nearest neighbours of the held-out row
# (Chi Chinese Buffet). The search runs once and both arrays are reused.
distances, indices = n_knn.kneighbors(test_set)

# One row per neighbour: its distance and its row position in the fitted data.
# test_set holds a single row, so only the first row of each array is needed.
final_table = pd.DataFrame({'distance': distances[0], 'index': indices[0]})

# Display only: set_index returns a new frame, so final_table itself keeps
# `index` as a regular column.
final_table.set_index('index')

In [34]:
# Get the names of the restaurants that are similar to "Chi Chinese Buffet":
# each neighbour's `index` value is looked up against df_final's row labels.
result = final_table.join(df_final, on='index')
summary_columns = ['distance', 'index', 'name', 'stars']
result[summary_columns].head(5)