In [4]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [15]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
data = pd.read_csv('/content/drive/MyDrive/Data/business_final.csv')

In [7]:
def great_circle_mile(lat1, lon1, lat2, lon2):
    """
    Compute geodesic distances (great-circle distance) of two points given their coordinates. 
    The function returns the distance in miles. 
    Note: 1. Calculation uses the earth's mean radius of 6371.009 km, 
    2. The central subtended angle is calculated by formula: 
    alpha = cos-1*[sin(lat1)*sin(lat2)+ cos(lat1)*cos(lat2)*cos(lon1-lon2)]
    """
    
    from math import sin, cos, acos, radians
    
    lat1, lon1, lat2, lon2 = radians(lat1), radians(lon1), radians(lat2), radians(lon2) # convert degrees to radians
    earth_radius = 6371.009  # use earth's mean radius in kilometers
    alpha = acos(sin(lat1)*sin(lat2) + cos(lat1)*cos(lat2)*cos(lon1-lon2)) # alpha is in radians
    dis_km = alpha * earth_radius
    dis_mile = dis_km * 0.621371   # convert kilometer to mile
    
    return dis_mile

In [8]:
pos1 = (51.5073219, -0.1276474) # London
pos2 = (52.5170365, 13.3888599) # Berlin
pos3 = (-33.8548157,151.2164539) # Sydney

In [9]:
%%timeit
# great_circle distance
distance_12 = great_circle_mile(pos1[0], pos1[1], pos2[0], pos2[1])
distance_12

2.24 µs ± 632 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [10]:
from geopy.distance import distance

In [11]:
%%timeit
# geodesic distance
distance2_12 = distance(pos1, pos2).miles
distance2_12

369 µs ± 100 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [12]:
distance_12 = great_circle_mile(pos1[0], pos1[1], pos2[0], pos2[1])
distance2_12 = distance(pos1, pos2).miles
error_12 = (distance_12 - distance2_12)/distance2_12

distance_13 = great_circle_mile(pos1[0], pos1[1], pos3[0], pos3[1])
distance2_13 = distance(pos1, pos3).miles
error_13 = (distance_13 - distance2_13)/distance2_13

# print errors, 1-2 is distance between London and Berlin, 1-3 is distance between London and Sydney
print("absolute error:", (distance_12-distance2_12), (distance_13-distance2_13)) 
print("percent error:", error_12, error_13)

absolute error: -1.8326838373754981 2.892945931140275
percent error: -0.0031598293601707256 0.0002740520023695907


In [16]:
# compute globe mean ratings of all businesses and all reviews
globe_mean = ((data.stars * data.review_count).sum())/(data.review_count.sum())
print("global mean rating is:", globe_mean)

global mean rating is: 3.7542257010146605


In [17]:
print(data.review_count.quantile([0.1,0.25,0.5,0.75,0.9]))
k = 22 # set strength k to 22, which is the 50% quantile of the review counts for all businesses
data['adjusted_score'] = (data.review_count * data.stars + k * globe_mean)/(data.review_count + k)
print("\nrank by the adjusted score in descending order:")
print(data[['review_count','stars','adjusted_score']].sort_values('adjusted_score', ascending=False).head(5))
print("\nrank by the original score in descending order:")
print(data[['review_count','stars','adjusted_score']].sort_values('stars', ascending=False).head(5))
print("\nrank by the least number of reviews:")
print(data[['review_count','stars','adjusted_score']].sort_values('review_count', ascending=True).head(5))

0.10     6.0
0.25     8.0
0.50    15.0
0.75    37.0
0.90    97.0
Name: review_count, dtype: float64

rank by the adjusted score in descending order:
        review_count  stars  adjusted_score
12307            991    5.0        4.972945
134154           799    5.0        4.966617
25015            769    5.0        4.965351
16555            705    5.0        4.962301
70650            623    5.0        4.957508

rank by the original score in descending order:
       review_count  stars  adjusted_score
0                 7    5.0        4.054930
60693            41    5.0        4.564968
60680            33    5.0        4.501690
60668            12    5.0        4.193911
60664            22    5.0        4.377113

rank by the least number of reviews:
        review_count  stars  adjusted_score
110756             5    2.5        3.521962
109034             5    5.0        3.984925
109040             5    5.0        3.984925
109043             5    2.0        3.429369
60700              5  

In [18]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

# use geolocate query to find the coordinate for the location of interest
geolocator = Nominatim(user_agent="yelp_recommender") # use geopy.geocoders to make geolocation queries
address = "san jose, 95132"

try:
    location = geolocator.geocode(address, timeout=10) 
except GeocoderTimedOut as e:
    print("Error: geocode failed to locate the address of interest {} with message {}".format(address, e.message))   

In [19]:
print(location)
print(location.raw)
print(location.latitude, location.longitude)

San Jose, Santa Clara County, CAL Fire Northern Region, California, United States
{'place_id': 297983195, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 112143, 'boundingbox': ['37.1231596', '37.4691477', '-122.0460405', '-121.5858438'], 'lat': '37.3361663', 'lon': '-121.890591', 'display_name': 'San Jose, Santa Clara County, CAL Fire Northern Region, California, United States', 'class': 'boundary', 'type': 'administrative', 'importance': 0.8274377648053739, 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_boundary_administrative.p.20.png'}
37.3361663 -121.890591


In [22]:
# update data type of the 'postal_code' column of business dataframe to string type
data['postal_code'] = data.postal_code.astype(str)

In [23]:
class Recommender:
    
    def __init__(self, n=5, original_score=False):
        """initiate a Recommender object by passing the desired number of recommendations to make, the default number is 10.
        By default, the adjusted score will be used for ranking; To rank by the original average rating of the restaurant, pass original_score=True
        """
        self.n = n # number of recommendations to make, default is 5
        self.original_score = original_score # boolean indicating whether the original average rating or the adjusted score is used
        # initiate a list of column names to display in the recommendation results
        self.column_to_display = ['state','city','name','address','attributes.RestaurantsPriceRange2','cuisine','style','review_count','stars','adjusted_score']
        
        # initiate the list of recommendations to be all the open restaurants from the entire catalog of 'business' dataframe sorted by the score of interest
        if self.original_score:  # set sorting criteria to the originial star rating
            score = 'stars'
        else:  # set sorting criteria to the adjusted score
            score = 'adjusted_score'
        self.recomm = data[data.is_open == 1].sort_values(score, ascending=False)
        
    def _filter_by_location(self):
        """Filter and update the dataframe of recommendations by the matching location of interest.
        A combination of state, city and zipcode is used as the location information, partially missing information can be handled. 
        Matching restaurant is defined as the restaurant within the acceptable distance (max_distance) of the location of interest.
        note: this hidden method should only be called within the method 'keyword'
        """       
        from geopy.geocoders import Nominatim
        from geopy.exc import GeocoderTimedOut
        geolocator = Nominatim(user_agent="yelp_recommender") # use geopy.geocoders to make geolocation queries
        address = [self.city, self.state, self.zipcode]
        address = ",".join([str(i) for i in address if i != None])
        # use geolocate query to find the coordinate for the location of interest
        try:
            location = geolocator.geocode(address, timeout=10) 
        except GeocoderTimedOut as e:
            print("Error: geocode failed to locate the address of interest {} with message {}".format(address, e.message))            

        # calculate the geodesic distance between each restaurant and the location of interest and add as a new column ''distance_to_interest'
        self.recomm['distance_to_interest'] = self.recomm.apply(lambda row: great_circle_mile(row.latitude, row.longitude, location.latitude, location.longitude), axis=1)
        # add the new column 'distance_to_interest' to the list of columns to display in the recommendation result
        self.column_to_display.insert(0, 'distance_to_interest')
        # filter by the desired distance
        self.recomm = self.recomm[self.recomm.distance_to_interest <= self.max_distance]

    def _filter_by_state(self):
        """ Filter and update the dataframe of recommendations by the matching state.
        note: this hidden method should only be called within the method 'keyword'
        """
        self.recomm = self.recomm[self.recomm.state == self.state]
    
    def _filter_by_cuisine(self):
        """ Filter and update the dataframe of recommendations by the matching cuisine of interest. 
        note: this hidden method should only be called within the method 'keyword'
        """                         
        idx = []
        for i in self.recomm.index: 
            if self.recomm.loc[i,'cuisine'] is not np.nan:
                entries = self.recomm.loc[i,'cuisine'].split(',')
                if self.cuisine in entries:
                    idx.append(i)
        self.recomm = self.recomm.loc[idx]
    
    def _filter_by_style(self):  
        """ Filter and update the dataframe of recommendations by the matching style of interest. 
        note: this hidden method should only be called within the method 'keyword'
        """
        idx = []
        for i in self.recomm.index: 
            if self.recomm.loc[i,'style'] is not np.nan:
                entries = self.recomm.loc[i,'style'].split(',')
                if self.style in entries:
                    idx.append(i)
        self.recomm = self.recomm.loc[idx]
    
    def _filter_by_price(self):
        """Filter and update the dataframe of recommendations by the matching price range of interest. 
        note: this hidden method should only be called within the method 'keyword'
        """
        self.recomm = self.recomm[self.recomm['attributes.RestaurantsPriceRange2'].isin(self.price)]
    
    def display_recommendation(self):
        """ Display the list of top n recommended restaurants
        """
        if len(self.recomm) == 0:
            print("Sorry, there is no matching recommendations.")
        elif self.n < len(self.recomm):  # display only the top n from the recommendation list
            print("Below is a list of the top {} recommended restaurants for you: ".format(self.n))
            print(self.recomm.iloc[:self.n][self.column_to_display])
        else:  # display all if # of recommendations is less than self.n
            print("Below is a list of the top {} recommended restaurants for you: ".format(len(self.recomm)))
            print(self.recomm[self.column_to_display])
    
    # non-personalized keyword filtering recommender module
    def keyword(self, df=data[data.is_open == 1], zipcode=None, city=None, state=None, max_distance=10, cuisine=None, style=None, price=None):
        """Non-personalized recommendation by keyword filtering: 
        Support filtering by the distance and location (zipcode, city, state) of interest, 
        by the desired cuisine, by the desired style, and by the desired price range. 
        The module supports multiple price range inputs separated by comma.
        ---
        Note:
        df: the default restaurant catalog is all the open restaurants in the 'business' dataframe, 
            if a subset is prefered, e.g. previous filtered result, the subset can be passed to df
        state: needs to be the upper case of the state abbreviation, e.g.: 'NV', 'CA'
        max_distance: the max acceptable distance between the restaurant and the location of interest, unit is in miles, default is 10
        ---
        """
        # re-initiate the following variables every time the module is called so that the recommendation starts fresh
        self.recomm = df # start with the desired restaurant catalog
        self.recomm['distance_to_interest'] = np.nan # reset the distance between each restaurant and the location of interest
        self.column_to_display = ['state','city','name','address','attributes.RestaurantsPriceRange2','cuisine','style','review_count','stars','adjusted_score'] # reset the columns to display
        
        # assign variables based on user's keyword inputs
        self.zipcode = zipcode
        self.city = city
        self.state = state
        self.max_distance = max_distance
        self.cuisine = cuisine
        self.style = style
        self.price = price 
             
        # filter by restaurant location
        if (self.zipcode != None) or (self.city != None) or (self.state != None):      
            if (self.zipcode != None) or (self.city != None): # use zipcode and/or city whenever available
                self._filter_by_location()
            else: # filter by state if state is the only location information available
                self._filter_by_state()
            if len(self.recomm) == 0:
                print("no restaurant found for the matching location of interest.")
                return None
            
          # filter by restaurant 'cuisine'
        if self.cuisine != None:
            self._filter_by_cuisine()
            if len(self.recomm) == 0:
                print("no restaurant found for the matching cuisine of {}".format(self.cuisine))
                return None
    
        # filter by restaurant 'style'
        if self.style != None:
            self._filter_by_style() 
            if len(self.recomm) == 0:
                print("no restaurant found for the matching style of {}".format(self.style))
                return None
        
        # filter by restaurant price range
        if self.price != None:
            self.price = [i.strip() for i in price.split(',')] #extract multiple inputs of price range
            self._filter_by_price()
            if len(self.recomm) == 0:
                print("no restaurant found for the matching price of {}".format(self.price))
                return None
        
        # sort the matching list of restaurants by the score of interest
        if self.original_score:  # set sorting criteria to the originial star rating
            score = 'stars'
        else:  # set sorting criteria to the adjusted score
            score = 'adjusted_score'
        self.recomm = self.recomm.sort_values(score, ascending=False)
        
        # display the list of top n recommendations
        self.display_recommendation()
        
        return self.recomm

In [24]:
%%time
# initiate a Recommender object
kw = Recommender(n=3)

# test0: display only (same as no keywords)
print("------\nresult from test0 (display only): ")
kw.display_recommendation()

# test1: no keywords
print("------\nresult from test1 (no keywords): ")
kw.keyword();

# test 2: a combination of city, state and zipcode
print("------\nresult from test2 (a combination of city and state): ")
kw.keyword(city='Phoenix', state='AZ', zipcode='85023');

# test 3: a combination of cuisine and style
print("------\nresult from test3 (a combination of cuisine and style): ")
kw.keyword(cuisine='barbeque', style='restaurants');

# test 4: a combination of state, cuisine and style
print("------\nresult from test4 (a combination of state, cuisine and style): ")
kw.keyword(state='NV', cuisine='desserts', style='restaurants');

# test 5: no matching location
print("------\nresult from test5 (no matching location): ")
kw.keyword(city='milpitas', zipcode='95035');

# test 6: no matching 'cuisine'
print("------\nresult from test6 (no matching cuisine): ")
kw.keyword(cuisine='abc');

# test 7: no matching 'style'
print("------\nresult from test7 (no matching style): ")
kw.keyword(style='abc');

# test 8: a combination of location, cuisine and style
print("------\nresult from test8 (a combination of location, cuisine and style): ")
kw.keyword(city='Phoenix', zipcode='85023',cuisine='barbeque', style='restaurants');

# test 9: a combination of price range, cuisine and style
print("------\nresult from test9 (a combination of price range, cuisine and style): ")
kw.keyword(price='1', cuisine='barbeque', style='restaurants');

# test 10: a combination of two price ranges, location, cuisine and style
print("------\nresult from test10 (a combination of two price ranges, location, cuisine and style): ")
kw.keyword(price='1, 2', zipcode='85023',cuisine='barbeque', style='restaurants');

# test 11: use the original average rating and return top 10 recommendations
print("------\nresult from test11 (top 10 recommendations ranked by original average rating): ")
kw2 = Recommender(n=10, original_score=True)
kw2.keyword(city='Phoenix', zipcode='85023',cuisine='barbeque', style='restaurants');

------
result from test0 (display only): 
Below is a list of the top 3 recommended restaurants for you: 
       state         city                name          address  \
12307     MO  Saint Louis     Blues City Deli  2438 McNair Ave   
134154    NV       Sparks    Carlillos Cocina  415 S Rock Blvd   
25015     LA  New Orleans  Free Tours By Foot   2613 Laurel St   

       attributes.RestaurantsPriceRange2                             cuisine  \
12307                                  1  american (traditional), sandwiches   
134154                                 2                             mexican   
25015                                NaN                                 NaN   

                                                   style  review_count  stars  \
12307          delis, bars, restaurants, nightlife, pubs           991    5.0   
134154  bars, breakfast & brunch, restaurants, nightlife           799    5.0   
25015                                                NaN          