In [458]:
import numpy as np
import pandas as pd 
import requests 
import json

import re 

import scrapy
from scrapy.crawler import CrawlerProcess 
from scrapy.selector import HtmlXPathSelector
from bs4 import BeautifulSoup
from scipy.sparse.linalg import svds
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

## Scraping restaurants

In [197]:
headers = {
            'accept-encoding': 'gzip, deflate, sdch, br',
            'accept-language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'cache-control': 'max-age=0',
        }

url = 'https://www.openrice.com/en/hongkong/restaurants?page='
base_url = 'https://www.openrice.com'
links =[]


In [482]:
class linksFinder:

    def __init__(self):
        self.base_url = ''
        self.url = ''
        self.headers = {}
        self.restaurants_links = []
        self.restaurants_info = []

    def get_url(self, base_url, url, headers):
        self.base_url = base_url
        self.url = url
        self.headers = headers

    def get_links(self, start_num, end_num):

        for i in range(start_num, end_num):
            link = self.url + str(i)
            request = requests.get(link, headers=self.headers)
            soup = BeautifulSoup(request.text, 'lxml')

            for h2tag in soup.findAll('h2'):
                for atag in h2tag.findAll('a'):
                    link_tail = atag.get('href')
                    full_link = self.base_url + link_tail

                    self.restaurants_links.append(full_link)

        return print('Scraping done!')

    def save_links(self):
        with open('restaurants_links.txt', 'w') as output_file:
            for link in self.restaurants_links:
                output_file.write(link + '\n')

    def get_restaurants_info(self):
        for link in self.restaurants_links:
            request = requests.get(link, headers=self.headers)
            soup = BeautifulSoup(request.text, 'lxml')

            name = soup.find('div', {'class': 'poi-name'}).find('span').get_text()
            cuisine = soup.find('div', {'class': 'header-poi-categories dot-separator'}).get_text().strip().split('\n')
            rating = soup.find('div', {'class': 'header-score'}).get_text()
            rating = float(rating)

            bookmark = soup.find('div', {'class': 'header-bookmark-count js-header-bookmark-count'}).get_text()
            bookmark = int(bookmark)

            address = soup.find('div', {'class': 'content'}).find('a').get_text().strip()
            district = soup.find('div', {'class': 'header-poi-district dot-separator'}).get_text().split('\n')[1]

            price = soup.find('div', {'class': 'header-poi-price dot-separator'}).get_text().strip()

            review = soup.find('div', {'class': 'main-menu table-center'}).find_all('a', href=True)[1].get_text()
            review_count = re.search(r'\d+', review).group()
            review_count = int(review_count)

            happy = soup.find('div', {'class': 'header-smile-section'}).get_text().split('\n')[2]
            happy = int(happy)
            okay = soup.find('div', {'class': 'header-smile-section'}).get_text().split('\n')[4]
            okay = int(okay)
            sad = soup.find('div', {'class': 'header-smile-section'}).get_text().split('\n')[6]
            sad = int(sad)

            all_items = soup.find_all('div', {'class': 'condition-item'})
            condition_item_avail = []
            for item in all_items:
                check_class = item.find('span').get('class')[1]
                condition = item.find('span', {'class': 'condition-name'}).get_text()
                if check_class == 'd_sr2_lhs_tick_desktop':
                    condition_item_avail.append(condition)

            info = {
                "name": name,
                "cuisine": cuisine,
                "rating": rating,
                "bookmark": bookmark,
                "price-range": price,
                "address": address,
                "district": district,
                "review_count": review_count,
                "review_happy": happy,
                "review_okay": okay,
                "review_sad": sad,
                "available_condition": condition_item_avail
            }

            self.restaurants_info.append(info)

        return print('all restaurant info is scraped!')

    def save_json(self):

        with open('restaurants_info.json', 'w') as file:
            json.dump(self.restaurants_info, file, indent=4, separators=(',', ':'))

        return print('the json file is saved!')

In [None]:
Finder = linksFinder()

Finder.get_url(base_url, url, headers)
Finder.get_links(start_num=1, end_num=30)

Finder.save_links()
Finder.get_restaurants_info()
Finder.save_json()

In [203]:
restaurant_info

[{'name': 'Burgeroom',
  'cuisine': ['American', 'Hamburger', 'Fast Food'],
  'rating': 4,
  'bookmark': 46366,
  'price-range': '$51-100',
  'address': 'Shop D, G/F, Food Street, 50-56 Paterson Street, Fashion Walk, Causeway Bay',
  'district': 'Causeway Bay',
  'review_count': 1272,
  'review_happy': 1032,
  'review_okay': 112,
  'review_sad': 63,
  'available_condition': ['Alcoholic Drinks',
   'May Bring Your Own Wine',
   'Cake-cutting',
   'Parking',
   'Delivery',
   'Outdoor Seating',
   'Spot payment']},
 {'name': '心之食堂',
  'cuisine': ['Japanese',
   'International',
   'Sushi/Sashimi',
   'Skewer',
   'Izakaya'],
  'rating': 4,
  'bookmark': 15567,
  'price-range': '$201-400',
  'address': 'Room 1203, 12/F, Bartlock Centre, 3 Yiu Wa Street, Causeway Bay',
  'district': 'Causeway Bay',
  'review_count': 894,
  'review_happy': 780,
  'review_okay': 47,
  'review_sad': 27,
  'available_condition': ['Online Reservation',
   'Wi-Fi',
   'Open Till Late',
   'Alcoholic Drinks',
   

## Processing restaurant info

Because the scrapped information of restaurant is not ideal for later steps, I need to perform processing first

For testing purpose, I import the scrapped json document, which is stored in my local drive


In [474]:
df = pd.DataFrame()

df = pd.read_json('restaurants_info.json')

df.shape

(370, 12)

As it turns out, the openrice restaurant is not good for scraping a wide range of pages it will re-cycle restaurants in later pages 

In [475]:
df.drop_duplicates(['name','address'],inplace = True)
df.shape

(250, 12)

In [256]:
def unpack_cuisine(source, target = 3, defaultValue=None):
    n = len(source)
    if n < target:
        return [*source, *([defaultValue] * (target - len(source)))]
    elif n > target:
        return source[0:target]
    else:
        return source

In [257]:
df['cuisine_main_type'], df['cuisine_sub_type'], df['cuisine_minor_type'] = zip(*df['cuisine'].apply(lambda x: unpack_cuisine(x)))
df.drop('cuisine',axis = 1, inplace = True)

columns = ['name',
           'address',
           'district',
           'cuisine_main_type', 
           'cuisine_sub_type',
           'cuisine_minor_type',
           'available_condition',
           'price-range',
           'rating',
           'bookmark',
           'review_count',
           'review_happy',
           'review_okay',
           'review_sad']

df = df[columns]

In [258]:
df.tail()

Unnamed: 0,name,address,district,cuisine_main_type,cuisine_sub_type,cuisine_minor_type,available_condition,price-range,rating,bookmark,review_count,review_happy,review_okay,review_sad
245,梨滿園韓國料理,"1/F, Witty Commercial Building Deli2, 1A-1L Tu...",Mong Kok,Korean,Korean BBQ,,"[Online Reservation, Wi-Fi, Alcoholic Drinks, ...",$101-200,4.0,10424,521,452,25,16
246,榮哥廚房私房菜,"Shop L, 4/F, Phase 3, Kwun Tong Industrial Cen...",Kwun Tong,Guangdong,International,Stir-Fry,"[Online Reservation, Wi-Fi, Alcoholic Drinks, ...",$201-400,4.5,12136,335,304,7,6
247,Cafe Paradise,"G/F, 412 Portland Street, Prince Edward",Prince Edward,Italian,Western,Dessert,"[Online Reservation, Alcoholic Drinks, Cake-cu...",$51-100,4.0,8795,355,316,14,4
248,桂花小幸,"Shop 133-135, 1/F, TKO Plaza, 1 Tong Tak Stree...",Tseung Kwan O,Taiwan,Taiwanese Drink,,[Spot payment],$51-100,4.0,4154,383,281,49,13
249,家賀屋刺身寿司專門店,"1/F, 83 Wan Chai Road, Wan Chai",Wan Chai,Japanese,Sushi/Sashimi,,"[Alcoholic Drinks, Phone Reservation, Catering...",$101-200,4.0,14669,472,417,16,11


## Search engine

<h3> Filtering option: </h3>

name

name_contain

district

country

dish

available condition:

 ['Online Reservation',
 'Wi-Fi',
 'Alcoholic Drinks',
 'May Bring Your Own Wine',
 'Cake-cutting',
 'VIP Room',
 'Phone Reservation',
 'Reward Dining Points',
 '10% Service Charge',
 'Parking',
 'Delivery',
 'Spot payment',
 'Outdoor Seating',
 'Open Till Late',
 'TV Broadcast',
 'Exclusive Online Booking',
 'Sea View',
 'Live Music',
 'Smoking Area',
 'Eco-Friendly',
 'Live Sports Broadcast',
 'Vegetarian',
 'Certified Halal Food',
 'All You Can Eat Hotpot',
 'Catering Service']

price_range:

Below 50, 51-100, 101-200, 201-400, 401-800, Above 801

In [486]:
class searchEngine:

    def __init__(self):
        self.df = pd.DataFrame()

    def get_json(self, json_doc):

        self.df = pd.read_json(json_doc)
        self.df.drop_duplicates(['name', 'address'], inplace=True)

        def unpack_cuisine(source, target=3, defaultValue=None):
            n = len(source)
            if n < target:
                return [*source, *([defaultValue] * (target - len(source)))]
            elif n > target:
                return source[0:target]
            else:
                return source

        self.df['cuisine_main_type'], self.df['cuisine_sub_type'], self.df['cuisine_minor_type'] = zip(
            *self.df['cuisine'].apply(lambda x: unpack_cuisine(x)))
        self.df.drop('cuisine', axis=1, inplace=True)

        columns = ['name',
                   'address',
                   'district',
                   'cuisine_main_type',
                   'cuisine_sub_type',
                   'cuisine_minor_type',
                   'available_condition',
                   'price-range',
                   'rating',
                   'bookmark',
                   'review_count',
                   'review_happy',
                   'review_okay',
                   'review_sad']

        self.df = self.df[columns]

    def search(self, filter_select):
        restaurant_list = self.df.copy()
        restaurant_to_remove = []

        if 'name' in filter_select:

            for i in range(len(self.df)):
                if filter_select['name'] != self.df['name'][i]:
                    restaurant_to_remove.append(i)

        if 'name_contain' in filter_select:

            for i in range(len(self.df)):
                flag = 0
                for pattern in range(len(filter_select['name_contain'])):
                    if re.search(filter_select['name_contain'][pattern], self.df['name'][i]):
                        break
                if flag == 0:
                    restaurant_to_remove.append(i)

        if 'country' in filter_select:

            for i in range(len(self.df)):
                if filter_select['country'] != self.df['cuisine_main_type'][i]:
                    restaurant_to_remove.append(i)

        if 'dish' in filter_select:

            for i in range(len(self.df)):
                if filter_select['dish'] != self.df['cuisine_sub_type'][i] and \
                        filter_select['dish'] != self.df['cuisine_minor_type'][i]:
                    restaurant_to_remove.append(i)

        if 'avail_cond' in filter_select:

            for i in range(len(self.df)):
                cond_count = len(self.df['available_condition'][i])
                for option in filter_select['avail_cond']:
                    if option not in self.df['available_condition'][i]:
                        cond_count = cond_count - 1

                if cond_count <= 0:
                    restaurant_to_remove.append(i)

        if 'price range' in filter_select:

            min_range = int(filter_select['price range'][0:filter_select['price range'].find('-')])

            max_range = int(filter_select['price range'][filter_select['price range'].find('-') + 1:])

            for i in range(len(self.df)):
                if self.df['price-range'][i] == 'Below $50':
                    price_min = 0
                    price_max = 50
                elif self.df['price-range'][i] == '$51-100':
                    price_min = 51
                    price_max = 100
                elif self.df['price-range'][i] == '$101-200':
                    price_min = 101
                    price_max = 200
                elif self.df['price-range'][i] == '$201-400':
                    price_min = 201
                    price_max = 400
                elif self.df['price-range'][i] == '$401-800':
                    price_min = 401
                    price_max = 800
                elif self.df['price-range'][i] == 'Above $801':
                    price_min = 801
                    price_max = 10000

                if price_min < min_range or price_max > max_range:
                    restaurant_to_remove.append(i)

        for index_remove in set(restaurant_to_remove):
            restaurant_list.drop(index=index_remove, axis=0, inplace=True)

        return restaurant_list

    def top_N_similar(self, restaurant, district, n):
        scale = StandardScaler()
        list_similar = []

        if (restaurant != None) & (district == None):
            target_idx = self.df[self.df['name'] == restaurant].index[0]

        elif (restaurant == None) & (district != None):
            target_idx = self.df[self.df['district'] == district].index[0]

        else:
            target_idx = self.df[(self.df['name'] == restaurant) & (self.df['district'] == district)].index[0]

        n = (n + 1) * -1

        df_avail_cond = self.df['available_condition'].str.join('|').str.get_dummies()
        df_sim = pd.concat([self.df, df_avail_cond], axis=1)
        df_sim.drop(['available_condition'], axis=1, inplace=True)

        df_sim_matrix = df_sim.drop(['name', 'address'], axis=1)
        df_sim_matrix = pd.get_dummies(df_sim_matrix, drop_first=True)

        df_sim_matrix.iloc[:, :6] = scale.fit_transform(df_sim_matrix.iloc[:, :6])

        df_sim['similarity'] = list(cosine_similarity(df_sim_matrix, dense_output=True))

        top_n_indices = np.argsort(df_sim['similarity'][target_idx])[n:]
        top_n_indices = top_n_indices[:-1]

        for index in top_n_indices:
            list_similar.append(self.df.iloc[index, :])

        df_sim_results = pd.DataFrame(list_similar)

        return df_sim_results

        return restaurant_list

In [487]:
engine = searchEngine()
engine.get_json('restaurants_info.json')

results = engine.search({'country': 'Japanese',
                           'avail_cond':['Online Reservation','Alcoholic Drinks','Phone Reservation']})

In [488]:
result

Unnamed: 0,name,address,district,cuisine_main_type,cuisine_sub_type,cuisine_minor_type,available_condition,price-range,rating,bookmark,review_count,review_happy,review_okay,review_sad
5,心之食堂,"Room 1203, 12/F, Bartlock Centre, 3 Yiu Wa Str...",Causeway Bay,Japanese,International,Sushi/Sashimi,"[Online Reservation, Wi-Fi, Open Till Late, Al...",$201-400,4.0,15569,894,780,47,27
7,鉄人旨花,"2/F, Shining building, 477-481 Jaffe Road, Cau...",Causeway Bay,Japanese,Teppanyaki,Sushi/Sashimi,"[Alcoholic Drinks, May Bring Your Own Wine, Ca...",Above $801,4.5,16798,1058,1012,16,11
11,七福神和食亭,"1/F, 311-313 Nathan Road, Jordan",Jordan,Japanese,Sushi/Sashimi,,"[Online Reservation, Alcoholic Drinks, May Bri...",$101-200,4.0,24945,993,826,72,32
19,漁獲浜燒,"18/F, Macau Yat Yuen Centre, 525 Hennessy Road...",Causeway Bay,Japanese,,,"[Online Reservation, Wi-Fi, Alcoholic Drinks, ...",$201-400,4.5,26754,732,698,10,5
21,Day and Nite by Master Kama,"1/F, 50 Shan Tung Street, Mong Kok",Mong Kok,Japanese,Italian,Seafood,"[Online Reservation, Phone Reservation, Reward...",$101-200,4.0,40880,857,646,89,48
31,嵐山日本料理,"G/F, 11 Minden Avenue, Tsim Sha Tsui",Tsim Sha Tsui,Japanese,Sushi/Sashimi,Skewer,"[Online Reservation, Wi-Fi, Alcoholic Drinks, ...",$201-400,4.0,23243,545,489,24,5
39,焱丸水產,"Shop 5&6, G/F, Wah Fung Building, 23 Minden Av...",Tsim Sha Tsui,Japanese,Seafood,Sushi/Sashimi,"[Online Reservation, Phone Reservation, Reward...",$201-400,4.0,31248,721,605,42,21
42,FireBird,"G/F, Hotel Pennington by Rhombus, 13 Penningto...",Causeway Bay,Japanese,Skewer,Izakaya,"[Online Reservation, Alcoholic Drinks, May Bri...",$201-400,4.0,16058,552,496,19,14
43,和匠日式燒肉店,"Shop 3, G/F, Tak Wai Building, 23-27 Cheong Lo...",Jordan,Japanese,Roast Meat,Skewer,"[Online Reservation, Wi-Fi, Alcoholic Drinks, ...",$201-400,4.0,17805,698,635,26,19
44,極尚大喜屋日本料理,"Shop 102, 1/F, Albion Plaza, 2-6 Granville Roa...",Tsim Sha Tsui,Japanese,Sushi/Sashimi,All-you-can-eat,"[Online Reservation, Wi-Fi, Alcoholic Drinks, ...",$201-400,4.0,76447,684,499,55,25


## Recommendation system

In [470]:
df[df['name']=='Yadllie Plate']

Unnamed: 0,name,address,district,cuisine_main_type,cuisine_sub_type,cuisine_minor_type,available_condition,price-range,rating,bookmark,review_count,review_happy,review_okay,review_sad
2,Yadllie Plate,"11/F, CTMA Centre, 1 Sai Yeung Choi Street, Mo...",Mong Kok,Korean,Korean Fried Chicken,,"[Online Reservation, Wi-Fi, Alcoholic Drinks, ...",$101-200,4.0,51289,946,718,97,40


In [490]:
df_sim_results = engine.top_N_similar('FireBird', None, 3)

df_sim_results

Unnamed: 0,name,address,district,cuisine_main_type,cuisine_sub_type,cuisine_minor_type,available_condition,price-range,rating,bookmark,review_count,review_happy,review_okay,review_sad
63,鳥居居酒屋,"17/F, Macau Yat Yuen Centre, 525 Hennessy Road...",Causeway Bay,Japanese,Robatayaki,Skewer,"[Online Reservation, Alcoholic Drinks, May Bri...",$201-400,4.0,20823,626,545,22,29
69,琥酌,"G/F, 159A Sai Yeung Choi Street North, Prince ...",Prince Edward,Japanese,Roast Meat,Izakaya,"[Online Reservation, Alcoholic Drinks, May Bri...",$201-400,4.0,17746,512,444,26,8
229,備長,"26/F, Macau Yat Yuen Centre, 525 Hennessy Road...",Causeway Bay,Japanese,Skewer,Izakaya,"[Online Reservation, Alcoholic Drinks, May Bri...",$201-400,4.0,10230,368,310,21,13
