# TripAdvisor scraper for restaurant data

In [1]:
# Mount Google Drive at /content/drive (Colab only). Later cells read the URL
# CSVs from, and write the scraped JSON to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from lxml import html
import pandas as pd
import requests
from collections import OrderedDict
import json
import argparse
import re
import os
import sys
import numpy as np

In [3]:
def clean(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    ``text`` may be a string or an iterable of string fragments; fragments are
    concatenated before normalising. Newlines, carriage returns and tabs count
    as whitespace.

    Returns:
        The normalised string, or ``None`` when ``text`` is empty or falsy.
    """
    if not text:
        return None
    joined = ''.join(text)
    words = joined.split()
    return ' '.join(words).strip()

In [4]:
def process_request(url, retry=0):
    """Download ``url`` and return the data scraped from the page.

    Args:
        url: Address of the page to fetch.
        retry: Number of additional attempts made when the request itself
            fails (connection error, timeout, ...). The default of 0 means a
            single attempt.

    Returns:
        ``{'error': 'Page not found', 'status_code': 404}`` when the server
        answers 404, otherwise whatever ``process_page`` returns for the
        parsed document.

    Raises:
        requests.exceptions.RequestException: when the request still fails
            after all attempts.
    """
    print('Fetching {}'.format(url))
    headers = {
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
                "cache-control": "max-age=0",
                "upgrade-insecure-requests": "1",
                "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36",
            }
    # Without a timeout a stalled connection would block the whole scraping run.
    timeout_seconds = 30
    attempts_left = retry
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=timeout_seconds)
            break
        except requests.exceptions.RequestException:
            if attempts_left <= 0:
                raise
            attempts_left -= 1

    if response.status_code == 404:
        return {'error': 'Page not found', 'status_code': 404}

    # The URL is passed as the document's base URL.
    parser = html.fromstring(response.text, url)
    return process_page(parser, url)


def _first_match(pattern, text):
    """Return the first substring of ``text`` matching ``pattern``, or ``''``."""
    found = re.search(pattern, text)
    return found.group(0) if found else ''


def _extract_tag_values(script_text, key):
    """Collect the ``tagValue`` entries of one tag section of the page script.

    Args:
        script_text: Whitespace-normalised text of the page's script elements.
        key: Section name to look for, e.g. ``'cuisines'``.

    Returns:
        Every ``tagValue`` followed by ``', '``, concatenated in order (so a
        non-empty result ends with ``', '``), or ``''`` when the section is
        missing or cannot be parsed.
    """
    pattern = (
        r'\]\},"' + re.escape(key) + r'":\{"'
        r"""[A-Za-z0-9 ,"'.\-:\[\{\}]{10,}\]"""
    )
    try:
        section = _first_match(pattern, script_text)
        # The JSON array starts at a fixed offset of 34 characters plus the key
        # length (42 for 'features'/'cuisines', 39 for 'meals', 53 for
        # 'dietaryRestrictions').
        # NOTE(review): the offset is tied to the page markup - confirm if the
        # site layout changes.
        tags = json.loads(section[len(key) + 34:])
        return ''.join(tag['tagValue'] + ', ' for tag in tags)
    except Exception:
        # Best effort: a missing or malformed section yields an empty value.
        return ''


def process_page(parser, url):
    """Extract restaurant details from a parsed page.

    Most fields are pulled with regular expressions from the concatenated text
    of the page's ``<script>`` elements; rating and address come from the first
    ``application/ld+json`` script. Every field is best effort and falls back
    to an empty value when it cannot be found or parsed.

    Args:
        parser: Parsed document exposing ``xpath(query)``.
        url: Address of the page. Accepted for interface compatibility; not
            used by the extraction itself.

    Returns:
        dict with the keys ``name``, ``features``, ``cuisines``, ``meals``,
        ``special meals``, ``official_description``, ``rating`` (float),
        ``street``, ``country``, ``region``, ``coords`` (dict with string
        ``latitude``/``longitude``) and ``image``.
    """
    script_text = ' '.join(''.join(parser.xpath('//script//text()')).split())

    # 27 is the length of the matched prefix; -1 drops the closing quote.
    name_match = _first_match(
        r'''FoodEstablishment","name":"[a-zA-Z0-9 .\-']{2,}"''', script_text)
    raw_name = name_match[27:-1] if name_match else ''

    # 15 is the length of the matched '"description":"' prefix.
    description_match = _first_match(
        r'''"description":"[A-Za-z0-9 ,'.\-]{10,}''', script_text)
    raw_description = description_match[15:] if description_match else ''

    features = _extract_tag_values(script_text, 'features')
    cuisines = _extract_tag_values(script_text, 'cuisines')
    meals = _extract_tag_values(script_text, 'meals')
    dietary = _extract_tag_values(script_text, 'dietaryRestrictions')

    try:
        image_match = _first_match(
            r'''"images":\{[A-Za-z0-9 ,'".\-:\{\}/]{10,}\}\}''', script_text)
        # Strip the '"images":' prefix (9 characters) and the last brace
        # before decoding the remainder as JSON.
        image = json.loads(image_match[9:-1])['original']['url']
    except Exception:
        image = ''

    # Coordinates are only kept when both of them are present.
    lat = _first_match(r'latitude":[0-9 -]{1,}\.[0-9]{1,}', script_text)
    lon = _first_match(r'longitude":[0-9 -]{1,}\.[0-9]{1,}', script_text)
    if not (lat and lon):
        lat = ''
        lon = ''

    XPATH_FULL_ADDRESS_JSON = '//script[@type="application/ld+json"]//text()'
    raw_address_json = parser.xpath(XPATH_FULL_ADDRESS_JSON)

    hotel_rating = 0
    address = {}
    if raw_address_json:
        try:
            parsed_address_info = json.loads(raw_address_json[0])
            rating = parsed_address_info.get('aggregateRating') or {}
            hotel_rating = rating.get('ratingValue')
            parsed_address = parsed_address_info.get('address') or {}
            if isinstance(parsed_address, dict):
                address = parsed_address
        except (ValueError, TypeError, AttributeError):
            # Malformed JSON or an unexpected structure: fall back to the
            # defaults, like every other best-effort field of this function.
            hotel_rating = 0
            address = {}

    try:
        rating_value = float(hotel_rating) if hotel_rating else 0.0
    except (TypeError, ValueError):
        rating_value = 0.0

    # addressCountry may be an object carrying a "name" or a plain string.
    country = address.get('addressCountry', {})
    if isinstance(country, dict):
        country = country.get('name')
    elif not isinstance(country, str):
        country = None

    data = {
            'name': raw_name,
            'features': features,
            'cuisines' : cuisines,
            'meals' : meals,
            'special meals' : dietary,
            'official_description': raw_description,
            'rating': rating_value,
            'street': address.get('streetAddress'),
            'country': country,
            'region': address.get('addressRegion'),
            'coords': {
                # Drop the 'latitude":' / 'longitude":' prefixes of the matches.
                'latitude' : lat[10:],
                'longitude' : lon[11:]
                },
            'image' : image,
    }
    return data

In [5]:
def info_request(url):
    """Scrape one page and return its data.

    Thin wrapper that forwards ``url`` to ``process_request`` and returns its
    result unchanged; any exception raised there propagates to the caller.
    """
    return process_request(url)
    

def save_json(scraped_data, path='/content/drive/MyDrive/final_dataset.json'):
    """Write ``scraped_data`` to ``path`` as indented JSON.

    Args:
        scraped_data: JSON-serialisable object to store.
        path: Destination file; it is overwritten. Defaults to the dataset
            file on the mounted Drive.

    Returns:
        The string ``'Done!'``.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding is fixed to UTF-8 instead of depending on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(scraped_data, f, indent=4, ensure_ascii=False)
    return 'Done!'

def get_dataset(df, glb_cnt = 0):
    """Scrape every URL in ``df.url`` and index the results.

    Args:
        df: Object whose ``url`` attribute iterates over the page addresses
            (the notebook passes a DataFrame with a ``url`` column).
        glb_cnt: Index assigned to the first scraped page; later pages get
            consecutive indices.

    Returns:
        Tuple ``(next_index, results)`` where ``results`` maps each integer
        index to the value returned by ``info_request`` and ``next_index`` is
        ``glb_cnt`` plus the number of URLs processed.
    """
    fnl_res = {}
    cnt = glb_cnt
    # The loop ends when the URLs are exhausted. The former stop condition
    # compared against df.size (rows x columns), which is not the row count.
    for url in df.url:
        fnl_res[cnt] = info_request(url)
        cnt += 1
        print(cnt)
    return cnt, fnl_res

In [6]:
# os.listdir returns names in arbitrary order; sort them so that positional
# lookups into `files` are reproducible between runs.
files = sorted(os.listdir('/content/drive/MyDrive/parts2'))

In [7]:
files

['Restaurants_Ankara_2021-04-01.csv',
 'Restaurants_Antalya_2021-04-01.csv',
 'Restaurants_Bodrum_2021-04-01.csv',
 'Restaurants_California_2021-04-01.csv',
 'Restaurants_China_2021-04-01.csv',
 'Restaurants_Iran_2021-04-01.csv',
 'Restaurants_Istanbul_2021-04-01.csv',
 'Restaurants_New Yourk_2021-04-01.csv',
 'Restaurants_Texas_2021-04-01.csv',
 'Restaurants_Tokyo_2021-04-01.csv']

In [8]:
# Select one CSV by its position in `files`.
# NOTE(review): 7 is a magic index whose meaning depends on the directory
# listing; the recorded run resolved it to 'Restaurants_New Yourk_2021-04-01.csv'.
file = files[7]

'Restaurants_New Yourk_2021-04-01.csv'

In [9]:
# Load the CSV selected in `file` first, then every CSV in `files`, and stack
# them in that order with a single concat (DataFrame.append was removed in
# pandas 2.0 and copies the whole frame on every call). The selected file is
# therefore loaded twice, exactly as before, so its rows keep leading the frame.
csv_path_template = "/content/drive/MyDrive/parts2/{}"
frames = [pd.read_csv(csv_path_template.format(file))]
frames.extend(pd.read_csv(csv_path_template.format(name)) for name in files)
df = pd.concat(frames)

In [10]:
# Keep a single row per URL: the loading cell reads the file chosen in `file`
# twice, so its URLs appear more than once in the stacked frame.
df = df.drop_duplicates(subset=['url'])

In [17]:
# df = pd.read_csv("/content/drive/MyDrive/parts/{}".format(file))

In [18]:
df.head()

Unnamed: 0,url
0,https://www.tripadvisor.com/Restaurant_Review-...
1,https://www.tripadvisor.com/Restaurant_Review-...
2,https://www.tripadvisor.com/Restaurant_Review-...
3,https://www.tripadvisor.com/Restaurant_Review-...
4,https://www.tripadvisor.com/Restaurant_Review-...


In [19]:
# df = df.iloc[50:] 

In [20]:
#  df['hotel_url'][2000]

In [11]:
len(df.index)

329

In [14]:
  # Scrape up to three batches of 100 URLs. Each batch is merged into the JSON
  # file on Drive and only then removed from `df`, so a failed batch is
  # attempted again on the next iteration.
  # NOTE(review): new entries are numbered from the count of entries already
  # stored, while `df` is only trimmed in memory - confirm `df` lines up with
  # the stored data before re-running on a fresh kernel.
  for _ in range(0, 3):
    try:
      try:
        with open('/content/drive/MyDrive/final_dataset.json', 'r', encoding='utf-8') as f:
          data = json.load(f)
      except (FileNotFoundError, ValueError):
        # No dataset yet, or an empty/corrupt file: start from scratch.
        data = {}
      idx = len(data)
      print(idx)
      _, res = get_dataset(df.head(100), idx)
      print('data')
      # Keys loaded from JSON are strings; store the new ones as strings too
      # so an index can never be written to the file twice.
      data.update({str(key): value for key, value in res.items()})
      save_json(data)
      df = df.iloc[100:]
    except Exception as exc:
      # Report the failure instead of dropping it silently; the batch stays
      # in `df` for the next iteration.
      print('Batch failed: {!r}'.format(exc))
      

300
Fetching https://www.tripadvisor.com/Restaurant_Review-g14129573-d14109193-Reviews-Yakiniku_Kyoshotei_Ginza-Ginza_Chuo_Tokyo_Tokyo_Prefecture_Kanto.html
301
Fetching https://www.tripadvisor.com/Restaurant_Review-g14133707-d8283419-Reviews-Nabezo_Shinjuku_Meijidori-Shinjuku_3_Chome_Shinjuku_Tokyo_Tokyo_Prefecture_Kant.html
302
Fetching https://www.tripadvisor.com/Restaurant_Review-g1066456-d12708952-Reviews-Gyukatsu_Motomura_Harajuku-Shibuya_Tokyo_Tokyo_Prefecture_Kanto.html
303
Fetching https://www.tripadvisor.com/Restaurant_Review-g14134311-d1904872-Reviews-Kikko-Asakusa_Taito_Tokyo_Tokyo_Prefecture_Kanto.html
304
Fetching https://www.tripadvisor.com/Restaurant_Review-g14133707-d1676825-Reviews-Nabezo_Shinjuku_3_Chome-Shinjuku_3_Chome_Shinjuku_Tokyo_Tokyo_Prefecture_Kanto.html
305
Fetching https://www.tripadvisor.com/Restaurant_Review-g14129573-d6070758-Reviews-Steak_House_Hama_Ginza-Ginza_Chuo_Tokyo_Tokyo_Prefecture_Kanto.html
306
Fetching https://www.tripadvisor.com/Restaurant_R