# TripAdvisor scraper for hotel data

In [None]:
# Mount Google Drive at /content/drive; the later cells read and write
# files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from lxml import html
import pandas as pd
import requests
from collections import OrderedDict
import json
import argparse
import re
import os
import sys
import numpy as np

In [None]:
def clean(text):
    """Collapse every run of whitespace in *text* into a single space.

    *text* may be a string or an iterable of string fragments (such as the
    list an XPath ``//text()`` query returns); fragments are concatenated
    before being normalised.

    Returns the normalised string, or ``None`` when *text* is falsy
    (``None``, ``''`` or an empty list).
    """
    if not text:
        return None
    combined = ''.join(text)
    # split() without arguments discards \n, \r, \t and repeated spaces, so
    # the re-joined result has no leading or trailing whitespace.
    words = combined.split()
    return ' '.join(words)

In [None]:
def process_request(url, retry=0):
    """Download a hotel page and return the fields scraped from it.

    Args:
        url: Absolute URL of the hotel page to fetch.
        retry: Currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        The dict produced by ``process_page`` for the downloaded page, or
        ``{'error': 'Page not found', 'status_code': 404}`` when the server
        answers 404.

    Raises:
        requests.exceptions.RequestException: On connection problems, or
            when the server does not respond within the timeout.
    """
    print('Fetching {}'.format(url))
    # Browser-like headers sent with every request.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
        "cache-control": "max-age=0",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36",
    }
    # requests applies no timeout by default, so a single stalled connection
    # would block the whole scraping run indefinitely.
    timeout_seconds = 30
    response = requests.get(url, headers=headers, timeout=timeout_seconds)
    if response.status_code == 404:
        return {'error': 'Page not found', 'status_code': 404}

    # The URL is passed as base_url for the parsed document.
    parser = html.fromstring(response.text, url)
    return process_page(parser, url)


def process_page(parser, url):
    """Extract hotel details from a parsed hotel page.

    Args:
        parser: Parsed page object exposing ``xpath()`` (the element that
            ``process_request`` builds with ``lxml.html.fromstring``).
        url: Address of the page. Not used by the extraction itself.

    Returns:
        A dict with the keys ``name``, ``reviews``, ``amenities``, ``rooms``,
        ``types``, ``official_description``, ``rating``, ``street``,
        ``country`` and ``region``. ``price`` is added only when at least
        one per-night price is found in the page scripts; it is the
        numerically lowest price, kept as a digit string.

    Raises:
        ValueError: If the first ``application/ld+json`` script on the page
            is not valid JSON (``json.JSONDecodeError`` is a subclass).
    """
    # All inline script text, with whitespace collapsed, so the embedded
    # JSON fragments can be searched with regular expressions.
    script_text = ' '.join(''.join(parser.xpath('//script//text()')).split())

    # Review histogram: five counts ordered from "Terrible" to "Excellent".
    ratings = {}
    counts_match = re.search(r'"ratingCounts":(\[[0-9,]{10,}\])', script_text)
    if counts_match:
        try:
            counts = json.loads(counts_match.group(1))
        except ValueError:
            # The pattern also admits malformed lists such as "[1,,2,...]".
            counts = []
        if len(counts) >= 5:
            ratings = {
                'Excellent': counts[4],
                'Good': counts[3],
                'Average': counts[2],
                'Poor': counts[1],
                'Terrible': counts[0],
            }

    # Per-night prices, captured as digit strings.
    prices = re.findall(r'"data-perNight":"([0-9]{3,})"', script_text)

    # Capturing the text between the quotes keeps the full description;
    # slicing the whole match by fixed offsets cut off its last character.
    description_match = re.search(
        r'''"locationDescription":"([A-Za-z ,'.-]{10,})"''', script_text
    )
    description = description_match.group(1) if description_match else ''

    XPATH_NAME = '//h1[@id="HEADING"]//text()'
    XPATH_AMENITIES = "//div[contains(text(),'Property amenities')]/following-sibling::div//div[@data-test-target='amenity_text']//text()"
    XPATH_ROOMS = "//div[contains(text(),'Room features')]/following-sibling::div//div[@data-test-target='amenity_text']//text()"
    XPATH_types = "//div[contains(text(),'Room types')]/following-sibling::div//div[@data-test-target='amenity_text']//text()"
    XPATH_FULL_ADDRESS_JSON = '//script[@type="application/ld+json"]//text()'

    name = clean(parser.xpath(XPATH_NAME))
    amenities = parser.xpath(XPATH_AMENITIES)
    rooms = parser.xpath(XPATH_ROOMS)
    room_types = parser.xpath(XPATH_types)
    raw_address_json = parser.xpath(XPATH_FULL_ADDRESS_JSON)

    hotel_rating = 0
    address = {}
    if raw_address_json:
        # Malformed JSON propagates to the caller, as it always has.
        parsed_info = json.loads(raw_address_json[0])
        # "or {}" also covers keys that are present but null.
        rating = parsed_info.get('aggregateRating') or {}
        address = parsed_info.get('address') or {}
        hotel_rating = rating.get('ratingValue')

    # addressCountry may be an object with a "name" field or a plain value;
    # only the object form supports .get().
    country = address.get('addressCountry')
    if isinstance(country, dict):
        country = country.get('name')

    data = {
        'name': name,
        'reviews': ratings,
        'amenities': ','.join(amenities),
        'rooms': ','.join(rooms),
        'types': ','.join(room_types),
        'official_description': description,
        'rating': float(hotel_rating) if hotel_rating else 0.0,
        'street': address.get('streetAddress'),
        'country': country,
        'region': address.get('addressRegion'),
    }
    if prices:
        # Compare numerically: as plain strings "1000" sorts before "999".
        data['price'] = min(prices, key=int)

    return data

In [None]:
def info_request(url):
    """Scrape one hotel page and return its data.

    Thin wrapper around ``process_request``. The argparse parser that used
    to be built here on every call was never consulted, so it was removed.

    Args:
        url: Absolute URL of the hotel page.

    Returns:
        Whatever ``process_request(url)`` returns.
    """
    return process_request(url)

def save_json(scraped_data, path='/content/drive/MyDrive/final_dataset.json'):
    """Write *scraped_data* to *path* as indented JSON, replacing the file.

    Args:
        scraped_data: JSON-serialisable object to store.
        path: Destination file. Defaults to the dataset file on the mounted
            Drive, so existing one-argument calls behave as before.

    Returns:
        The string ``'Done!'``.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding is fixed to UTF-8 rather than left to the platform default,
    # which may be unable to encode them.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(scraped_data, f, indent=4, ensure_ascii=False)
    return 'Done!'

def get_dataset(df, glb_cnt = 0):
    """Scrape every hotel listed in *df* and key the results by a counter.

    Args:
        df: Object with a ``hotel_url`` column of page paths; each path is
            appended to ``https://www.tripadvisor.com/``.
        glb_cnt: Number assigned to the first scraped hotel; following
            hotels get consecutive numbers.

    Returns:
        A ``(cnt, results)`` tuple: ``cnt`` is the counter after the last
        hotel (``glb_cnt`` plus the number of rows) and ``results`` maps
        each hotel's number to the dict returned by ``info_request``.
    """
    fnl_res = {}
    cnt = glb_cnt
    # The loop ends when the column is exhausted. The former extra break
    # compared against df.size, which counts cells (rows x columns) rather
    # than rows, so it could only ever fire on the final row anyway.
    for row in df.hotel_url:
        fnl_res[cnt] = info_request('https://www.tripadvisor.com/' + row)
        cnt += 1
        print(cnt)
    return cnt, fnl_res

In [None]:
# Names of the entries in the Drive data folder; os.listdir returns them in
# arbitrary order.
files = os.listdir('/content/drive/MyDrive/data')

In [None]:
# Select the entry at index 6 of the directory listing.
# NOTE(review): the listing order is not guaranteed, so this index may pick a
# different file on another run or machine — confirm the intended file name.
file = files[6]

'aribia.csv'

In [None]:
# Load the selected file from the Drive data folder into a DataFrame; the
# scraping cells below read its hotel_url column.
df = pd.read_csv("/content/drive/MyDrive/data/{}".format(file))

In [None]:
# df = df.iloc[2000:] 

In [None]:
#  df['hotel_url'][2000]

'Hotel_Review-g294472-d6276954-Reviews-88_Rooms_Hotel-Belgrade.html'

In [None]:
# Number of rows in the loaded DataFrame.
len(df.index)

4246

In [None]:
# Scrape the hotels in df in batches of 100, appending each finished batch
# to the JSON dataset on Drive so an interrupted run can resume from the
# number of records already stored.
DATASET_PATH = '/content/drive/MyDrive/final_dataset.json'
BATCH_SIZE = 100

for _ in range(0, 43):
    if df.empty:
        # Every row has been scraped and saved.
        break

    # Load what has been saved so far. A missing, empty or corrupt file
    # counts as an empty dataset, so the first run can bootstrap itself.
    try:
        with open(DATASET_PATH, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (FileNotFoundError, ValueError):
        data = {}

    # Records are numbered consecutively, so the stored count is the next
    # free number.
    idx = len(data)
    print(idx)

    try:
        _, res = get_dataset(df.head(BATCH_SIZE), idx)
        data.update(res)
        save_json(data)
    except Exception as exc:
        # Best effort: report the failure and leave df untouched so the same
        # batch is attempted again on the next pass. KeyboardInterrupt is not
        # an Exception subclass, so the run can still be stopped by hand.
        print('Batch starting at {} failed: {!r}'.format(idx, exc))
        continue

    # Drop the rows that were just scraped and saved.
    df = df.iloc[BATCH_SIZE:]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1759
Fetching https://www.tripadvisor.com/hotel_review-g295424-d6728347-reviews-naia_downtown_jebel_ali_hotel-dubai_emirate_of_dubai.html
1760
Fetching https://www.tripadvisor.com/hotel_review-g12173515-d12541842-reviews-al_salam_grand_hotel-mileiha_emirate_of_sharjah.html
1761
Fetching https://www.tripadvisor.com/hotel_review-g295424-d7386694-reviews-prime_hotel-dubai_emirate_of_dubai.html
1762
Fetching https://www.tripadvisor.com/hotel_review-g295424-d1224075-reviews-dubai_concorde_residence_hotel-dubai_emirate_of_dubai.html
1763
Fetching https://www.tripadvisor.com/hotel_review-g295424-d3830329-reviews-ana_palace-dubai_emirate_of_dubai.html
1764
Fetching https://www.tripadvisor.com/hotel_review-g298063-d2471408-reviews-capital_hotel-ras_al_khaimah_emirate_of_ras_al_khaimah.html
1765
Fetching https://www.tripadvisor.com/hotel_review-g295424-d5565305-reviews-high_end_hotel_apartments-dubai_emirate_of_dubai.html
1766
Fetc