# TripAdvisor (TA) scraper for hotel data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
from lxml import html
import pandas as pd
import requests
from collections import OrderedDict
import json
import argparse
import re
import os
import sys
import numpy as np

In [4]:
def clean(text):
    """Collapse every run of whitespace in *text* into a single space.

    ``text`` may be a string or an iterable of string fragments (for example
    the list produced by an XPath ``text()`` query); fragments are
    concatenated before the whitespace is normalised.

    Returns the normalised string, or ``None`` when ``text`` is falsy
    (``None``, ``''`` or an empty list).
    """
    if not text:
        return None
    combined = ''.join(text)
    # str.split() without arguments splits on any whitespace run
    # (\n, \r, \t, spaces) and discards leading/trailing whitespace.
    words = combined.split()
    return ' '.join(words).strip()

In [15]:
def process_request(url, retry=0):
    """Download a hotel page and return the fields scraped from it.

    Args:
        url: Full URL of the hotel page to fetch.
        retry: Accepted for backward compatibility; this function does not
            read it.

    Returns:
        ``{'error': 'Page not found', 'status_code': 404}`` when the server
        answers 404, otherwise the dict produced by ``process_page``.
        Other status codes are not checked: the response body is parsed
        whatever the status.

    Raises:
        requests.exceptions.RequestException: on network failures, including
            ``Timeout`` when the server does not answer within 30 seconds.
    """
    print('Fetching {}'.format(url))
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
        "cache-control": "max-age=0",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36",
    }
    # requests waits forever by default; a stalled connection would otherwise
    # freeze a whole batch run, so bound the wait explicitly.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 404:
        return {'error': 'Page not found', 'status_code': 404}

    parser = html.fromstring(response.text, url)
    return process_page(parser, url)


def process_page(parser, url):
    """Extract hotel details from a parsed hotel page.

    Args:
        parser: Parsed page exposing ``xpath(expr)`` (``process_request``
            passes the element returned by ``lxml.html.fromstring``).
        url: Address of the page. Not used by the extraction itself; kept so
            the signature matches the call in ``process_request``.

    Returns:
        dict with the keys ``name``, ``reviews``, ``amenities``, ``rooms``,
        ``types``, ``official_description``, ``rating``, ``street``,
        ``country`` and ``region``. ``price`` (a string of digits) is added
        only when at least one per-night price is found in the page scripts.
        ``reviews`` is an empty dict when no usable rating histogram is
        found.

    Raises:
        ValueError: if the first ``application/ld+json`` script is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError``).
        AttributeError: if that JSON document, its ``aggregateRating`` or its
            ``address`` is not a JSON object.
    """
    # All inline script text, whitespace-normalised, searched with regexes.
    script_text = ' '.join(''.join(parser.xpath('//script//text()')).split())

    # Rating histogram: first `"ratingCounts":[...]` occurrence.
    rating_counts = []
    counts_match = re.search(r'"ratingCounts":(\[[0-9,]{10,}\])', script_text)
    if counts_match:
        try:
            rating_counts = json.loads(counts_match.group(1))
        except ValueError:
            # e.g. "[1,,2,...]" matches the pattern but is not valid JSON.
            rating_counts = []

    # Every per-night price found in the scripts, as digit strings.
    prices = re.findall(r'"data-perNight":"([0-9]{3,})"', script_text)

    # The capture group holds exactly the description text, so no manual
    # slicing (which previously also cut off the final character) is needed.
    description_match = re.search(
        r'"locationDescription":"([A-Za-z ,\'.-]{10,})"', script_text)
    raw_description = description_match.group(1) if description_match else ''

    XPATH_NAME = '//h1[@id="HEADING"]//text()'
    XPATH_AMENITIES = "//div[contains(text(),'Property amenities')]/following-sibling::div//div[@data-test-target='amenity_text']//text()"
    XPATH_ROOMS = "//div[contains(text(),'Room features')]/following-sibling::div//div[@data-test-target='amenity_text']//text()"
    XPATH_types = "//div[contains(text(),'Room types')]/following-sibling::div//div[@data-test-target='amenity_text']//text()"
    XPATH_FULL_ADDRESS_JSON = '//script[@type="application/ld+json"]//text()'

    raw_name = parser.xpath(XPATH_NAME)
    amenities = parser.xpath(XPATH_AMENITIES)
    rooms = parser.xpath(XPATH_ROOMS)
    room_types = parser.xpath(XPATH_types)
    raw_address_json = parser.xpath(XPATH_FULL_ADDRESS_JSON)

    name = clean(raw_name)

    hotel_rating = 0
    address = {}
    if raw_address_json:
        # Parsing errors propagate to the caller, as they always have.
        parsed_address_info = json.loads(raw_address_json[0])
        rating = parsed_address_info.get('aggregateRating', {})
        address = parsed_address_info.get("address", {})
        hotel_rating = rating.get('ratingValue')

    # The histogram is indexed from worst (0) to best (4); require all five
    # buckets so that a short list cannot raise IndexError.
    ratings = {}
    if len(rating_counts) >= 5:
        ratings = {
            'Excellent': rating_counts[4],
            'Good': rating_counts[3],
            'Average': rating_counts[2],
            'Poor': rating_counts[1],
            'Terrible': rating_counts[0],
        }

    # addressCountry is normally an object with a "name"; tolerate a plain
    # value instead of crashing on `.get`.
    country = address.get("addressCountry", {})
    country_name = country.get("name") if isinstance(country, dict) else country

    data = {
        'name': name,
        'reviews': ratings,
        'amenities': ','.join(amenities),
        'rooms': ','.join(rooms),
        'types': ','.join(room_types),
        'official_description': raw_description,
        'rating': float(hotel_rating) if hotel_rating else 0.0,
        'street': address.get('streetAddress'),
        'country': country_name,
        'region': address.get('addressRegion'),
    }
    if prices:
        # Compare numerically: as plain strings '1000' would sort below '999'.
        data['price'] = min(prices, key=int)

    return data

In [16]:
def info_request(url):
    """Scrape a single hotel page.

    Thin wrapper that forwards ``url`` to ``process_request`` and returns its
    result unchanged. The argparse parser that used to be built here was
    never used and has been removed.
    """
    return process_request(url)

def save_json(scraped_data, path='tripadvisor_hotel_scraped_data.json'):
    """Write ``scraped_data`` to ``path`` as indented JSON.

    Args:
        scraped_data: JSON-serialisable object to store.
        path: Destination file; defaults to the file name this function has
            always written to, so existing callers are unaffected.

    Returns:
        The string ``'Done!'``.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned rather than left to the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(scraped_data, f, indent=4, ensure_ascii=False)
    return 'Done!'

def get_dataset(df, glb_cnt=0):
    """Scrape every hotel listed in ``df`` and number the results.

    Args:
        df: DataFrame with a ``hotel_url`` column; each value is appended to
            ``'https://www.tripadvisor.com/'`` to form the page URL.
        glb_cnt: Key given to the first result; later results get
            consecutive integers.

    Returns:
        ``(next_key, results)`` where ``results`` maps integer keys to the
        dicts returned by ``info_request`` and ``next_key`` is
        ``glb_cnt`` plus the number of rows processed.
    """
    fnl_res = {}
    # The old early `break` compared the counter with df.size, which counts
    # cells (rows x columns), so it never ended the loop early. Iterating the
    # column already visits each row once.
    for cnt, row in enumerate(df.hotel_url, start=glb_cnt):
        url = 'https://www.tripadvisor.com/' + row
        res = info_request(url)
        print(res)
        fnl_res[cnt] = res
    return glb_cnt + len(fnl_res), fnl_res

In [17]:
# os.listdir returns entries in arbitrary order; sort them so that indexing
# into the list selects the same file on every run.
files = sorted(os.listdir('/content/drive/MyDrive/data'))

In [None]:
# Load the first entry of `files` from the data directory into a DataFrame.
df = pd.read_csv("/content/drive/MyDrive/data/{}".format(files[0]))

In [None]:
# Uncomment to drop the first 2000 rows before scraping
# (presumably rows handled in an earlier session - confirm before use).
# df = df.iloc[2000:]
# Notebook display: total number of cells (rows x columns), not the row count.
df.size

In [None]:
# Scrape the remaining URLs in batches and persist after every batch, so an
# interrupted session can resume from what is already stored on Drive.
DATASET_PATH = '/content/drive/MyDrive/final_dataset.json'
BATCH_SIZE = 100

for i in range(0, 60):
    if df.empty:
        # Nothing left to scrape.
        break
    try:
        # Load what has been saved so far; a missing or unparsable file
        # simply means we start from an empty dataset.
        try:
            with open(DATASET_PATH, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (FileNotFoundError, ValueError):
            data = {}
        idx = len(data)
        print(idx)
        cnt, res = get_dataset(df.head(BATCH_SIZE), idx)
        # JSON object keys are strings; convert explicitly so freshly scraped
        # integer keys cannot coexist with equal string keys already loaded.
        data.update({str(key): value for key, value in res.items()})
        # Write back to the same file that is read above. Saving elsewhere
        # left `idx` unchanged and lost every batch except the last one.
        with open(DATASET_PATH, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        df = df.iloc[BATCH_SIZE:]
    except Exception as exc:
        # Keep going (the same batch is retried on the next iteration), but
        # report the failure. `Exception` rather than a bare `except` so that
        # interrupting the notebook still stops the loop.
        print('Batch {} failed: {!r}'.format(i, exc))

In [19]:
# Smoke test: scrape a single hotel page and keep the result for inspection.
res = info_request('https://www.tripadvisor.com/Hotel_Review-g60763-d1776857-Reviews-Langham_Place_New_York_Fifth_Avenue-New_York_City_New_York.html')

Fetching https://www.tripadvisor.com/Hotel_Review-g60763-d1776857-Reviews-Langham_Place_New_York_Fifth_Avenue-New_York_City_New_York.html


In [20]:
# Notebook display of the scraped record.
res

{'name': 'The Langham New York Fifth Avenue',
 'reviews': {'Excellent': 2211,
  'Good': 361,
  'Average': 114,
  'Poor': 46,
  'Terrible': 36},
 'amenities': 'Paid private parking on-site,Free High Speed Internet (WiFi),Fitness Center with Gym / Workout Room,Restaurant,Babysitting,Pets Allowed ( Dog / Pet Friendly ),Airport transportation,Business Center with Internet Access,Secured parking,Wifi,Breakfast available,Breakfast buffet,Breakfast in the room,Complimentary instant cofffee,Special diet menus,Conference facilities,Banquet room,Meeting rooms,24-hour security,Baggage storage,Concierge,Currency exchange,Newspaper,Non-smoking hotel,Butler service,Doorperson,24-hour front desk,Dry cleaning,Laundry service,Ironing service,Shoeshine,Soundproof rooms,Air conditioning,Housekeeping,Room service,Safe,Minibar,Refrigerator,Flatscreen TV,VIP room facilities,Bottled water,Iron,Non-smoking rooms,Suites',
 'rooms': 'Soundproof rooms,Air conditioning,Housekeeping,Room service,Safe,Minibar,Refri

**`Testing the new dataset`**

In [5]:
# Load the features CSV and transpose it, so each original column becomes a row.
df = pd.read_csv('dataset/Hotels_features_dataset.csv').T

In [6]:
# Notebook display: three randomly chosen rows of the transposed frame.
df.sample(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70097,70098,70099,70100,70101,70102,70103,70104,70105,70106
rooms,"Air conditioning,Fireplace,Housekeeping,Room s...","Air conditioning,Housekeeping,Room service,Saf...","Air conditioning,Private balcony,Room service,...","Air conditioning,Room service,Safe,VIP room fa...","Air conditioning,Room service,Safe,Kitchenette...","Air conditioning,Housekeeping,Room service,Saf...","Air conditioning,Fireplace,Room service,Safe,M...","Bathrobes,Air conditioning,Desk,Housekeeping,R...","Air conditioning,Housekeeping,Private balcony,...","Allergy-free room,Blackout curtains,Desk,Dinin...",...,"Soundproof rooms,Air conditioning,Desk,Houseke...","Soundproof rooms,Air conditioning,Housekeeping...","Soundproof rooms,Air conditioning,Housekeeping...","Bathrobes,Air conditioning,Housekeeping,Privat...","Bathrobes,Air conditioning,Housekeeping,Safe,T...","Soundproof rooms,Bathrobes,Air conditioning,Di...","Air conditioning,Housekeeping,Private balcony,...","Air conditioning,Safe,Kitchenette,Refrigerator...","Air conditioning,Housekeeping,Private balcony,...","Air conditioning,Private beach,Housekeeping,Ki..."
official_description,,,,,,,,,,,...,Gracefully complementing Manhattan's luxurious...,Gracefully complementing Manhattan's luxurious...,Gracefully complementing Manhattan's luxurious...,Gracefully complementing Manhattan's luxurious...,Gracefully complementing Manhattan's luxurious...,Gracefully complementing Manhattan's luxurious...,Gracefully complementing Manhattan's luxurious...,Gracefully complementing Manhattan's luxurious...,Gracefully complementing Manhattan's luxurious...,Gracefully complementing Manhattan's luxurious...
amenities,"Free High Speed Internet (WiFi),Free breakfast...","Paid private parking nearby,Free High Speed In...","Valet parking,Free High Speed Internet (WiFi),...","Free High Speed Internet (WiFi),Pool,Fitness C...","Free parking,Free High Speed Internet (WiFi),P...","Free High Speed Internet (WiFi),Rooftop pool,F...","Free High Speed Internet (WiFi),Pool,Fitness C...","Valet parking,Free High Speed Internet (WiFi),...","Free High Speed Internet (WiFi),Pool,Free brea...","Valet parking,Free High Speed Internet (WiFi),...",...,"Free parking,Free High Speed Internet (WiFi),P...","Free parking,Free High Speed Internet (WiFi),P...","Free public parking nearby,Free High Speed Int...","Free parking,Free High Speed Internet (WiFi),P...","Free High Speed Internet (WiFi),Wifi,Hot tub,F...","Free parking,Free High Speed Internet (WiFi),P...","Free public parking nearby,Free High Speed Int...","Free internet,Internet,Pool,Outdoor pool,Bar /...","Free public parking nearby,Free High Speed Int...","Free parking,Free High Speed Internet (WiFi),W..."
