# Scraping Hotel Ratings on Booking # 

In this homework we will practice web scraping on the following [site](https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Cancún&rows=15). Let's get some basic information for each hotel in Boston.
On each hotel page, scrape the following information: 
1. Hotel Name
2. Class of Rating (Wonderful/Excellent/Very Good/Good)
3. Rating Score
4. Number of Reviews


** Save the data in "traveler_ratings.csv" in the following format: hotel_name, class_of_rating, rating, num_reviews **

**(10 pts)**

You can see an overview of the information as displayed:





![Information to be scraped](booking_sample.png)

Now let's scrape some reviews. For each review of each each hotel in Boston you are to scrape the following attributes: 
1. Reviewer name
2. Reviewer ethnicity
3. Number of reviews 
4. Number of helpful votes
5. Date
6. Rating
7. Negative Review
8. Positive Review

Note that you will also need the hotel's name!! Also, some reviews may not have all attributes. 

** Save the data in "review_ratings.csv" in the following format: hotel_name, reviewer_name, ethnicity, num_reviews, num_help_votes, date, rating, neg_review, pos_review **

**(25 pts)**

You can see an overview of the information as displayed:
![Information to be scraped](review_sample.png)

In [69]:
import requests
url = 'https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Canc%C3%BAn&rows=15'
response = requests.get(url)
html = response.text.encode('utf-8')

In [78]:
# the following code comes from Ms. Spinelli's python file with some modification
from bs4 import BeautifulSoup
import sys
import time
import os
import logging
import argparse
import codecs
import json
import re
def parse_hotellist_page(html):
    """ Parse the html pages returned by get_hotellist_page().
        Return the next url page to scrape (a city can have
        more than one page of hotels) if there is, else exit
        the script.
    """
    soup = BeautifulSoup(html, "html5lib")
    hotel_boxes = soup.findAll('div', {'class' :re.compile('sr_item_default')})
  
    for hotel_box in hotel_boxes:
               
        state = True
        try:
            name = hotel_box.find('span', {'class' :'sr-hotel__name'}).find(text=True).strip()
            name = name.replace(',', '')
        
        except Exception as e:
            state = False
            name = ""
            reviews = "N/A"
            ratings = "N/A"
            numreviews = "N/A"

        if (state):
            try:
                reviews = hotel_box.find('span', {'class' :'review-score-badge'}).find(text=True).strip()
            except Exception as e:
                reviews = "N/A"
            try:
                ratings = hotel_box.find('span', {'class' :'review-score-widget__text'}).find(text=True).strip()
            except Exception as e:
                ratings = "N/A"
            if ratings == '':
                try:
                    div = hotel_box.find('span', {'class' :'review-score-widget__text'})
                    parsestring = str(div).split('>')
                    ratings = parsestring[3].split('<')[0].strip()
                except Exception as e:
                    ratings = "N/A"
            try:
                numreviews = hotel_box.find('span', {'class' :'review-score-widget__subtext'}).find(text=True).strip()
                numreviews = ''.join(x for x in numreviews if x.isdigit())
            except Exception as e:
                numreviews = "N/A"
        print(name + ',' + ratings + ',' + reviews + ',' + numreviews, file = outfile)
            
    # Get next URL page if exists, else exit
    div = soup.find("div", {"class" : "results-paging"})

    # check if last page
    if div.find('span', {'class' : 'paging-end'}):
        return False
    # If it is not las page there must be the Next URL
    hrefs = div.findAll('a', href= True)

    for href in hrefs:
        if href.find(text = True) == 'Next page':
            url = href['href']
            response = requests.get(url)
            html = response.text.encode('utf-8')
            return parse_hotellist_page(html)

In [79]:
start = time.time()
outfile = open('traveler_ratings.csv', 'w')
print('hotel_name,class_of_rating,rating,num_reviews', file = outfile)
parse_hotellist_page(html)
outfile.close()
end = time.time()
print(end - start)

140.69387817382812


In [80]:
# extract urls of all hotels in Boston and store them in a list
urls = []
def get_all_urls(html):
    soup = BeautifulSoup(html, "html5lib")
    hotel_boxes = soup.findAll('div', {'class' :re.compile('sr_item_default')})
  
    for hotel_box in hotel_boxes:  
        state = True
        try:
            name = hotel_box.find('span', {'class' :'sr-hotel__name'}).find(text=True).strip()
            name = name.replace(',', '')
        
        except Exception as e:
            state = False

        if (state):
            try:
                link = hotel_box.find('a', {'class' :'hotel_name_link url'})['href'].strip()
                url = 'https://www.booking.com' + link
                url = url[0:url.index('\n')]
                urls.append(url)
            except Exception as e:
                url = 'N/A'
            
    # Get next URL page if exists, else exit
    div = soup.find("div", {"class" : "results-paging"})

    # check if last page
    if div.find('span', {'class' : 'paging-end'}):
        return False
    # If it is not las page there must be the Next URL
    hrefs = div.findAll('a', href= True)

    for href in hrefs:
        if href.find(text = True) == 'Next page':
            url = href['href']
            response = requests.get(url)
            html = response.text.encode('utf-8')
            return get_all_urls(html)

In [81]:
url = 'https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Canc%C3%BAn&rows=15'
response = requests.get(url)
html = response.text.encode('utf-8')
start = time.time()
get_all_urls(html)
end = time.time()
print(end - start)

134.15015602111816


In [82]:
len(urls)

398

In [83]:
urls

['https://www.booking.com/hotel/us/oakwood-boston.html',
 'https://www.booking.com/hotel/us/14-gloucester-st-2b-by-lyon-apartments-boston.html',
 'https://www.booking.com/hotel/us/14-gloucester-st-4a-by-lyon-apartments.html',
 'https://www.booking.com/hotel/us/112-myrtle-st-9.html',
 'https://www.booking.com/hotel/us/two-bedroom-boston-luxury-apartment.html',
 'https://www.booking.com/hotel/us/the-c-house-boston-massachusettes.html',
 'https://www.booking.com/hotel/us/14-gloucester-st-2a-by-lyon-apartments-boston.html',
 'https://www.booking.com/hotel/us/clearway-street-by-boston-furnished-rooms.html',
 'https://www.booking.com/hotel/us/14-gloucester-st-unit-4b.html',
 'https://www.booking.com/hotel/us/hyatt-harborside.html',
 'https://www.booking.com/hotel/us/r-boston-massachusettes.html',
 'https://www.booking.com/hotel/us/hyatt-regency-boston.html',
 'https://www.booking.com/hotel/us/the-godfrey-boston.html',
 'https://www.booking.com/hotel/us/super-location-2br-47-1bath-1-parking-s

In [84]:
# get all valid and invalid urls for review pages
review_urls = []
def get_urls_for_reviews(urls):
    for url in urls:
        new = 'https://www.booking.com/reviews/us/hotel' + url[url.find('us')+2:]
        review_urls.append(new)

In [85]:
get_urls_for_reviews(urls)

In [86]:
review_urls

['https://www.booking.com/reviews/us/hotel/oakwood-boston.html',
 'https://www.booking.com/reviews/us/hotel/14-gloucester-st-2b-by-lyon-apartments-boston.html',
 'https://www.booking.com/reviews/us/hotel/14-gloucester-st-4a-by-lyon-apartments.html',
 'https://www.booking.com/reviews/us/hotel/112-myrtle-st-9.html',
 'https://www.booking.com/reviews/us/hotel/two-bedroom-boston-luxury-apartment.html',
 'https://www.booking.com/reviews/us/hotel/the-c-house-boston-massachusettes.html',
 'https://www.booking.com/reviews/us/hotel/14-gloucester-st-2a-by-lyon-apartments-boston.html',
 'https://www.booking.com/reviews/us/hotel/clearway-street-by-boston-furnished-rooms.html',
 'https://www.booking.com/reviews/us/hotel/14-gloucester-st-unit-4b.html',
 'https://www.booking.com/reviews/us/hotel/hyatt-harborside.html',
 'https://www.booking.com/reviews/us/hotel/r-boston-massachusettes.html',
 'https://www.booking.com/reviews/us/hotel/hyatt-regency-boston.html',
 'https://www.booking.com/reviews/us/ho

In [134]:
# get all reviews for each hotel with a valid review page
def get_all_reviews_for_each_hotel(html):
    soup = BeautifulSoup(html, "html5lib")
    temp = soup.find('h1', {'class' :'item hotel_name'})
    parsestring = str(temp).split('>')
    hotel_name = parsestring[2].split('<')[0].strip()
    try:
        review_boxes = soup.findAll('li', {'class' :'review_item clearfix '})
    # if hotel doesn't have a review page then url is not valid, return
    except Exception as e:
        return
    for review_box in review_boxes:
        try:
            h4 = review_box.find('h4')
            parsestring = str(h4).split('>')
            reviewer_name = parsestring[2].split('<')[0].strip()
        except Exception as e:
            reviewer_name = 'N/A'
        try:
            country = review_box.find('span', {'class' :'reviewer_country'})
            parsestring = str(country).split('>')
            ethnicity = parsestring[5].split('<')[0].strip()
        except Exception as e:
            ethnicity = 'N/A'
        try:    
            num_reviews = review_box.find('div', {'class' :'review_item_user_review_count'}).find(text=True).strip()
        except Exception as e:
            num_reviews = 'N/A'
        try: 
            date = review_box.find('p', {'class' :'review_item_date'}).find(text=True).strip()
            date = date.replace(',', ' ')
        except Exception as e:
            date = 'N/A'
        try:
            rating = review_box.find('span', {'class' :'review-score-badge'}).find(text=True).strip()
        except Exception as e:
            rating = 'N/A'
        try:
            neg = review_box.find('p', {'class' :'review_neg'})
            parsestring = str(neg).split('>')
            neg_review = parsestring[4].split('<')[0].strip()
            neg_review = neg_review.replace(',', '.')
        except Exception as e:
            neg_review = 'N/A'
        while neg_review.find('\n') != -1:
            i = neg_review.index('\n')
            neg_review = neg_review[0:i] + neg_review[i+1:]
        try:   
            pos = review_box.find('p', {'class' :'review_pos'})
            parsestring = str(pos).split('>')
            pos_review = parsestring[4].split('<')[0].strip()
            pos_review = pos_review.replace(',', '.')
        except Exception as e:
            pos_review = 'N/A'
        while pos_review.find('\n') != -1:
            i = pos_review.index('\n')
            pos_review = pos_review[0:i] + pos_review[i+1:]
        print(hotel_name + ',' + reviewer_name + ',' + ethnicity + ',' + num_reviews + ',' + date + ',' + rating + ',' + neg_review + ',' + pos_review, file = outfile)
    
    # Get next URL page if exists, else exit
    try:
        next_page = review_box.find('p', {'class' :'page_link review_next_page'})
        parsestring = str(next_page).split('>')
        temp = parsestring[1]
        if len(temp) < 4:
            return
        quotes = [m.start() for m in re.finditer('"', temp)]
        next_url = temp[quotes[0]+1:quotes[1]]
        next_url = 'https://www.booking.com' + next_url
        response = requests.get(next_url)
        html = response.text.encode('utf-8')
        return get_all_urls(html)
    except Exception as e:
        return

In [135]:
# get all reviews for all hotels with valid review pages
def get_all_reviews(urls):
    for url in urls:
        response = requests.get(url)
        html = response.text.encode('utf-8')
        get_all_reviews_for_each_hotel(html)
    return

In [136]:
start = time.time()
outfile = open('review_ratings.csv', 'w')
print('hotel_name,reviewer_name,ethnicity,num_reviews,date,rating,neg_review,pos_review', file = outfile)
get_all_reviews(review_urls)
outfile.close()
end = time.time()
print(end - start)

1266.1616411209106


It took us about 2.3 minutes to produce traveler_ratings.csv and 21 minutes to produce review_ratings.csv.