# Web Scrapping and EDA of Hotel Reviews

In [None]:
$\underline{TripAdvisor:}$ Every review of a Hotel includes numerical ratings. When leaving a review at Trip Advisor a reviewer gives a total score (integers from 1-5) and then has the option to rate the hotel on some certain attributes. We will use 6 of those attributes: Value, Location, Sleep Quality, Rooms, Cleanliness and Service (also integers from 1-5).
    


In [4]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup 
import os, shutil
from datetime import datetime
sep = "~"
dtfmt = '%Y-%m-%d %H:%M:%S'

def hotel_details(soup_Page, city2):
    hotel_info = ""
    hotel_Page = soup_Page.find_all("div", {"class": "listing_rating"}) 
    
    for hotel_details in hotel_Page:  
        hotel_rating = hotel_details.find("span")
        if hotel_rating != None: hotel_rating = hotel_rating['alt'][0]
        else: hotel_rating = "nan"
        hotel_url = hotel_details.find("a")
        if hotel_url != None:
            Review_No = hotel_url.find(text=True)
            Review_No = Review_No.split(" ")[0]
            hotel_url = hotel_url['href']     
            hotel_url_parts = hotel_url.split("-")  
            head_url = hotel_url_parts[0] + "-" + hotel_url_parts[1] + "-" +  hotel_url_parts[2] + "-" + hotel_url_parts[3] + "-" 
            tail_url = hotel_url_parts[4] + "-" + hotel_url_parts[5]           
            #id[2] + sep + name[4]     
            hotel_detail = hotel_url_parts[2] + sep + hotel_url_parts[4] + sep + head_url + sep + tail_url + sep + hotel_rating + sep + Review_No
            hotel_info = hotel_info + hotel_detail + "\n"
    hotel_file = open(city2 + "/" + city2 + "_hotel_file.mcsv", "a+")
    hotel_file.write(hotel_info)
    hotel_file.close()

def hotel_Urls(base_urlx, sub_urlx, cityx):
    soup_Page = BeautifulSoup(urlopen(base_urlx + sub_urlx), 'lxml')
    div = soup_Page.find('div', {'class': 'unified pagination standard_pagination'})
    if div.find("span", {'class': 'nav next ui_button disabled'}) != None: # Last page
        hotel_details(soup_Page, cityx)
    elif div.find('span', {'class': 'nav previous ui_button disabled'})!=None: # First page
        hotel_details(soup_Page, cityx)
        urlmen = div.find('a', href = True)
        urlmen = urlmen['href']
        hotel_Urls(base_urlx, urlmen, cityx)
    else:
        hotel_details(soup_Page, cityx)
        urlmen = div.find_all('a', href = True)
        urlmen = urlmen[1]['href']
        hotel_Urls(base_urlx, urlmen, cityx)

def hotel_review(page_url):
    page_review_string = ""
    page_review = ""
    soup = BeautifulSoup(urlopen(page_url), "lxml")
    
    #find all reviews on page and prepare to extract urls
    reviews_onpage = soup.find_all("div", {"class": "reviewSelector"})
    review_urls = []
    #extract urls from each review and get a list of review urls
    for review in reviews_onpage:
        review_url = review.find('a', href = True)
        review_id = review['id']
        if str(type(review_url)) != "<type 'NoneType'>":
            review_url = review_url['href']
            curr_review = {'id' : review_id, 'url' : review_url}
            review_urls.append(curr_review)
     
    #begin processing reviews
    for url in review_urls:
        #construct url and request html
        soup = BeautifulSoup(urlopen(base_url + url['url']), "lxml")
        
        highlight_review = soup.find("div", {"id" : url['id']})        
        if highlight_review is not None:
            title = body = rati = value = locat = sleep = rooms = clean = servi = other = "nan"
            
            review_title  = highlight_review.find("div", {"property": "name"})  
            if review_title != None: title = review_title.getText()
                
            review_rating = highlight_review.find("div", {"class": "rating reviewItemInline"})
            if review_rating != None: 
                review_rating = review_rating.find("img")  
                if review_rating != None: rati = review_rating['alt'][0]
                    
            review_body = highlight_review.find("p", {"property": "reviewBody"})
            if review_body != None: 
                body =  review_body.getText(separator=' ') 
                body = body.replace('\n', '').replace('\r', '')

            quality_ratings = highlight_review.find_all("li", {"class": "recommend-answer"})
            if quality_ratings != None:  
                for rating in quality_ratings:
                    description = rating.find("div", {"class": "recommend-description"})
                    description = description.find(text = True)
                    score = rating.find("span")            
                    score = score['alt']
                    if   description == "Value":    value = score[0]
                    elif description == "Location": locat = score[0]
                    elif description == "Sleep Quality": sleep = score[0]
                    elif description == "Rooms": rooms = score[0]
                    elif description == "Cleanliness": clean = score[0]
                    elif description == "Service": servi = score[0]
                    else: other = score[0]
                    
            page_review = url['id'] + sep + title + sep + body + sep + rati + sep \
                        + value + sep + locat + sep + sleep + sep + rooms + sep + clean + sep + servi + sep + other
            page_review_string = page_review_string + page_review + "\n"
    return(page_review_string)


def hotel_run(base_urls, sub_urls, citys, cutNo_hotels, cutNo_reviews):
    timeStart = datetime.strptime(datetime.now().strftime(dtfmt), dtfmt)
    print("\n******* Starting time[" + str(timeStart) + "] Scraping reviews for " + str(cutNo_hotels) + " hotels ******************")    
   # if os.path.exists(citys): shutil.rmtree(citys)
   # os.makedirs(citys)
   # hotel_Urls(base_urls, sub_urls, citys)
    
    hotel_df = pd.read_csv(citys + "/" + citys + "_hotel_file.mcsv", sep="~", header=None, names = ['id', 'name', 'urlh', 'urlt', 'rat', 'reno'])
    hotel_df.drop_duplicates(['id'], inplace=True)
    hotel_df.reset_index(inplace=True)
    
    for index, row in hotel_df.iterrows():
        timeIndex = datetime.strptime(datetime.now().strftime(dtfmt), dtfmt) 
        output_string = ""
        hotel_name = row['name']
        head_urls = row['urlh']
        tail_urls = row['urlt']
        no_review = int(row['reno'].replace(',','')) 
        if no_review > cutNo_reviews: no_review = cutNo_reviews
        _or_value = 0
        
        if index > cutNo_hotels: # cutoff number of hotels to study
            timeEnd = datetime.strptime(datetime.now().strftime(dtfmt), dtfmt)
            print("*********** Finished with used time[" + str((timeStart - timeEnd)/60) + " mins] ********************************")
            break 
            
        print("\tStarting time[" + str(timeIndex) + "] " + str(index) + ": Scraping " + str(no_review) + " reviews of " + hotel_name) 
        while (_or_value < no_review):
            if _or_value == 0: _or = ""
            else:  _or = "or" + str(_or_value) + "-"
            reviewpg_url = base_urls + head_urls + _or + tail_urls 
            output_string = output_string + hotel_review(reviewpg_url)
            _or_value = _or_value + 10   
        review_file = open(citys + "/" + hotel_name + "-review.mcsv", "a+", encoding="utf-8") 
        review_file.write(output_string)
        review_file.close()       

In [5]:
cutNo_hotel = 40
cutNo_review = 50
base_url = "http://www.tripadvisor.com.au"
sub_url = "/Hotels-g255057-Canberra_Australian_Capital_Territory-Hotels.html"
city = "Canberra"
hotel_run(base_url, sub_url, city, cutNo_hotel, cutNo_review)


******* Starting time[2017-04-28 00:59:32] Scraping reviews for 40 hotels ******************


AttributeError: 'int' object has no attribute 'replace'

In [2]:
cutNo_hotel = 200
cutNo_review = 50
base_url = "http://www.tripadvisor.com.au"
sub_url = "/Hotels-g255060-Sydney_New_South_Wales-Hotels.html"
city = "Sydney"
hotel_run(base_url, sub_url, city, cutNo_hotel, cutNo_review)


******* Starting time[2017-04-27 19:18:27] Scraping reviews for 200 hotels ******************
	Starting time[2017-04-27 19:18:27] 0: Scraping 50 reviews of Radisson_Blu_Plaza_Hotel_Sydney
	Starting time[2017-04-27 19:20:10] 1: Scraping 50 reviews of The_Great_Southern_Hotel
	Starting time[2017-04-27 19:21:58] 2: Scraping 50 reviews of Travelodge_Hotel_Sydney
	Starting time[2017-04-27 19:23:36] 3: Scraping 50 reviews of Hotel_Bondi
	Starting time[2017-04-27 19:25:16] 4: Scraping 50 reviews of Manly_Paradise_Motel_Apartments
	Starting time[2017-04-27 19:26:55] 5: Scraping 50 reviews of Leisure_Inn_Sydney_Central
	Starting time[2017-04-27 19:28:41] 6: Scraping 50 reviews of Waldorf_Sydney_Central_Serviced_Apartments
	Starting time[2017-04-27 19:30:14] 7: Scraping 50 reviews of Oaks_Hyde_Park_Plaza
	Starting time[2017-04-27 19:31:50] 8: Scraping 50 reviews of Song_Hotel_Sydney
	Starting time[2017-04-27 19:33:38] 9: Scraping 50 reviews of Metro_Apartments_On_Darling_Harbour
	Starting time[

URLError: <urlopen error [Errno 11001] getaddrinfo failed>

In [None]:
cutNo_hotel = 40
cutNo_review = 100
base_url = "http://www.tripadvisor.com.au"
sub_url = "/Hotels-g255100-Melbourne_Victoria-Hotels.html"
city = "Melbourne"
hotel_run(base_url, sub_url, city, cutNo_hotel, cutNo_review)

In [None]:
cutNo_hotel = 40
cutNo_review = 100
base_url = "http://www.tripadvisor.com.au"
sub_url = "/Hotels-g255068-Brisbane_Brisbane_Region_Queensland-Hotels.html"
city = "Brisbane"
hotel_run(base_url, sub_url, city, cutNo_hotel, cutNo_review)

In [None]:
cutNo_hotel = 30
cutNo_review = 50
base_url = "http://www.tripadvisor.com.au"
sub_url = "/Hotels-g255103-Perth_Greater_Perth_Western_Australia-Hotels.html"
city = "Perth"
hotel_run(base_url, sub_url, city, cutNo_hotel, cutNo_review)

In [None]:
cutNo_hotel = 30
cutNo_review = 50
base_url = "http://www.tripadvisor.com.au"
sub_url = "/Hotels-g255097-Hobart_Greater_Hobart_Tasmania-Hotels.html"
city = "Hobart"
hotel_run(base_url, sub_url, city, cutNo_hotel, cutNo_review)

In [None]:
cutNo_hotel = 30
cutNo_review = 50
base_url = "http://www.tripadvisor.com.au"
sub_url = "/Hotels-g255066-Darwin_Top_End_Northern_Territory-Hotels.html"
city = "Darwin"
hotel_run(base_url, sub_url, city, cutNo_hotel, cutNo_review)