# Hotel Review Sentiment Analysis Part 1: Web Scraping
## Adura ABIONA, PhD (UNSW)
### 4 May, 2017

## Introduction

This is an attempt to use sentiment analysis to analyse Australian hotels from four major cities (Canberra, Sydney, Melbourne and Brisbane), based on reviewers' opinions (on a numerical scale of 1-5) from the [**TripAdvisor**](http://www.tripadvisor.com.au) website.  

**Part I** of this work is based on data acquisition through web scraping from TripAdvisor using [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/).

[**TripAdvisor**](http://www.tripadvisor.com.au) is an online hotel review organisation. It allows hotel customers in many nations to write comments about their experiences in the hotels they stayed at and to rate the hotel's service on a numerical scale of 1-5. 

The web scraping from the TripAdvisor website starts with ***hotel_run(base_url, sub_url, city, cutNo_hotel, cutNo_review)***, which writes the review details of each hotel into a hotelName-review.mcsv file in the city folder.
This function calls the following functions:    

**1.** ***hotel_Urls(base_url, sub_url, city)*** function uses BeautifulSoup to scrape the URLs of all the hotel review lists from the *base_url + sub_url* page. These hotel URLs are spread over more than two pages. It then calls hotel_details() for each page of the hotel review list, passing the parsed page as input.
        
**2.** ***hotel_details(soup_Page, city)*** function uses BeautifulSoup to scrape hotel review details from each page of the hotel review list. The details for all the reviews are saved in the city2_hotel_file.mcsv file.
        
**3.** ***hotel_review(page_url)*** function finds all reviews on a page, extracts the URL of each review and processes it. 

In [5]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup #To scrape the data from tripAdvisor website
import os, shutil
from datetime import datetime
# Root folder for all scraped output; each city gets its own sub-folder
DataDir = 'Datasets/'
# Field delimiter for the .mcsv files ("~" instead of "," because review text contains commas)
sep = '~'
# Timestamp layout used for the progress messages
dtfmt = "%Y-%m-%d %H:%M:%S"

def hotel_Urls(base_urlx, sub_urlx, cityx):
    """Walk every page of a city's hotel listing and record the hotels on each.

    Starting at ``base_urlx + sub_urlx``, each listing page is parsed with
    BeautifulSoup and handed to ``hotel_details``. The pagination bar is then
    inspected to find the link to the following page, until the last page
    (or a page without a usable pagination bar) is reached.

    Args:
        base_urlx: Site root that relative listing paths are appended to.
        sub_urlx: Path of the first listing page, relative to ``base_urlx``.
        cityx: City name; passed through unchanged to ``hotel_details``.
    """
    pagination_attrs = {'class': 'unified pagination standard_pagination'}
    visited = set()  # guards against a pagination bar that links back to a seen page
    next_url = sub_urlx

    # Iterate instead of recursing so long listings cannot exhaust the call stack.
    while next_url and next_url not in visited:
        visited.add(next_url)
        with urlopen(base_urlx + next_url) as response:
            soup_Page = BeautifulSoup(response, 'lxml')
        hotel_details(soup_Page, cityx)

        div = soup_Page.find('div', pagination_attrs)
        if div is None:
            # Single-page listing: there is no pagination bar at all.
            break
        if div.find('span', {'class': 'nav next ui_button disabled'}) is not None:
            # "Next" is disabled: this was the last page.
            break

        links = div.find_all('a', href=True)
        on_first_page = div.find('span', {'class': 'nav previous ui_button disabled'}) is not None
        # On the first page "previous" is a disabled <span>, so "next" is the
        # first link; on later pages "previous" is the first link and "next"
        # the second.
        next_index = 0 if on_first_page else 1
        if len(links) <= next_index:
            break
        next_url = links[next_index]['href']

def hotel_details(soup_Page, city2):
    """Extract one record per hotel from a parsed listing page and append them to the city file.

    For every ``div.listing_rating`` element the rating (first character of the
    span's ``alt`` text), the review count (first word of the link text) and
    the review-page URL are read. The URL is split on "-" into a head (first
    four parts, with trailing "-") and a tail (parts five and six) so that a
    page-offset token can later be inserted between them.

    Each record is written as
    ``id~name~head_url~tail_url~rating~review_count`` (joined with ``sep``) to
    ``DataDir/<city2>/<city2>_hotel_file.mcsv`` in append mode.

    Args:
        soup_Page: Parsed listing page (BeautifulSoup object).
        city2: City name; selects the output folder and file name.
    """
    records = []

    for listing in soup_Page.find_all("div", {"class": "listing_rating"}):
        rating_tag = listing.find("span")
        alt_text = rating_tag.get('alt') if rating_tag is not None else None
        hotel_rating = alt_text[0] if alt_text else "nan"

        link = listing.find("a")
        if link is None:
            continue
        count_text = link.find(text=True)
        href = link.get('href')
        if count_text is None or not href:
            # Without a review count or a URL there is nothing to scrape later.
            continue

        Review_No = count_text.split(" ")[0]
        hotel_url_parts = href.split("-")
        if len(hotel_url_parts) < 6:
            # Not a review-page URL of the expected six-part shape; skip it
            # rather than abort the whole crawl with an IndexError.
            continue

        head_url = "-".join(hotel_url_parts[:4]) + "-"
        tail_url = hotel_url_parts[4] + "-" + hotel_url_parts[5]
        # parts[2] is the hotel id, parts[4] the hotel name
        records.append(sep.join([
            hotel_url_parts[2],
            hotel_url_parts[4],
            head_url,
            tail_url,
            hotel_rating,
            Review_No,
        ]))

    # The file is opened even when no records were found so that it always
    # exists once at least one listing page has been processed.
    file_path = DataDir + city2 + "/" + city2 + "_hotel_file.mcsv"
    with open(file_path, "a+", encoding="utf-8") as hotel_file:
        hotel_file.write("".join(record + "\n" for record in records))

# Maps the label shown next to a sub-rating to the output column it fills.
_QUALITY_COLUMNS = {
    "Value": "value",
    "Location": "locat",
    "Sleep Quality": "sleep",
    "Rooms": "rooms",
    "Cleanliness": "clean",
    "Service": "servi",
}
# Column order of the sub-ratings in each output row.
_QUALITY_ORDER = ("value", "locat", "sleep", "rooms", "clean", "servi", "other")


def _alt_score(tag):
    """Return the first character of ``tag``'s ``alt`` text, or "nan" if it is missing."""
    alt_text = tag.get('alt') if tag is not None else None
    return alt_text[0] if alt_text else "nan"


def _clean_text(text):
    """Strip line breaks and the field separator so the text stays on one row and in one column."""
    return text.replace('\n', '').replace('\r', '').replace(sep, ' ')


def _review_record(highlight_review, review_id):
    """Build one ``sep``-joined output row from the element holding a single review."""
    title = body = rati = "nan"
    scores = dict.fromkeys(_QUALITY_ORDER, "nan")

    review_title = highlight_review.find("div", {"property": "name"})
    if review_title is not None:
        title = _clean_text(review_title.getText())

    rating_box = highlight_review.find("div", {"class": "rating reviewItemInline"})
    if rating_box is not None:
        rati = _alt_score(rating_box.find("img"))

    review_body = highlight_review.find("p", {"property": "reviewBody"})
    if review_body is not None:
        body = _clean_text(review_body.getText(separator=' '))

    for rating in highlight_review.find_all("li", {"class": "recommend-answer"}):
        description = rating.find("div", {"class": "recommend-description"})
        if description is None:
            continue
        label = description.find(text=True)
        # Any label outside the known set is collected in the "other" column.
        scores[_QUALITY_COLUMNS.get(label, "other")] = _alt_score(rating.find("span"))

    fields = [review_id, title, body, rati] + [scores[column] for column in _QUALITY_ORDER]
    return sep.join(fields)


def hotel_review(page_url):
    """Scrape every review linked from one review-list page.

    The page at ``page_url`` is searched for ``div.reviewSelector`` elements;
    for each one that carries an id and a link, the linked review page is
    fetched and the element with the same id is parsed.

    Args:
        page_url: Absolute URL of a review-list page.

    Returns:
        A string with one line per review, each terminated by a newline and
        holding, joined by ``sep``: review id, title, body, overall rating,
        then the Value, Location, Sleep Quality, Rooms, Cleanliness, Service
        and "other" sub-ratings. Missing values are written as "nan".
    """
    # Local import: resolves review links against the page they were found on,
    # so the function no longer depends on a global ``base_url``.
    from urllib.parse import urljoin

    with urlopen(page_url) as response:
        soup = BeautifulSoup(response, "lxml")

    # Collect the id and link of every review listed on the page.
    review_urls = []
    for review in soup.find_all("div", {"class": "reviewSelector"}):
        link = review.find('a', href=True)
        review_id = review.get('id')
        if link is None or review_id is None:
            continue
        review_urls.append({'id': review_id, 'url': link['href']})

    # Fetch each review's own page and turn it into one output row.
    rows = []
    for entry in review_urls:
        with urlopen(urljoin(page_url, entry['url'])) as response:
            review_soup = BeautifulSoup(response, "lxml")
        highlight_review = review_soup.find("div", {"id": entry['id']})
        if highlight_review is None:
            continue
        rows.append(_review_record(highlight_review, entry['id']))

    return "".join(row + "\n" for row in rows)


def hotel_run(base_urls, sub_urls, citys, cutNo_hotels, cutNo_reviews):
    """Entry point: scrape hotel reviews for one city and write them to disk.

    The city folder under ``DataDir`` is deleted and recreated, the hotel list
    is collected via ``hotel_Urls``, and for each of the first
    ``cutNo_hotels`` distinct hotels up to ``cutNo_reviews`` reviews are
    fetched page by page with ``hotel_review``. The reviews of each hotel are
    written to ``<hotel name>-review.mcsv`` in the city folder.

    Args:
        base_urls: Site root that relative paths are appended to.
        sub_urls: Path of the city's first hotel-listing page.
        citys: City name; used as the output folder name.
        cutNo_hotels: Maximum number of hotels to scrape.
        cutNo_reviews: Maximum number of reviews to request per hotel.
    """
    timeStart = datetime.now().replace(microsecond=0)
    print("\n******* Starting time[" + str(timeStart) + "] Scraping reviews for " + str(cutNo_hotels) + " hotels ******************")

    # Start from an empty city folder so earlier runs do not leak into this one.
    city_dir = DataDir + citys
    if os.path.exists(city_dir):
        shutil.rmtree(city_dir)
    os.makedirs(city_dir)
    hotel_Urls(base_urls, sub_urls, citys)

    # Read everything as text: review counts may contain thousands separators
    # ("1,234"), and letting pandas infer integers would break the cleanup below.
    hotel_df = pd.read_csv(
        city_dir + "/" + citys + "_hotel_file.mcsv",
        sep="~",
        header=None,
        names=['id', 'name', 'urlh', 'urlt', 'rat', 'reno'],
        dtype=str,
    )
    hotel_df.drop_duplicates(['id'], inplace=True)  # the same hotel can be listed more than once
    hotel_df.reset_index(drop=True, inplace=True)

    # head() enforces the hotel cutoff exactly (the first cutNo_hotels rows).
    for index, row in hotel_df.head(cutNo_hotels).iterrows():
        timeIndex = datetime.now().replace(microsecond=0)
        hotel_name = row['name']
        head_urls = row['urlh']
        tail_urls = row['urlt']

        try:
            no_review = int(str(row['reno']).replace(',', ''))
        except ValueError:
            # Non-numeric or missing count: treat as no reviews to fetch.
            no_review = 0
        no_review = min(no_review, cutNo_reviews)

        print("\tStarting time[" + str(timeIndex) + "] " + str(index) + ": Scraping " + str(no_review) + " reviews of " + hotel_name)

        # Review pages are requested in steps of 10; the first page has no
        # offset token, later pages get "or<offset>-" between head and tail.
        pages = []
        for offset in range(0, no_review, 10):
            _or = "" if offset == 0 else "or" + str(offset) + "-"
            pages.append(hotel_review(base_urls + head_urls + _or + tail_urls))

        with open(city_dir + "/" + hotel_name + "-review.mcsv", "a+", encoding="utf-8") as review_file:
            review_file.write("".join(pages))

    # Report the elapsed time on every run, not only when the cutoff was hit.
    elapsed_mins = (datetime.now() - timeStart).total_seconds() / 60
    print("*********** Finished with used time[" + "{:.2f}".format(elapsed_mins) + " mins] ********************************")

In [None]:
# Root of the TripAdvisor site; the listing paths below are relative to it
base_url = "http://www.tripadvisor.com.au"

# Hotel-listing page of each city in the study
city_sub_urls = {
    "Canberra": "/Hotels-g255057-Canberra_Australian_Capital_Territory-Hotels.html",
    "Sydney": "/Hotels-g255060-Sydney_New_South_Wales-Hotels.html",
    "Melbourne": "/Hotels-g255100-Melbourne_Victoria-Hotels.html",
    "Brisbane": "/Hotels-g255068-Brisbane_Brisbane_Region_Queensland-Hotels.html",
}

# City scraped by this run; change the name to scrape another one
city = "Canberra"
sub_url = city_sub_urls[city]

# Scraping limits
cutNo_hotel = 40   # number of hotels to scrape from the city
cutNo_review = 50  # number of reviews to scrape from each hotel

hotel_run(base_url, sub_url, city, cutNo_hotel, cutNo_review)


******* Starting time[2017-05-04 12:37:42] Scraping reviews for 40 hotels ******************
	Starting time[2017-05-04 12:37:50] 0: Scraping 50 reviews of Little_National_Hotel
	Starting time[2017-05-04 12:39:31] 1: Scraping 50 reviews of Aria_Hotel_Canberra
	Starting time[2017-05-04 12:41:16] 2: Scraping 50 reviews of East_Hotel
	Starting time[2017-05-04 12:42:41] 3: Scraping 50 reviews of Forrest_Hotel_And_Apartments
