# Web scraping Center Parcs Le Bois aux Daims

## 1. Importing packages 

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
import sys
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
import json
import logging
import pandas as pd

## 2. Defining classes and functions

In [14]:
class HotelreviewsItem(scrapy.Item):
    """Container for the data scraped from one hotel review page.

    A ``scrapy.Item`` only accepts keys that are declared as ``scrapy.Field()``
    attributes; assigning any other key raises ``KeyError``.  Every key that
    ``MySpider.parse_review_page`` assigns is therefore declared here.
    """

    reviewer_id = scrapy.Field()
    rating = scrapy.Field()
    review = scrapy.Field()
    title = scrapy.Field()
    trip_date = scrapy.Field()
    trip_type = scrapy.Field()
    published_date = scrapy.Field()
    hotel_type = scrapy.Field()
    hotel_name = scrapy.Field()
    review_language = scrapy.Field()
    review_id = scrapy.Field()
    price_range = scrapy.Field()
    # The two fields below are assigned by the spider but were previously
    # undeclared.  The spelling "hotel_adress" matches the key the spider uses.
    image_url = scrapy.Field()
    hotel_adress = scrapy.Field()


In [12]:
def user_info_splitter(raw_user_info):
    """
    Build a dictionary from the whitespace-separated tokens of a string.

    Each token is handed to ``get_convertible_elements_as_dic``.  When that
    call returns a truthy result, its element ``0`` is used as the key and its
    element ``1`` as the value; tokens yielding a falsy result are skipped.
    A later token with the same key overwrites an earlier one.

    :param raw_user_info: string that is split on runs of whitespace.
    :return: dict assembled from the converted tokens (empty when the string
        holds no tokens).
    """
    parsed = {}

    for token in raw_user_info.split():
        pair = get_convertible_elements_as_dic(token)
        if not pair:
            # Nothing convertible in this token.
            continue
        parsed[pair[0]] = pair[1]

    return parsed

## 3. JSON Pipeline

In [13]:
class JsonWriterPipeline:
    """Item pipeline that appends every item to ``booking.jl`` as one JSON line."""

    def open_spider(self, spider):
        """Open ``booking.jl`` for writing and keep the handle on ``self.file``."""
        self.file = open('booking.jl', 'w')

    def close_spider(self, spider):
        """Close the file handle opened by ``open_spider``."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as a JSON object followed by a newline.

        The item is returned unchanged so that it can continue through any
        further processing.
        """
        serialized = json.dumps(dict(item))
        self.file.write(f"{serialized}\n")
        return item

## 4. Spider

In [None]:
class MySpider(CrawlSpider):
    """Crawl the Booking.com review listing of Domaine du Bois aux Daims.

    ``parse`` follows the "next page" link of the listing and schedules one
    request per review link found on the page; ``parse_review_page`` turns a
    single review page into a ``HotelreviewsItem``.

    NOTE(review): no ``rules`` are defined and ``parse`` is overridden, which
    the Scrapy documentation advises against for ``CrawlSpider``.  The base
    class is left unchanged here so the spider's interface stays the same;
    consider switching to ``scrapy.Spider``.
    """

    name = 'BasicSpider'
    domain_url = "https://www.booking.com"
    start_urls = [
        "https://www.booking.com/hotel/fr/domaine-du-bois-aux-daims.fr.html#tab-reviews"]

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        'FEED_FORMAT':'json',
        'FEED_URI': 'booking.json'
    }

    # Absolute path to the pagination list used by the original selectors.
    _PAGINATION_XPATH = (
        "/html/body/div[5]/div[1]/div[3]/div[1]/div[1]/div[8]/div/div[1]"
        "/div[2]/div[2]/div[2]/div[3]/div[4]/div/div[1]/ul"
    )

    # Candidate selectors for the "next page" href, tried in order.
    # NOTE(review): the original pointed at the <svg>/<path> inside the
    # pagination anchor, which carries no URL; the anchor's own @href is
    # presumably what was meant -- confirm against the live markup.
    _NEXT_PAGE_XPATHS = (
        "//a[contains(@class,'nav') and contains(@class,'next') and contains(@class,'primary')]/@href",
        _PAGINATION_XPATH + "/li[3]/a/@href",
    )

    @staticmethod
    def _first(response, xpath):
        """Return the first string ``xpath`` extracts from ``response``, or ``None``."""
        return next(iter(response.xpath(xpath).extract()), None)

    def parse(self, response):
        """Schedule the next listing page and every review linked from this one.

        :param response: response for one page of the review listing.
        :return: generator of ``scrapy.Request`` objects.
        """
        next_href = None
        for xpath in self._NEXT_PAGE_XPATHS:
            next_href = self._first(response, xpath)
            if next_href:
                break

        # The last page has no "next" link: simply stop paginating instead of
        # raising IndexError.
        if next_href:
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

        # Request each distinct review URL once per listing page.
        seen_urls = set()
        for partial_review_url in response.xpath("//div[contains(@class,'quote')]/a/@href").extract():
            review_url = response.urljoin(partial_review_url)
            if review_url in seen_urls:
                continue
            seen_urls.add(review_url)
            yield scrapy.Request(review_url, callback=self.parse_review_page)

    def parse_review_page(self, response):
        """Build a ``HotelreviewsItem`` from a single review page.

        Most fields come from the page's first ``application/ld+json`` script;
        the remaining ones are read with XPath selectors.

        NOTE: the keys ``image_url`` and ``hotel_adress`` must be declared as
        fields on ``HotelreviewsItem``, otherwise assigning them raises
        ``KeyError``.

        :param response: response for one review page.
        :return: generator yielding at most one item.
        """
        item = HotelreviewsItem()

        item["reviewer_id"] = self._first(
            response,
            "/html/body/div[5]/div[1]/div[3]/div[1]/div[1]/div[8]/div/div[1]/div[2]/div[2]/div[2]/div[3]/ul/li[2]/div/div[2]/div/div[1]/div/div[2]/span[1]")
        item["review_language"] = self._first(
            response,
            "/html/body/div[5]/div[1]/div[3]/div[1]/div[1]/div[8]/div/div[1]/div[2]/div[2]/div[2]/div[1]/div[2]/div/div/label[1]/input")
        item["review_id"] = self._first(
            response,
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-reviewid")
        review_id = item["review_id"]

        ld_json_blocks = response.xpath('//script[@type="application/ld+json"]/text()').extract()
        if not ld_json_blocks:
            self.logger.warning("No JSON-LD block found on %s", response.url)
            return

        # SECURITY: this used to be eval() on text downloaded from the web,
        # which executes arbitrary code.  The script is JSON, so parse it as
        # JSON instead (this also handles true/false/null correctly).
        try:
            review = json.loads(ld_json_blocks[0])
        except ValueError:
            self.logger.warning("Unparseable JSON-LD block on %s", response.url)
            return

        item["review"] = review["reviewBody"].replace("\\n", "")
        item["title"] = review["name"]
        item["rating"] = review["reviewRating"]["ratingValue"]
        item["image_url"] = review["image"]
        item["hotel_type"] = review["itemReviewed"]["@type"]
        item["hotel_name"] = review["itemReviewed"]["name"]
        item["price_range"] = review["itemReviewed"]["priceRange"]
        item["hotel_adress"] = review["itemReviewed"]["address"]

        # Prefer the structured publication date; fall back to the page markup.
        published_date = review.get("datePublished")
        if published_date is None:
            published_date = self._first(
                response,
                f"//div[contains(@id,'review_{review_id}')]/div/div/span[@class='ratingDate']/@title")
        item["published_date"] = published_date

        # The selectors never raise; a miss shows up as None, so the fallback
        # is keyed on that rather than on an exception.  The fallback uses the
        # current review's id instead of a hard-coded one.
        trip_date = self._first(
            response,
            "//div[contains(@class,"
            "'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div["
            "contains(@class,'prw_reviews_stay_date_hsx')]/text()")
        if trip_date is None:
            trip_date = self._first(
                response,
                f"//div[contains(@id,'review_{review_id}')]/div/div/div[@data-prwidget-name='reviews_stay_date_hsx']/text()")
        item["trip_date"] = trip_date

        yield item
