In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
import sys
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
import json
import logging
import pandas as pd

class HotelreviewsItem(scrapy.Item):
    # define the fields for your item here like:
    rating = scrapy.Field()
    review = scrapy.Field()
    title = scrapy.Field()
    trip_date = scrapy.Field()
    trip_type = scrapy.Field()
    published_date = scrapy.Field()
    image_url = scrapy.Field()
    hotel_type = scrapy.Field()
    hotel_name = scrapy.Field()
    hotel_adress = scrapy.Field()
    price_range = scrapy.Field()
    reviewer_id = scrapy.Field()
    review_id = scrapy.Field()
    review_language = scrapy.Field()
    pid = scrapy.Field()
    locid = scrapy.Field()
    
def user_info_splitter(raw_user_info):
    """

    :param raw_user_info:
    :return:
    """

    user_info = {}

    splited_info = raw_user_info.split()
    for element in splited_info:
        converted_element = get_convertible_elements_as_dic(element)
        if converted_element:
            user_info[converted_element[0]] = converted_element[1]

    return user_info

class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('fr_trip.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

class spoods420(CrawlSpider):
    name = 'spoods_420'
    download_delay = 3
    domain_url = "https://www.tripadvisor.fr"

    start_urls = [
        "https://www.tripadvisor.fr/Hotel_Review-g5555792-d7107948-Reviews-Center_Parcs_Le_Bois_aux_Daims-Les_Trois_Moutiers_Vienne_Nouvelle_Aquitaine.html"]
    
        #Custom settings to modify settings usually found in the settings.py file 
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        'FEED_FORMAT':'json',                                 # Used for pipeline 2
        'FEED_URI': 'don4_fr_spds.json'                        # Used for pipeline 2
    }

    def parse(self, response):

        
        next_reviews_page_url = "https://www.tripadvisor.fr" + response.xpath(
            "//a[contains(@class,'nav') and contains(@class,'next') and contains(@class,'primary')]/@href").extract()[0]
        
        all_review_pages = response.xpath(
            "//a[contains(@class,'pageNum') and contains(@class,'last')]/@data-offset").extract()

        next_reviews_page_url = "https://www.tripadvisor.fr" + response.xpath(
            "//a[contains(@class,'nav') and contains(@class,'next') and contains(@class,'primary')]/@href").extract()[0]

        next_page_number = eval(response.xpath(
            "//a[contains(@class,'nav') and contains(@class,'next') and contains(@class,'primary')]/@data-page-number").extract()[
                                    0])

        if next_page_number < 400:
            yield scrapy.Request(next_reviews_page_url, callback=self.parse)

        review_urls = []
        for partial_review_url in response.xpath("//div[contains(@class,'quote')]/a/@href").extract():
            review_url = response.urljoin(partial_review_url)
            if review_url not in review_urls:
                review_urls.append(review_url)

            yield scrapy.Request(review_url, callback=self.parse_review_page)

    def parse_review_page(self, response):

        item = HotelreviewsItem()

        item["reviewer_id"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-memberid").extract()),
                                   None)
        item["review_language"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-language").extract()),
                                       None)
        item["review_id"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-reviewid").extract()),
                                 None)
        item["review_id"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-reviewid").extract()),
                                 None)

        item["pid"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-pid").extract()),
                           None)
        item["locid"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-locid").extract()),
                             None)

        review_id = item["review_id"]
        review_url_on_page = response.xpath('//script[@type="application/ld+json"]/text()').extract()
        review = eval(review_url_on_page[0])

        item["review"] = review["reviewBody"].replace("\\n", "")
        item["title"] = review["name"]
        item["rating"] = review["reviewRating"]["ratingValue"]
        item["image_url"] = review["image"]
        item["hotel_type"] = review["itemReviewed"]["@type"]
        item["hotel_name"] = review["itemReviewed"]["name"]
        item["price_range"] = review["itemReviewed"]["priceRange"]
        item["hotel_adress"] = review["itemReviewed"]["address"]
        try:
            item["published_date"] = review["datePublished"]
        except KeyError:

            item["published_date"] = next(iter(response.xpath(
                f"//div[contains(@id,'review_{review_id}')]/div/div/span[@class='ratingDate']/@title""").extract()),
                                          None)

        item["trip_type"] = next(iter(response.xpath("//div[contains(@class,"
                                                     "'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div"
                                                     "/div/div/div[contains(@class,'noRatings')]/text()").extract()),
                                 None)

        try:
            item["trip_date"] = next(iter(response.xpath("//div[contains(@class,"
                                                         "'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div["
                                                         "contains(@class,'prw_reviews_stay_date_hsx')]/text()").extract(

            )), None)

        except:

            item["trip_date"] = next(iter(response.xpath(
                "//div[contains(@id,'review_538163624')]/div/div/div[@data-prwidget-name='reviews_stay_date_hsx']/text()").extract()),
                                     None)

        # user_info = response.xpath("//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]").extract()[0]
        # item["unstructured"] = user_info_splitter(user_info)

        yield item


In [2]:
pfr4 = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

pfr4.crawl(spoods420)
pfr4.start()

2019-02-02 17:56:59 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2019-02-02 17:56:59 [scrapy.utils.log] INFO: Versions: lxml 4.3.0.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 17.5.0, Python 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.1.1a  20 Nov 2018), cryptography 2.4.2, Platform Darwin-15.6.0-x86_64-i386-64bit
2019-02-02 17:56:59 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'don4_fr_spds.json', 'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2019-02-02 17:57:01 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.tripadvisor.fr/Hotel_Review-g5555792-d7107948-Reviews-Center_Parcs_Le_Bois_aux_Daims-Les_Trois_Moutiers_Vienne_Nouvelle_Aquitaine.html> (referer: None)
Traceback (most recent call last):
  File "/Users/laks/anaconda3/lib/python3.6/site-packa

In [None]:
dfjson2 = pd.read_json('don4_frr.json')
#Previewing DF
dfjson2.head()
dfjson2.info()