In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess

from selenium import webdriver
import pandas as pd

In [2]:
class RedditbotSpider(scrapy.Spider):
    name = 'redditbot'
    allowed_domains = ['www.reddit.com/r/gameofthrones']
    start_urls = ['http://www.reddit.com/r/gameofthrones//']
    
    def parse(self, response):
        #Extracting the content using css selectors
        titles = response.css('.title.may-blank::text').extract()
        votes = response.css('.score.unvoted::text').extract()
        times = response.css('time::attr(title)').extract()
        comments = response.css('.comments::text').extract()
       
        #Give the extracted content row wise
        for item in zip(titles,votes,times,comments):
            #create a dictionary to store the scraped info
            scraped_info = {
                'title' : item[0],
                'vote' : item[1],
                'created_at' : item[2],
                'comments' : item[3],
            }

            #yield or give the scraped info to scrapy
            yield scraped_info
 
        # Getting the information needed to continue to the next ten entries.
        next_page = response.xpath('next-buttons').extract_first()
        
        # Recursively calling the spider to process the next ten entries, if they exist.
        if next_page is not None:
            next_page = '{}&lhcontinue={}'.format(self.start_urls[0],next_page)
            yield scrapy.Request(next_page, callback=self.parse)
  

In [3]:
process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': 'GoT.json',
    # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
    'ROBOTSTXT_OBEY': False,
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'LOG_ENABLED': False,
    # We use CLOSESPIDER_PAGECOUNT to limit our scraper to the first 100 links.    
    'CLOSESPIDER_PAGECOUNT' : 10
})

# Starting the crawler with our spider.
process.crawl(RedditbotSpider)
process.start()


In [4]:
# Checking whether we got data 

GoT=pd.read_json('GoT.json', orient='records')
print(GoT.shape)
print(GoT.tail())

(27, 4)
        comments                    created_at  \
22     1 comment  Sun Mar 11 08:17:41 2018 UTC   
23   17 comments  Sat Mar 10 18:17:39 2018 UTC   
24     1 comment  Sat Mar 10 16:24:49 2018 UTC   
25  102 comments  Sat Mar 10 15:40:26 2018 UTC   
26    3 comments  Sun Mar 11 03:48:22 2018 UTC   

                                                title  vote  
22  [No Spoilers] Game of Thrones themed wall plat...    11  
23  [NO SPOILERS] Why are there missing GoT episod...     9  
24  [Spoilers] The Possible Fate of Sansa Stark Se...     0  
25  [SPOILERS] The full circle nature of the separ...  2256  
26  [NO SPOILERS] Local marching band rehearsing t...     4  
