In [87]:
from neo4j.v1 import GraphDatabase, basic_auth
import requests as r
from requests.exceptions import *
import time
from lxml import html, etree
import sys
import dateparser
import re

import json

We override Jupyter's default logging pipe and send all log output to the file *scraping.log*.

In [88]:
import logging
import os

# Route everything logged through the root logger to scraping.log.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Re-running this cell must not attach a second FileHandler for the same file,
# otherwise every message is written to scraping.log once per execution.
# FileHandler stores the absolute path of its target in `baseFilename`.
_logPath = os.path.abspath('scraping.log')
fhandler = next(
    (handler for handler in logger.handlers
     if isinstance(handler, logging.FileHandler) and handler.baseFilename == _logPath),
    None,
)
if fhandler is None:
    fhandler = logging.FileHandler(filename='scraping.log', mode='a')
    logger.addHandler(fhandler)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)

# Initialize our database connection
The results are stored in a NEO4J database. Make sure you have one, with the BOLT protocol enabled, running locally.

In [89]:
import os

# Connection settings are read from the environment so that real credentials
# do not have to be stored in the notebook. The fallbacks are the values that
# were previously hard-coded, so an unconfigured local setup keeps working.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "pass")

driver = GraphDatabase.driver(NEO4J_URI, auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD))
# One session is shared by the whole notebook; the scraping loop at the end uses it.
session = driver.session()

# Create page and vote objects
Scraping code is generally ugly, period. However, to create a bit more structure we define a few useful classes.

## Exceptions
Since scraping is pretty error-prone due to the many HTTP requests, we define some exceptions to handle errors neatly.

In [90]:
class InvalidResponse(Exception):
    """Raised when a request completed but its response cannot be used.

    Scraper.scrape raises it for a status code other than 200 or for a
    response without content, and treats it as a reason to wait and retry.
    """
    def __init__(self, *args):
        # Zero-argument super() resolves to Exception. The earlier
        # super(Exception, self) named the wrong class and skipped Exception
        # in the method resolution order.
        super().__init__(*args)
        
class ScraperException(Exception):
    """Raised by Scraper.scrape when a URL could not be fetched within the allowed tries."""
    def __init__(self, *args):
        # Zero-argument super() resolves to Exception. The earlier
        # super(Exception, self) named the wrong class and skipped Exception
        # in the method resolution order.
        super().__init__(*args)

Our general Scraper object will form the basis for each scraping operation.

In [91]:
class Scraper(object):
    """Base class: fetch a URL with retries and hand the response to ``parse``.

    Subclasses implement ``parse(response)``; ``scrape(url)`` returns whatever
    ``parse`` returns.
    """
    def __init__(self, 
                 maxTries = 3,
                 sleepTimeOnError = 30,
                 accept404 = False):
        """
        maxTries         -- maximum number of HTTP requests made for one URL.
        sleepTimeOnError -- seconds to wait after an unusable response before retrying.
        accept404        -- when True, a 404 is not an error: ``scrape`` returns False.
        """
        self.maxTries = maxTries
        self.sleepTime = sleepTimeOnError
        self.logger = logging.getLogger('Scraper')
        self.accept404 = accept404
        
    def scrape(self, url):
        """Request ``url`` and return ``self.parse(response)``.

        Returns False when the server answers 404 and ``accept404`` is set.
        Raises ScraperException when no usable response was obtained within
        ``maxTries`` requests.
        """
        # Count attempts from 1 to maxTries inclusive. The earlier loop tested
        # `num >= maxTries` before each request, so maxTries=3 produced only two
        # requests and maxTries=1 never issued a request at all.
        for num in range(1, self.maxTries + 1):
            try:
                resp = r.get(url)
                if resp.status_code == 404 and self.accept404:
                    return False
                if resp.status_code != 200 or resp.content is None:
                    raise InvalidResponse('Request was succesfull, but got no valid response.')
            except RequestException:
                # If we encounter an error of Requests, just retry immediately
                self.logger.warning('Encountered a request exception. URL: {0} TRY: {1}'.format(url, num))
            except InvalidResponse:
                # If the request was successful but we don't like the response
                # sleep some time before retrying.
                self.logger.warning('Encountered a Invalid Response. URL: {0} TRY: {1}'.format(url, num))
                time.sleep(self.sleepTime)
            else:
                # parse() runs outside the try block so that exceptions it
                # raises are never mistaken for a failed request.
                return self.parse(resp)

        raise ScraperException('Maximum number of tries exceeded.')

    def parse(self, response):
        """Turn a successful response into a result; must be overridden."""
        raise NotImplementedError('Parser method should be defined in an inherited class')
    
    
            

The PaginatedScraper is used to iterate over content that is spread across multiple pages.

In [92]:
class PaginatedScraper(Scraper):
    """Iterator that scrapes one page per step.

    ``urlUpdate`` is a zero-argument callable that returns the URL of the next
    page each time it is called. Every step yields the value returned by
    ``parse`` for that page.
    """
    def __init__(self, 
                 urlUpdate, 
                 timeBetween = 0, 
                 maxTries = 3, 
                 maxIterations = 0, 
                 sleepTimeOnError = 30):
        """
        urlUpdate        -- callable returning the next URL to scrape.
        timeBetween      -- seconds to sleep before each page request.
        maxTries         -- passed on to Scraper.
        maxIterations    -- maximum number of pages to yield; 0 means no limit.
        sleepTimeOnError -- passed on to Scraper.
        """
        super().__init__(maxTries,
                        sleepTimeOnError)
        self.urlUpdate = urlUpdate
        self.timeBetween = timeBetween
        self.maxIterations = maxIterations
        self.logger = logging.getLogger("PaginatedScraper")
        # Also set here so that next() works without a preceding iter().
        self.iteration = 1
    
    def __iter__(self):
        self.iteration = 1
        return self  
    
    def __next__(self):
        # A loop instead of recursion: a long run of failing URLs must not be
        # able to exhaust the recursion limit.
        while True:
            # Check the limit before consuming a URL or sleeping. `iteration`
            # starts at 1 and counts yielded pages, so `>` (not `>=`) is what
            # allows exactly maxIterations pages.
            if self.maxIterations != 0 and self.iteration > self.maxIterations:
                raise StopIteration()

            # Call the urlUpdate closure to get the url for this iteration
            url = self.urlUpdate()

            # Sleep the desired time between updates
            time.sleep(self.timeBetween)

            try:
                scrapedPage = super().scrape(url)
            except ScraperException:
                self.logger.error("Scraping failed even after retries. Skipping the URL. URL: {0}".format(url))
                continue

            self.iteration += 1
            return scrapedPage
    
    def parse(self, response):
        """Must be overridden by a concrete paginated scraper."""
        raise NotImplementedError('This class should not directly be initiated.')

The VotePagesScraper is a PaginatedScraper specific for the vote pages on tweedekamer.nl

In [93]:
class VotePagesScraper(PaginatedScraper):
    """PaginatedScraper for the vote result listing; each page yields vote URLs."""
    BASE_URL = "https://www.tweedekamer.nl/kamerstukken/"

    def parse(self, response):
        """Return an iterator over the vote URLs linked from one listing page.

        Raises StopIteration when the page lists no votes, which ends the
        pagination loop driving this scraper.
        """
        content = response.content
        pageTree = html.fromstring(content)
        pathString = '//li/div[@class="search-result-content"]/h3/a/@href'
        votes = pageTree.xpath(pathString)
        
        # If the current page is empty, stop scraping. This check has to run
        # eagerly in a plain method: when parse() was itself a generator the
        # StopIteration was only raised on the first next() of the page
        # generator, where it becomes a RuntimeError (PEP 479) or, on older
        # interpreters, merely ends that one page without stopping pagination.
        if len(votes) < 1:
            raise StopIteration()

        return self._iterVoteUrls(votes)

    def _iterVoteUrls(self, votes):
        # Lazily yield absolute URLs, pausing timeBetween seconds before each one.
        for vote in votes:
            time.sleep(self.timeBetween)
            yield VotePagesScraper.BASE_URL + vote

The document object will be used to parse documents related to Votes

In [94]:
class Document(Scraper):
    """Scraper for a single document that accompanies a vote."""
    def parse(self, response):
        """Return {"text", "url"} for an XML response, otherwise just {"url"}."""
        # Compare the media type only. An exact match on the full header value
        # fails as soon as the server appends parameters, for example
        # "text/xml; charset=utf-8".
        mediaType = response.headers.get('Content-Type', '').split(';')[0].strip().lower()

        if mediaType == 'text/xml':
            tree = etree.fromstring(response.content)            
            return {
                "text": "\n".join(tree.xpath('//vrije-tekst//al//text()')),
                "url": response.url
            }
        return {
            "url": response.url
        }

The MetaData Object will be used to scrape the metadata for a specific vote.

In [95]:
class MetaData(Scraper):
    """Scraper for the metadata document of a vote; extracts its themes."""
    def parse(self, response):
        """Return a list of {'major', 'minor'} dicts, empty when nothing usable is found."""
        result = []

        # Compare the media type only. An exact match on the full header value
        # fails as soon as the server appends parameters, for example
        # "text/xml; charset=utf-8".
        mediaType = response.headers.get('Content-Type', '').split(';')[0].strip().lower()

        if mediaType == 'text/xml':
            tree = etree.fromstring(response.content)
            themes = tree.xpath('//metadata[@scheme="OVERHEID.TaxonomieBeleidsagenda"]/@content')
            
            for theme in themes:
                # A theme is expected as "<major> | <minor>"; entries without
                # the separator are skipped.
                parts = theme.split(' | ')
                if len(parts) < 2:
                    continue
                result.append({
                        'major': parts[0],
                        'minor': parts[1]
                    })
        return result

The Vote object will be used to scrape multiple Votes in a VoteGroup

In [96]:
class Vote(PaginatedScraper):
    """Paginated scraper over vote detail pages; each page yields its co-submitters."""
    def parse(self, response):
        """Return a list of {"Lid", "Partij"} dicts read from one detail page.

        Raises StopIteration when the response body is shorter than 20 bytes.
        """
        body = response.content
        if len(body) < 20:
            raise StopIteration()

        tree = html.fromstring(body)
        # The first link after the "Medeindiener" label is read as the member,
        # the second one as the party.
        members = tree.xpath('//dt[text()="Medeindiener"]/following-sibling::dd[1]/a[1]/text()')
        parties = tree.xpath('//dt[text()="Medeindiener"]/following-sibling::dd[1]/a[2]/text()')

        coSubmitters = []
        for member, party in zip(members, parties):
            coSubmitters.append({"Lid": member, "Partij": party})
        return coSubmitters

The VoteGroup object is a representation of one or multiple votes on the same topic.

In [97]:
class VoteGroup(Scraper):
    """One or more votes on the same topic, scraped from a single overview page.

    The constructor fetches and parses the overview page immediately.
    Iterating over the instance yields one dict per vote; votes that cannot
    be parsed are logged and skipped.
    """
    BASE_URL = "https://www.tweedekamer.nl"

    # "<name>,<function>" text of a submitter that is not linked as a member
    # plus party. Written as a raw string: the same pattern used to be a plain
    # literal that relied on the invalid escape sequences "\." and "\s".
    SUBMITTER_PATTERN = re.compile(r'([A-Za-z\.\s]*),([A-Za-z\s)]*)')

    def __init__(self, url, *args, **kwargs):
        """Scrape the overview page at ``url``; other arguments go to Scraper."""
        super().__init__(*args, **kwargs)
        self.url = url
        
        self.i = 1
        
        # scrape() calls parse(), which also sets groupProperties and numberOfVotes.
        self.content, self.urls = self.scrape(self.url)
        self.logger = logging.getLogger("VoteGroupScraper")
        
        
    def getSubmitter(self, voteOverview):
        """Return a dict describing the submitter of a vote, or False if none is found."""
        lid = voteOverview.xpath('./div[@class="search-result-content"]/p[@class="submitter"]/a[1]/text()')
        partij = voteOverview.xpath('./div[@class="search-result-content"]/p[@class="submitter"]/a[2]/text()')
        
        secretary = voteOverview.xpath('./div[@class="search-result-content"]/p[@class="submitter"]/text()')
        
        # Check whether the submitter is a parliament member
        if len(lid) >0 and len(partij) > 0:
            return {
                "Naam": lid[0],
                "Partij": partij[0],
                "Type": "Kamerlid"
            }

        # If it is not a member of parliament, try to check whether it is a
        # secretary of state. The pattern is matched once and reused.
        match = VoteGroup.SUBMITTER_PATTERN.match(secretary[0]) if len(secretary) > 0 else None
        if match:
            naam, functie = match.groups()
            return {
                "Naam": naam.strip(' \t\n\r'),
                "Partij": "None",
                "Type": functie.strip(' \t\n\r')            
            }
        return False
    
    def getVoteResults(self, voteOverview):
        """Return a list with one {"partij", "voor", "tegen", "onthouding"} dict per party row."""
        resultsSection = voteOverview.xpath('./div[@class="vote-result"]')
        if len(resultsSection) == 0:
            return []

        voteResults = []
        resultRows = resultsSection[0].xpath('./table[@class="statistics"]/tbody/tr')
        for row in resultRows:
            partij = row.xpath('./td[1]/a/text()')
            zetels = row.xpath('./td[2]/text()')
            # The counts are read from the width attribute of the image in each cell.
            voor = row.xpath('./td[3]/img/@width')
            tegen = row.xpath('./td[4]/img/@width')
            
            if len(partij) > 0 and len(zetels) > 0 and (len(voor) > 0 or len(tegen) > 0):
                voor = int(voor[0]) if len(voor) > 0 else 0
                tegen = int(tegen[0]) if len(tegen) >0 else 0
                # Whatever is left of the seats after for/against is stored as abstention.
                onthouding = int(zetels[0]) - voor - tegen  
                voteResults.append({
                        "partij": partij[0],
                        "voor": voor,
                        "tegen": tegen,
                        "onthouding": onthouding
                    })
        return voteResults
                
    
    def getDocumentText(self, dossierNummer):
        """Fetch the documents for ``dossierNummer``; documents that fail are logged and left out."""
        links = ["https://zoek.officielebekendmakingen.nl/kst-{0}.xml".format(dossierNummer)]
        documents = []
        documentScraper = Document()
        
        for link in links:
            try:
                documents.append(documentScraper.scrape(link))
            except Exception:
                self.logger.exception("Could not parse document. Skipping it. URL: {0}".format(link))
                # Skip only the failing document instead of discarding the ones
                # already collected.
                continue
        return documents
    
    def getThemes(self, dossierNummer):
        """Return the list of themes for ``dossierNummer``; empty when unavailable."""
        url = "https://zoek.officielebekendmakingen.nl/kst-{0}/metadata.xml".format(dossierNummer)
        metaData = MetaData(accept404 = True)
        
        try:
            result = metaData.scrape(url)
        except ScraperException:
            return []
        # With accept404 set, scrape() returns False for a missing metadata
        # file. Callers treat the result as a list, so normalise it to [].
        return result or []

    def __iter__(self):
        self.contentIter = iter(self.content)
        
        def urlIter():
            # Build a closure that hands out the absolute detail URL of the
            # next vote each time it is called.
            remainingUrls = iter(self.urls)
            def inner():
                return VoteGroup.BASE_URL + next(remainingUrls)
            return inner
        
        self.urlIter = urlIter
        self.votesIter = iter(Vote(urlIter()))
        return self
    
    def __next__(self):
        # Loop rather than recurse when a vote has to be skipped.
        # NOTE(review): contentIter and votesIter advance independently, and
        # the paginated Vote scraper skips URLs it cannot fetch on its own, so
        # after such a skip the two may no longer refer to the same vote -
        # confirm whether that matters for the co-submitter data.
        while True:
            voteOverview = next(self.contentIter)
            try:
                medeIndieners = next(self.votesIter)
            except StopIteration:
                self.logger.error("Vote seems to have no detail page, skipping it.")
                continue
            except ScraperException:
                self.logger.error("Scraper Exception, skipping the vote.")
                continue

            try:
                dossierNummer = voteOverview.xpath('./div[@class="search-result-properties"]/p[1]/text()')[0]
                data = {
                    "group": self.groupProperties,
                    "category": voteOverview.xpath('./p[@class="search-result-category"]/text()')[0],
                    "title": voteOverview.xpath('./div[@class="search-result-content"]/h3/a[1]/text()')[0]
                        .replace('\n', '')
                        .replace('\t', '')
                        .replace('  ', ''),
                    "indiener": self.getSubmitter(voteOverview), 
                    "medeIndieners": medeIndieners,
                    "datum": 
                        int(dateparser.parse(
                            voteOverview.xpath('./div[@class="search-result-properties"]/p[@class="date"]/text()')[0],
                            settings={'TIMEZONE': 'EST'}
                        ).timestamp()),
                    "dossierNummer": dossierNummer,
                    "documents": self.getDocumentText(dossierNummer),
                    "besluit": voteOverview.xpath('./div[@class="search-result-content"]/p[@class="result"]/span/text()')[0]
                        .replace('.', ''),
                    "voteResults": self.getVoteResults(voteOverview),
                    "themes": self.getThemes(dossierNummer)
                }
            except IndexError:
                # If an index error is raised we can't parse a vote in a meaningful way. We decide to skip it.
                self.logger.error("Index Error for vote, skipping the vote.")
                continue
            return data
        
    def parse(self, response):
        """Parse the overview page; return (vote elements, detail page hrefs).

        Also stores the group's title, location, date and item count in
        ``self.groupProperties``. Raises StopIteration when the group header
        cannot be read.
        """
        content=response.content
        pageTree = html.fromstring(content)
        
        try:
            groupTitle = pageTree.xpath('//div[@class="center-content"]/h1/text()')[0].strip(' \t\n\r')
            groupLocation = pageTree.xpath('//div[@class="center-content"]/p[@class="vote-info"]/text()')[0].strip(' \t\n\r')
            groupDate = int(dateparser.parse(
                pageTree.xpath('//div[@class="center-content"]/p[@class="vote-info"]'
                               '/span[@class="date"]/text()')[0],
                settings={'TIMEZONE': 'EST'}
            ).timestamp())
        except IndexError:
            self.logger.error("Index error for votegroup, skipping it.")
            raise StopIteration()
            
        votes = pageTree.xpath('//div[@class="vote-results"]/ul/li')
        urls = pageTree.xpath('//div[@class="vote-results"]/ul/li'
                              '/div[@class="search-result-content"]/h3/a/@href')
                              
        self.numberOfVotes = len(votes)
        
        self.groupProperties = {
            "title": groupTitle
                    .replace('\n', '')
                    .replace('\t', '')
                    .replace('  ', ''),
            "location": groupLocation,
            "date": groupDate,
            "numberOfItems": self.numberOfVotes
        }        

        return votes, urls

# Scraping votes

To loop through the desired URLs we use a closure that returns the next URL every time it's called.

In [98]:
def votePagesUrl():
    """Return a closure that produces the URL of the next listing page on every call.

    The first call uses ``sta=0``; each later call raises that value by 15.
    """
    template = (
        "https://www.tweedekamer.nl/kamerstukken/stemmingsuitslagen?"
        "qry=%2A&fld_tk_categorie=Kamerstukken&"
        "fld_tk_subcategorie=Stemmingsuitslagen&"
        "srt=date%3Adesc%3Adate%2Cprl_volgorde%3Aasc%3Anum&"
        "clusterName=Stemmingsuitslagen&"
        "Type=Kamerstukken&"
        "fromdate=01%2F01%2F2016&"
        "todate=31%2F12%2F2016&"
        "dpp=15&sta={0}"
    )
    offset = 0

    def nextUrl():
        nonlocal offset
        pageUrl = template.format(offset)
        offset = offset + 15
        return pageUrl

    return nextUrl

In [99]:
def prepareQuery(vote):
    """Build the Cypher text that stores one scraped vote.

    ``vote`` is the dict yielded by a VoteGroup. Values are interpolated
    directly into the query text, so the caller has to escape them first.
    Returns the complete query as one string.
    """
    def fill(template):
        # Substitute fields of `vote` into a single query fragment.
        return template.format(vote)

    fragments = [
        # Location of the debate
        fill("MERGE (location:Location {{name:'{0[group][location]}'}})\n"),
        # Overarching topic and its link to the location
        fill("MERGE (topic:Topic{{"
             "name: '{0[group][title]}'"
             "}})\n"),
        fill("MERGE (location)<-[discussed:Discussed {{date: {0[group][date]}}}]-(topic)\n"),
        # The vote itself and its link to the topic
        fill("CREATE (vote:Vote {{"
             "title: '{0[title]}',"
             "id: '{0[dossierNummer]}',"
             "ruling: '{0[besluit]}',"
             "submitted: {0[datum]}"
             "}})\n"),
        "CREATE (vote)-[:PartOf]->(topic)\n",
    ]

    if vote['indiener']:
        fragments.extend([
            # Submitter, the submission edge, the party and the membership edge
            fill("MERGE (submitter:Person {{"
                 "name: '{0[indiener][Naam]}',"
                 "function: '{0[indiener][Type]}'"
                 "}})\n"),
            fill("CREATE (submitter)-[:Submitted {{ date: {0[datum]} }}]->(vote)\n"),
            fill("MERGE (party:Party {{ name:'{0[indiener][Partij]}' }})\n"),
            "MERGE (submitter)-[:Member]->(party)\n",
        ])

    # Each section below binds a literal list with WITH and loops over it.
    # Every loop looks the vote up again by id and ruling (presumably because
    # the WITH clause drops the earlier `vote` binding - verify).

    # Co-submitters
    fragments.append("WITH "+listOfDictsToString(vote['medeIndieners'])+ " AS cosubmitters\n")
    fragments.append(fill("FOREACH (pers IN cosubmitters | \n"
                          "MERGE (vote:Vote {{"
                          "id: '{0[dossierNummer]}',"
                          "ruling: '{0[besluit]}'"
                          "}})\n"
                          "MERGE (cosub:Person {{"
                          "name: pers.Lid,"
                          "function: 'Kamerlid'"
                          "}})\n"
                          "MERGE (cosubParty:Party {{name: pers.Partij}})\n"
                          "MERGE (cosub)-[:Member]->(cosubParty)\n"
                          "CREATE (cosub)-[:Cosubmitted]->(vote)\n"
                          ")\n"))

    # Themes
    fragments.append("WITH "+listOfDictsToString(vote['themes'])+ " AS themes\n")
    fragments.append(fill("FOREACH (theme IN themes | \n"
                          "MERGE (vote:Vote {{"
                          "id: '{0[dossierNummer]}',"
                          "ruling: '{0[besluit]}'"
                          "}})\n"
                          "MERGE (mat:MajorTheme {{"
                          "name: theme.major"
                          "}})\n"
                          "MERGE (mit:MinorTheme {{"
                          "name: theme.minor"
                          "}})\n"
                          "MERGE (mat)<-[:ParentTheme]-(mit)\n"
                          "CREATE (mit)<-[:Theme]-(vote)\n"
                          ")\n"))

    # Per-party vote results (this fragment ends without a trailing newline)
    fragments.append("WITH "+listOfDictsToString(vote['voteResults'])+ " AS voters\n")
    fragments.append(fill("FOREACH (voteRes IN voters | \n"
                          "MERGE (vote:Vote {{"
                          "id: '{0[dossierNummer]}',"
                          "ruling: '{0[besluit]}'"
                          "}})\n"
                          "MERGE (voter:Party {{name: voteRes.partij}})\n"
                          "CREATE (voter)-[:Voted {{"
                          "pro:voteRes.voor, "
                          "con:voteRes.tegen, "
                          "withold:voteRes.onthouding"
                          "}}]->(vote)\n"
                          ")"))

    # Accompanying documents
    fragments.append("WITH "+listOfDictsToString(vote['documents'])+ " AS docs\n")
    fragments.append(fill("FOREACH (doc IN docs | \n"
                          "MERGE (vote:Vote {{"
                          "id: '{0[dossierNummer]}',"
                          "ruling: '{0[besluit]}'"
                          "}})\n"
                          "CREATE (accompDoc:Document {{"
                          "url: doc.url,"
                          "text: doc.text"
                          "}})\n"
                          "CREATE (accompDoc)-[:Describes]->(vote)\n"
                          ")\n"))

    return "".join(fragments)

In [100]:
# str() cannot be used here: it would put quotes around the dictionary keys,
# which the generated query text must not contain.

def listOfDictsToString(listOfDictionaries):
    """Render a list of flat dicts as ``[{key:value,...},...]`` with unquoted keys.

    String values are wrapped in single quotes, every other value is written
    through ``format``. No spaces are inserted between entries.
    """
    if len(listOfDictionaries) == 0:
        return "[]"

    def renderEntry(key, val):
        template = "'{0}'" if isinstance(val, str) else "{0}"
        return key + ":" + template.format(val)

    renderedItems = []
    for item in listOfDictionaries:
        entries = [renderEntry(key, val) for key, val in item.items()]
        renderedItems.append("{" + ",".join(entries) + "}")

    return "[" + ",".join(renderedItems) + "]"

In [101]:
def addEscapeSequences(data):
    """Return a copy of ``data`` with every string made safe for a single-quoted query literal.

    Dicts and lists are walked recursively; other values are returned as-is.
    The input is never modified. This matters because every vote of a group
    carries the same ``group`` dict: escaping it in place would escape the
    same strings again for each following vote and corrupt them.
    """
    if isinstance(data, dict):
        return {key: addEscapeSequences(value) for key, value in data.items()}

    if isinstance(data, list):
        return [addEscapeSequences(value) for value in data]

    if isinstance(data, str):
        # Backslashes are escaped too: an unescaped trailing backslash would
        # otherwise swallow the closing quote of the literal. translate()
        # maps each character independently, so the inserted backslashes are
        # not escaped a second time.
        return data.translate(str.maketrans({"\\": "\\\\", "'": r"\'"}))

    return data

In [1]:
# Driver loop: walk the paginated vote listing, scrape every vote group linked
# from each page and store each vote in the database through `session`.

# Number of votes written so far; only used for the progress line.
k=0
# Holds the result of the most recent session.run() call.
a=None
# Not used anywhere in this cell.
c=None
for voteListPage in VotePagesScraper(votePagesUrl(), maxIterations = 1000000):
    for voteGroupUrl in voteListPage:
        try:
            for vote in VoteGroup(voteGroupUrl):
                # Escape the scraped strings, build the query text and run it.
                a=session.run(prepareQuery(addEscapeSequences(vote)))
                # Print whatever records the query returned.
                for b in a:
                    print(b)
                # "\r" rewrites the same output line on every update.
                sys.stdout.write("\rDoing thing {0}".format(k))
                k=k+1
        except Exception:
            # Best effort: a failure inside one vote group is logged with its
            # traceback and the loop moves on to the next group.
            logging.exception("Something general went wrong.")