In [14]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

import time
import random

from bs4 import BeautifulSoup
import requests as req
from fake_useragent import UserAgent
import pandas as pd

import json
import w3lib.html

from lxml import html
from tqdm import tqdm

In [15]:
class Scraper():
    """Scrape article metadata and text from the parool.nl day archive.

    The methods form a pipeline; each step stores its result on the instance
    so the next step can pick it up:

    1. ``index_archive``  -> ``self.archive_urls``     (one archive page per day)
    2. ``index_urls``     -> ``self.article_urls``     (article links on those pages)
    3. ``crawl_articles`` -> ``self.articles``         (raw HTML per article)
    4. ``clean_articles`` -> ``self.cleaned_articles`` (extracted fields)
    5. ``save_json``      -> JSON file on disk

    Failed downloads are reported via ``tqdm.write`` and skipped; they never
    abort the run and never produce empty records.
    """

    BASE_URL = "http://parool.nl"
    ARCHIVE_URL = "https://www.parool.nl/archief/{date}"
    # Upper bound (inclusive, whole seconds) of the random pause before a request.
    MAX_DELAY_SECONDS = 3
    # Seconds to wait for a response before giving up on a URL.
    REQUEST_TIMEOUT = 30

    ARCHIVE_LINK_XPATH = '/html/body/main/div[2]/*/a[@class="teaser__link"]/@href'
    TIMESTAMP_XPATH = '/html/body/main/article/header/section/time[@class="artstyle__production__datetime"]/@datetime'
    TITLE_XPATH = '//h1/text()'
    PUBLISHER_XPATH = '/html/body/main/article/header/section/span[1]/a/text()'
    BODY_XPATH = '//html/body/main/article/section/section/*/text()'
    CATEGORY_XPATH = '/html/body/main/section/div[2]/h2/a/span/text()'

    def __init__(self):
        self.ua = UserAgent()
        self.headers = req.utils.default_headers()
        # Pre-create the pipeline state so a step that is run out of order
        # sees an empty list instead of raising AttributeError.
        self.archive_urls = []
        self.article_urls = []
        self.articles = []
        self.cleaned_articles = []

    def index_archive(self, start='2007-01-01', end='2022-03-04'):
        """Build one archive-page URL per calendar day in ``[start, end]``.

        Args:
            start: First day (inclusive), anything ``pd.date_range`` accepts.
            end: Last day (inclusive), anything ``pd.date_range`` accepts.

        Side effects:
            Sets ``self.archive_urls`` to the list of URLs, oldest first.
        """
        self.archive_urls = [
            self.ARCHIVE_URL.format(date=day.strftime('%Y/%m/%d'))
            for day in pd.date_range(start, end, freq='D')
        ]

    def index_urls(self, limit=10):
        """Collect article URLs from the indexed archive pages.

        Args:
            limit: How many archive pages (from the start of
                ``self.archive_urls``) to visit. ``None`` visits all of them.

        Side effects:
            Sets ``self.article_urls`` to absolute article URLs. Archive pages
            that cannot be fetched or parsed contribute no URLs.
        """
        relative_urls = []
        print('Indexing urls: ')
        for url in tqdm(self.archive_urls[:limit]):
            self._pause()
            relative_urls.extend(self._parse_archive_page(url))

        self.article_urls = [self.BASE_URL + x for x in relative_urls]

    def crawl_articles(self):
        """Download every URL in ``self.article_urls``.

        Side effects:
            Sets ``self.articles`` to a list of document dicts with the keys
            ``url``, ``raw_html`` (bytes), ``in_cache_date`` and ``parsed``.
            URLs whose download failed are skipped.
        """
        self.articles = []
        print('Crawling articles: ')
        for url in tqdm(self.article_urls):
            self._pause()
            document = self._fetch(url)
            if document is None:
                continue
            document['parsed'] = 0
            self.articles.append(document)

    def clean_articles(self):
        """Extract structured fields from every document in ``self.articles``.

        Side effects:
            Sets ``self.cleaned_articles`` to a list of dicts with the keys
            ``url``, ``timestamp``, ``title``, ``publisherID``, ``cleantext``
            and ``category``. A field that is not found on the page is ``None``.
            Documents without ``url``/``raw_html`` (failed fetches) are skipped.
        """
        print('Cleaning articles:')
        self.cleaned_articles = []
        for article in tqdm(self.articles):
            if 'url' not in article or 'raw_html' not in article:
                continue
            self.cleaned_articles.append(self._parse_article(article))

    def save_json(self, path='HetParool.json'):
        """Write ``self.cleaned_articles`` to ``path`` as indented JSON.

        Args:
            path: Destination file; overwritten if it exists.
        """
        with open(path, 'w', encoding='utf-8') as file:
            json.dump(self.cleaned_articles, file, indent=4)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _pause(self):
        """Sleep a random whole number of seconds to space out requests."""
        time.sleep(random.randint(0, self.MAX_DELAY_SECONDS))

    def _fetch(self, url):
        """Download ``url``; return a document dict, or ``None`` on failure."""
        # Rotate the User-Agent and pass the headers to the request; updating
        # self.headers alone has no effect on req.get().
        self.headers.update({'User-Agent': self.ua.random})
        try:
            r = req.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()
        except req.exceptions.RequestException as err:
            # Covers HTTP error statuses as well as connection errors and
            # timeouts, so one bad URL cannot abort a long crawl.
            tqdm.write(f"Skipping {url}: {err}")
            return None

        return {
            'url': url,
            'raw_html': r.content,
            'in_cache_date': datetime.utcnow(),
        }

    def _parse_archive_page(self, url):
        """Return the article hrefs found on one archive page (possibly empty)."""
        document = self._fetch(url)
        if document is None:
            return []
        try:
            tree = html.fromstring(document['raw_html'])
            return list(tree.xpath(self.ARCHIVE_LINK_XPATH))
        except Exception as err:  # lxml raises its own error types on bad markup
            tqdm.write(f"Could not parse archive page {url}: {err}")
            return []

    @staticmethod
    def _first_text(tree, xpath):
        """Return the first XPath match stripped of whitespace, or ``None``."""
        matches = tree.xpath(xpath)
        return matches[0].strip() if matches else None

    def _parse_article(self, r):
        """Turn one fetched document into a flat record of extracted fields."""
        record = {
            'url': r['url'],
            'timestamp': None,
            'title': None,
            'publisherID': None,
            'cleantext': None,
            'category': None,
        }
        try:
            tree = html.fromstring(r['raw_html'])
        except Exception as err:  # e.g. empty or unparseable response body
            tqdm.write(f"Could not parse article {r['url']}: {err}")
            return record

        record['timestamp'] = self._first_text(tree, self.TIMESTAMP_XPATH)
        record['title'] = self._first_text(tree, self.TITLE_XPATH)
        record['publisherID'] = self._first_text(tree, self.PUBLISHER_XPATH)
        # The first matched text node is dropped, as in the original extraction
        # (presumably a lead-in that is not part of the body -- TODO confirm).
        record['cleantext'] = ''.join(tree.xpath(self.BODY_XPATH)[1:])
        record['category'] = self._first_text(tree, self.CATEGORY_XPATH)
        return record
            
    

In [16]:
# Create the scraper; its constructor sets up the UserAgent generator and the default request headers.
scraper = Scraper()

In [17]:
# Build scraper.archive_urls: one archive-page URL per day from 2007-01-01 through 2022-03-04.
scraper.index_archive()

In [18]:
# Visit the first 10 archive pages and collect the article links into scraper.article_urls.
scraper.index_urls()

  0%|                                                                                       | 0/10 [00:00<?, ?it/s]

Indexing urls: 


100%|██████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00,  1.29s/it]

['http://parool.nl/nieuws/amsterdam-danst-naar-2007~b01ec144/', 'http://parool.nl/nieuws/geen-nieuwjaarsduik-scheveningen~b37ff615/', 'http://parool.nl/nieuws/veel-vernielingen-in-zuid-holland~b9a53fad/', 'http://parool.nl/nieuws/veel-autobranden-in-den-haag~be47b517/', 'http://parool.nl/nieuws/vuurwerk-eist-leven~bb994a04/', 'http://parool.nl/nieuws/onderzoek-naar-opnames-executie-saddam~bbcb30a3/', 'http://parool.nl/nieuws/recherche-wil-weer-infiltranten~b0f70b3f/', 'http://parool.nl/nieuws/xaviera-hollander-getrouwd~b58f3d0a/', 'http://parool.nl/nieuws/oud-nieuw-blijft-rustig~b07a0e1a/', 'http://parool.nl/nieuws/veel-meer-asbest-in-otapan~b5e2423a/', 'http://parool.nl/nieuws/aex-opent-nieuw-jaar-boven-500~b4dc55ce/', 'http://parool.nl/sport/van-barneveld-wint-wk-darts~bca085c78/', 'http://parool.nl/nieuws/olieprijs-voor-het-eerst-op-100-dollar-per-vat~bbea2dbc/', 'http://parool.nl/nieuws/cohen-nog-niet-tevreden~bbf39f01/', 'http://parool.nl/nieuws/schietpartij-na-achtervolging~b0815




In [19]:
# Download every URL in scraper.article_urls (slow: sleeps 0-3 s before each request).
scraper.crawl_articles()

  0%|                                                                                      | 0/209 [00:00<?, ?it/s]

Crawling articles: 


100%|████████████████████████████████████████████████████████████████████████████| 209/209 [07:19<00:00,  2.10s/it]


In [20]:
# Extract url, timestamp, title, publisherID, cleantext and category from each downloaded page.
scraper.clean_articles()

 11%|████████▋                                                                   | 24/209 [00:00<00:00, 235.92it/s]

Cleaning articles:


100%|███████████████████████████████████████████████████████████████████████████| 209/209 [00:00<00:00, 237.59it/s]


In [21]:
# Write scraper.cleaned_articles to HetParool.json (relative path, so it lands in the working directory).
scraper.save_json()