In [1]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd

import time
import random

from bs4 import BeautifulSoup
import requests as req

import json
import w3lib.html

from lxml import html
from tqdm import tqdm

In [6]:
class Scraper():
    def __init__(self):
        pass
    
    def index_archive(self):
        date_range = list(pd.date_range('2020-02-01', '2022-03-07', freq='D'))

        dates = []
        for d in date_range:
            dates.append(str(d)[:10].replace('-', '/'))

        self.archive_index = [f"https://www.trouw.nl/archief/{date}" for date in dates]
        
    def index_urls(self):
        
        def fetchArchiveURL(url):
            document = {}

            try:
                r = req.get(url)
                r.raise_for_status()
            except req.exceptions.HTTPError as err:
                # TODO: add error handling
                pass
            else:        
                document['url'] = url
                document['raw_html'] = r.content
                document['in_cache_date'] = datetime.utcnow()

            return document
        
        def parseArchiveURL(url):
            doc = fetchArchiveURL(url)
            raw_html = doc['raw_html']
            soup = BeautifulSoup(raw_html, 'html')
            tree = html.fromstring(raw_html)

            try:
                article_urls = tree.xpath('/html/body/main/div[2]/*/a[@class="teaser__link"]/@href')
            except: 
                article_urls = None

            return article_urls
        
        self.article_urls = []
        print('Indexing urls: ')
        for url in tqdm(self.archive_index):
            # Simulate human usage 
            time.sleep(random.randint(0, 3))
            self.article_urls.extend(parseArchiveURL(url))
        
        self.article_urls = ["http://trouw.nl" + x for x in self.article_urls]
    
    def crawl_articles(self):
        
        def fetchArticle(url):
            document = {}

            try:
                r = req.get(url)
                r.raise_for_status()

            except req.exceptions.HTTPError as err:
                # TODO: add error handling
                pass

            else:        
                document['url'] = url
                document['raw_html'] = r.content
                document['in_cache_date'] = datetime.utcnow()
                document['parsed'] = 0

            return document
        
        self.articles = []
        print('Crawling articles: ')
        for url in tqdm(self.article_urls):
            # Simulate human usage
            time.sleep(random.randint(0, 3))
            self.articles.append(fetchArticle(url))
    
    def clean_articles(self):
        
        def parse_article(r):
            a = {}

            a['url']  = r['url']

            tree = html.fromstring(r['raw_html'])

            try:

                a['timestamp'] = tree.xpath('/html/body/main/article/header/section/time[@class="artstyle__production__datetime"]/@datetime')[0].strip()
            except: 
                a['timestamp'] = None
            try:
                a['title'] = tree.xpath('//h1/text()')[0].strip()
            except:
                a['title'] = None
            try:
                a['publisherID'] = tree.xpath('/html/body/main/article/header/section/span[1]/a/text()')[0].strip()
            except:
                a['publisherID'] = None
            try:
                a['cleantext'] = ''.join(tree.xpath('//html/body/main/article/section/section/*/text()')[1:])
            except:
                a['cleantext'] = None
            try:
                a['category'] = tree.xpath('/html/body/main/section/div[2]/h2/span/a/text()')[0].strip()
            except:
                a['category'] = None

            return a
        
        print('Cleaning articles:')
        self.cleaned_articles = []
        for article in tqdm(self.articles):
            self.cleaned_articles.append(parse_article(article))
    
    def save_json(self):
        
        with open('Trouw.json', 'w') as file:
            json.dump(self.cleaned_articles, file, indent=4)
            
    

In [7]:
scraper = Scraper()
scraper.index_archive()
scraper.index_urls()
scraper.crawl_articles()
scraper.clean_articles()
scraper.save_json()

  0%|                                                                                          | 0/766 [00:00<?, ?it/s]

Indexing urls: 


100%|████████████████████████████████████████████████████████████████████████████████| 766/766 [26:07<00:00,  2.05s/it]
  0%|                                                                                        | 0/43791 [00:00<?, ?it/s]

Crawling articles: 


 10%|███████                                                                 | 4277/43791 [2:36:37<24:07:04,  2.20s/it]


ConnectionError: HTTPSConnectionPool(host='www.trouw.nl', port=443): Max retries exceeded with url: /nieuws/nee-het-is-niet-maar-een-griepje-waar-de-vergelijking-tussen-covid-19-en-griep-mank-gaat~b295b811/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001CFC1C35F40>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))