In [1]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd

import time
import random

from bs4 import BeautifulSoup
import requests as req
from fake_useragent import UserAgent

import json
import w3lib.html

from lxml import html
from tqdm import tqdm

In [8]:
class Scraper():
    def __init__(self):
        self.ua = UserAgent()
        self.headers = req.utils.default_headers()
    
    def index_archive(self):
        date_range = list(pd.date_range('2021-02-01', '2022-03-08', freq='D'))

        dates = []
        for d in date_range:
            dates.append(str(d)[:10].replace('-', '/'))

        self.archive_index = [f"https://www.trouw.nl/archief/{date}" for date in dates]
        
    def index_urls(self):
        
        def fetchArchiveURL(url):
            document = {}

            try:
                # Random User Agent
                self.headers.update({'User-Agent': self.ua.random,})
                r = req.get(url)
                r.raise_for_status()
            except req.exceptions.HTTPError as err:
                # TODO: add error handling
                pass
            else:        
                document['url'] = url
                document['raw_html'] = r.content
                document['in_cache_date'] = datetime.utcnow()

            return document
        
        def parseArchiveURL(url):
            doc = fetchArchiveURL(url)
            raw_html = doc['raw_html']
            soup = BeautifulSoup(raw_html, 'html')
            tree = html.fromstring(raw_html)

            try:
                article_urls = tree.xpath('/html/body/main/div[2]/*/a[@class="teaser__link"]/@href')
            except: 
                article_urls = None

            return article_urls
        
        self.article_urls = []
        print('Indexing urls: ')
        for url in tqdm(self.archive_index):
            # Simulate human usage 
            time.sleep(random.randint(0, 3))
            self.article_urls.extend(parseArchiveURL(url))
        
        self.article_urls = ["http://trouw.nl" + x for x in self.article_urls]
    
    def crawl_articles(self):
        
        def fetchArticle(url):
            document = {}

            try:
                # Random User Agent
                self.headers.update({'User-Agent': self.ua.random,})
                r = req.get(url)
                r.raise_for_status()

            except req.exceptions.HTTPError as err:
                pass

            else:        
                document['url'] = url
                document['raw_html'] = r.content
                document['in_cache_date'] = datetime.utcnow()
                document['parsed'] = 0

            return document
        
        self.articles = []
        print('Crawling articles: ')
        for url in tqdm(self.article_urls):
            # Simulate human usage
            time.sleep(random.randint(0, 3))
            self.articles.append(fetchArticle(url))
    
    def clean_articles(self):
        
        def parse_article(r):
            a = {}
            
            try:
                a['url']  = r['url']
                tree = html.fromstring(r['raw_html'])
                
                try:
                    a['timestamp'] = tree.xpath('/html/body/main/article/header/section/time[@class="artstyle__production__datetime"]/@datetime')[0].strip()
                except: 
                    a['timestamp'] = None
                try:
                    a['title'] = tree.xpath('//h1/text()')[0].strip()
                except:
                    a['title'] = None
                try:
                    a['publisherID'] = tree.xpath('/html/body/main/article/header/section/span[1]/a/text()')[0].strip()
                except:
                    a['publisherID'] = None
                try:
                    a['cleantext'] = ''.join(tree.xpath('//html/body/main/article/section/section/*/text()')[1:])
                except:
                    a['cleantext'] = None
                try:
                    a['category'] = tree.xpath('/html/body/main/section/div[2]/h2/span/a/text()')[0].strip()
                except:
                    a['category'] = None
                
            except:
                pass
            


            return a
        
        print('Cleaning articles:')
        self.cleaned_articles = []
        for article in tqdm(self.articles):
            self.cleaned_articles.append(parse_article(article))
    
    def save_json(self):
        
        with open('Trouw.json', 'w') as file:
            json.dump(self.cleaned_articles, file, indent=4)
            
    

In [3]:
scraper = Scraper()
scraper.index_archive()
scraper.index_urls()
scraper.crawl_articles()


  0%|                                                                                          | 0/401 [00:00<?, ?it/s]

Indexing urls: 


100%|████████████████████████████████████████████████████████████████████████████████| 401/401 [13:12<00:00,  1.98s/it]
  0%|                                                                                        | 0/23060 [00:00<?, ?it/s]

Crawling articles: 


100%|█████████████████████████████████████████████████████████████████████████| 23060/23060 [14:11:09<00:00,  2.21s/it]
  0%|                                                                              | 31/23060 [00:00<01:16, 301.48it/s]

Cleaning articles:


 51%|██████████████████████████████████████                                     | 11716/23060 [00:36<00:35, 322.81it/s]


KeyError: 'url'

In [9]:
scraper.clean_articles()


  0%|                                                                              | 28/23060 [00:00<01:24, 274.05it/s]

Cleaning articles:


 51%|██████████████████████████████████████                                     | 11716/23060 [00:37<00:36, 311.79it/s]


KeyError: 'url'

In [10]:
scraper.save_json()