In [1]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

from bs4 import BeautifulSoup
import requests as req

import json
import w3lib.html

from lxml import html
from tqdm import tqdm

In [6]:
class Scraper():
    def __init__(self):
        pass
    
    def index_archive(self):
        # Create a list of all monthly sitemap urls
        date_input_format = '%m-%Y'
        date_nrc_format = '%Y-%m'

        # start_date = '10-1970'
        # test start date
        start_date = '2-2022'
        end_date = datetime.today().strftime(date_input_format)

        def diff_month(d1, d2):
            return (d1.year - d2.year) * 12 + d1.month - d2.month

        start = datetime.strptime(start_date, date_input_format)
        end = datetime.strptime(end_date, date_input_format)

        date_range= [start + relativedelta(months=x) for x in range(0, diff_month(end, start)+1)]

        self.archive_index = [f"https://www.nrc.nl/sitemap/{date.strftime(date_nrc_format)}.xml" for date in date_range]
        
    def index_urls(self):
        
        def fetchArchiveURL(url):
            document = {}

            try:
                r = req.get(url)
                r.raise_for_status()
            except req.exceptions.HTTPError as err:
                # TODO: add error handling
                pass
            else:        
                document['url'] = url
                document['raw_xml'] = r.content
                document['in_cache_date'] = datetime.utcnow()

            return document
        
        def fetchArticleURL(url):
            doc = fetchArchiveURL(url)
            raw_xml = doc['raw_xml']
            soup = BeautifulSoup(raw_xml, 'xml')

            article_urls = []
            locs = soup.find_all('loc')
            for loc in locs:
                article_urls.append(loc.get_text())

            return article_urls
        
        self.article_urls = []
        print('Indexing urls:')
        for url in tqdm(self.archive_index):
            self.article_urls.extend(fetchArticleURL(url))
    
    def crawl_articles(self):
        
        def fetchArticle(url):
            document = {}

            try:
                r = req.get(url)
                r.raise_for_status()

            except req.exceptions.HTTPError as err:
                # TODO: add error handling
                pass

            else:        
                document['url'] = url
                document['raw_html'] = r.content
                document['in_cache_date'] = datetime.utcnow()
                document['parsed'] = 0

            return document
        
        self.articles = []
        print('Crawling articles:')
        print(self.article_urls[:10])
        for url in tqdm(self.article_urls[:10]):
            self.articles.append(fetchArticle(url))
    
    def clean_articles(self):
        
        def parse_article(r):

            a = {}

            a['url']  = r['url']

            tree = html.fromstring(r['raw_html'])

            try:
                a['timestamp'] = tree.xpath('/html/body/main/div[2]/div/article/div/div/div[1]/div/section/div[1]/div//time[@class="article__byline__text"]/@datetime')[0].strip()
            except: 
                a['timestamp'] = None
            try:
                a['title'] = tree.xpath('//h1/text()')[0].strip()
            except:
                a['title'] = None
            try:
                a['publisherID'] = tree.xpath('/html/body/main/div[2]/div/article/div/div/div[1]/div/section/div[1]/div/div[1]/ul/li/a/text()')[0].strip()
            except:
                a['publisherID'] = None
            try:
                a['cleantext'] = ''.join(tree.xpath('//div[@class="content article__content"]//p/text()')[:-3])
            except:
                a['cleantext'] = None
            try:
                a['category'] = tree.xpath('/html/body/a/div/div/h2/text()')[0].strip()
            except:
                a['category'] = None

            return a
        
        print('Cleaning articles:')
        self.cleaned_articles = []
        for article in tqdm(self.articles):
            self.cleaned_articles.append(parse_article(article))
    
    def save_json(self):
        
        with open('NRC.json', 'w') as file:
            json.dump(self.articles, file, indent=4)
            
    

In [3]:
scraper = Scraper()
scraper.index_archive()
scraper.index_urls()
scraper.crawl_articles()
scraper.clean_articles()

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Indexing urls:


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.46it/s]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

Crawling articles:
['https://www.nrc.nl/nieuws/2022/02/24/lesje-zoete-aardappel-a4093145', 'https://www.nrc.nl/nieuws/2022/02/24/oekraine-verbreekt-diplomatieke-banden-met-moskou-a4093296', 'https://www.nrc.nl/nieuws/2022/02/24/oekraine-claimt-vijftig-russische-militairen-te-hebben-gedood-a4093295', 'https://www.nrc.nl/nieuws/2022/02/24/loekasjenko-kan-troepen-sturen-indien-nodig-stelt-onderhandelingen-voor-a4093294', 'https://www.nrc.nl/nieuws/2022/02/24/reisadvies-verlaat-oekraine-als-dit-veilig-kan-geen-evacuatie-door-overheid-a4093293', 'https://www.nrc.nl/nieuws/2022/02/24/ik-heb-gehakt-aangevuld-met-tahoe-a4093142', 'https://www.nrc.nl/nieuws/2022/02/24/vrij-a4093159', 'https://www.nrc.nl/nieuws/2022/02/24/en-daarna-toch-maar-weer-zoenen-in-het-bejaardentoilet-a4093158', 'https://www.nrc.nl/nieuws/2022/02/24/niet-te-wild-en-niet-te-duf-a4093154', 'https://www.nrc.nl/nieuws/2022/02/24/kiev-is-in-een-slechte-droom-beland-a4093292']


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.47it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 250.51it/s]

Cleaning articles:





In [4]:
scraper.save_json()

TypeError: Object of type bytes is not JSON serializable