In [24]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

import time
import random

from bs4 import BeautifulSoup
import requests as req
from fake_useragent import UserAgent

import json
import w3lib.html

from lxml import html
from tqdm import tqdm

In [25]:
class Scraper():
    """Sitemap-driven scraper for telegraaf.nl articles.

    The methods form a pipeline and are meant to be called in this order,
    each one storing its result on the instance for the next step:

    1. ``index_archive``  -> ``self.archive_urls``
    2. ``index_urls``     -> ``self.article_urls``
    3. ``crawl_articles`` -> ``self.articles``
    4. ``clean_articles`` -> ``self.cleaned_articles``
    5. ``save_json``      -> JSON file on disk

    URLs that could not be downloaded are collected in ``self.failed_urls``
    instead of aborting the run.
    """

    # Seconds to wait for a response before a request is treated as failed.
    REQUEST_TIMEOUT = 30

    # (output key, XPath expression, strip surrounding whitespace?) for each
    # field extracted from an article page, in output order.
    # NOTE(review): the absolute paths are tied to a specific page layout and
    # two of them start from different roots -- confirm against live pages.
    _ARTICLE_FIELDS = (
        ('timestamp',
         '/html/body/main/div[2]/div/article/div/div/div[1]/div/section/div[1]/div'
         '//time[@class="article__byline__text"]/@datetime',
         True),
        ('title', '//h1/text()', True),
        ('publisherID',
         '/html/body/div/article/main/div[1]/section/div[3]/div[1]/div/p/strong/text()',
         True),
        # The original expression matched only id="articleIntro1179944", which
        # looks specific to a single article; match any id with that prefix.
        ('cleantext', '//*[starts-with(@id, "articleIntro")]/text()', False),
        ('category', '/html/body/div/div[1]/a/text()', True),
    )

    def __init__(self):
        self.ua = UserAgent()
        self.headers = req.utils.default_headers()
        self.sitemap_url = 'https://www.telegraaf.nl/sitemap.xml'
        # URLs whose download failed (HTTP error, timeout, connection error).
        self.failed_urls = []

    def _fetch(self, url, body_key):
        """Download ``url`` and wrap the response body in a document dict.

        Args:
            url: Address to request.
            body_key: Key under which the raw response bytes are stored
                (``'raw_xml'`` for sitemaps, ``'raw_html'`` for articles).

        Returns:
            ``{'url', body_key, 'in_cache_date'}`` on success, or an empty
            dict when the request failed. Failures are appended to
            ``self.failed_urls`` and reported, never raised.
        """
        # Rotate the User-Agent and actually send it: the headers must be
        # passed to the request explicitly, otherwise requests uses its own.
        self.headers.update({'User-Agent': self.ua.random})
        try:
            r = req.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()
        except req.exceptions.RequestException as err:
            # Covers HTTP errors as well as timeouts and connection errors,
            # so one bad URL cannot abort a long crawl.
            self.failed_urls.append(url)
            # tqdm.write prints without corrupting an active progress bar.
            tqdm.write('Failed to fetch {}: {}'.format(url, err))
            return {}

        return {
            'url': url,
            body_key: r.content,
            'in_cache_date': datetime.utcnow(),
        }

    def _sitemap_locs(self, url):
        """Return the text of every ``<loc>`` element in the sitemap at ``url``.

        Returns an empty list when the sitemap could not be downloaded.
        """
        doc = self._fetch(url, 'raw_xml')
        if not doc:
            return []
        soup = BeautifulSoup(doc['raw_xml'], 'xml')
        return [loc.get_text() for loc in soup.find_all('loc')]

    def index_archive(self):
        """Read the root sitemap and store its entries in ``self.archive_urls``."""
        self.archive_urls = self._sitemap_locs(self.sitemap_url)

    def index_urls(self, limit=5):
        """Collect article URLs from the archive sitemaps into ``self.article_urls``.

        Requires ``index_archive`` to have been called first.

        Args:
            limit: Number of archive sitemaps to read, taken from the start of
                ``self.archive_urls``. ``None`` reads all of them. Defaults to
                5, the value that used to be hard-coded.
        """
        archive_urls = self.archive_urls if limit is None else self.archive_urls[:limit]

        self.article_urls = []
        print('Indexing urls: ')
        for url in tqdm(archive_urls):
            # Simulate human usage
            time.sleep(random.randint(0, 3))
            self.article_urls.extend(self._sitemap_locs(url))

    def crawl_articles(self):
        """Download every URL in ``self.article_urls`` into ``self.articles``.

        Each stored document has the keys ``url``, ``raw_html``,
        ``in_cache_date`` and ``parsed`` (initialised to 0). URLs that fail to
        download are skipped and listed in ``self.failed_urls``.
        """
        self.articles = []
        print('Crawling articles: ')
        for url in tqdm(self.article_urls):
            # Simulate human usage
            time.sleep(random.randint(0, 3))
            doc = self._fetch(url, 'raw_html')
            if not doc:
                # Do not store empty placeholders for failed downloads; they
                # would end up as empty records in the output file.
                continue
            doc['parsed'] = 0
            self.articles.append(doc)

    @staticmethod
    def _first_match(tree, xpath, strip):
        """Return the first result of ``xpath`` on ``tree`` as ``str``, or ``None``."""
        matches = tree.xpath(xpath)
        if not matches:
            return None
        value = str(matches[0])
        return value.strip() if strip else value

    def _parse_article(self, doc):
        """Extract the configured fields from one downloaded article.

        Args:
            doc: Document dict with at least ``url`` and ``raw_html``.

        Returns:
            Dict with ``url`` plus one key per entry in ``_ARTICLE_FIELDS``.
            A field is ``None`` when its XPath matches nothing or when the
            HTML could not be parsed at all.
        """
        record = {'url': doc['url']}
        for field, _xpath, _strip in self._ARTICLE_FIELDS:
            record[field] = None

        try:
            tree = html.fromstring(doc['raw_html'])
        except Exception as err:
            # lxml raises several parser error types (e.g. for an empty
            # document); keep the record with empty fields and report it.
            tqdm.write('Failed to parse {}: {}'.format(doc['url'], err))
            return record

        for field, xpath, strip in self._ARTICLE_FIELDS:
            record[field] = self._first_match(tree, xpath, strip)
        return record

    def clean_articles(self):
        """Parse ``self.articles`` into ``self.cleaned_articles``.

        Documents without ``url`` or ``raw_html`` are skipped.
        """
        print('Cleaning articles:')
        self.cleaned_articles = []
        for article in tqdm(self.articles):
            if 'url' not in article or 'raw_html' not in article:
                continue
            self.cleaned_articles.append(self._parse_article(article))

    def save_json(self, path='Telegraaf.json'):
        """Write ``self.cleaned_articles`` to ``path`` as indented JSON.

        Args:
            path: Output file; defaults to ``Telegraaf.json`` in the current
                working directory. An existing file is overwritten.
        """
        with open(path, 'w', encoding='utf-8') as file:
            json.dump(self.cleaned_articles, file, indent=4)
            
    

In [26]:
# Create the scraper; the constructor sets up the user-agent source, the default request headers and the sitemap URL.
scraper = Scraper()

In [27]:
# Step 1: read the root sitemap and store the URLs it lists in scraper.archive_urls.
scraper.index_archive()

In [28]:
# Step 2: collect article URLs from the first five archive sitemaps into scraper.article_urls (requires step 1).
scraper.index_urls()

  0%|                                                                                        | 0/5 [00:00<?, ?it/s]

Indexing urls: 


100%|████████████████████████████████████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.69s/it]


In [29]:
# Step 3: download the raw HTML of every indexed article into scraper.articles.
# Slow because of the randomized delay between requests: the recorded run below took about 50 minutes for 1759 URLs.
scraper.crawl_articles()

  0%|                                                                                     | 0/1759 [00:00<?, ?it/s]

Crawling articles: 


100%|██████████████████████████████████████████████████████████████████████████| 1759/1759 [49:38<00:00,  1.69s/it]


In [30]:
# Step 4: extract timestamp, title, publisherID, cleantext and category from each downloaded page into scraper.cleaned_articles.
scraper.clean_articles()

  1%|█                                                                          | 25/1759 [00:00<00:07, 247.63it/s]

Cleaning articles:


100%|█████████████████████████████████████████████████████████████████████████| 1759/1759 [00:07<00:00, 236.77it/s]


In [31]:
# Step 5: write the cleaned records to Telegraaf.json in the current working directory.
scraper.save_json()