## TV2 forside scraper


**Krav:**
- Scanne TV2 Nyheder forside med jævne mellemrum
- Scanne forside-titler for indhold af bestemte nøgleord
- Lagre information om artikler, der indeholder disse nøgleord
- Tilføje til JSON
- Kontrollere, om artiklen allerede er lagret
- Skal forvente timeouts
- Evt. mailvarsel


**JSON-opbygning:**
- entry[id]
    - newspaper_name: ...
    - newspaper_frontpage_url: ...
    - frontpage_selector: ...
    - keywords_search: [..., ..., ...]
    - keywords_match: [...]
    - article_title: ...
    - article_link: ...
    - article_html: ...
    - article_datetime: ...
    - encounter_datetime: ...

In [3]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np

import scrapy
import requests
from scrapy import Selector

import os
import sys
import datetime

import re
import time
import uuid
import random
from random import randint
from itertools import compress
import json

import pprint
pp = pprint.PrettyPrinter()

In [35]:
def keyword_check(keywords, headline):
    '''
    Tell whether any keyword occurs in the headline's ``title`` attribute.

    The comparison is case-insensitive, so the keyword 'corona' also matches
    a headline that starts with 'Corona'.

    Args:
        keywords: Iterable of substrings to look for.
        headline: Object exposing ``get('title')``, e.g. a BeautifulSoup
            ``<a>`` tag or a plain dict.

    Returns:
        True when at least one keyword is a substring of the title, False
        otherwise (including when the title attribute is missing or empty).
    '''
    # Anchors without a title attribute are simply treated as "no match"
    # instead of raising KeyError.
    text = (headline.get('title') or '').lower()
    return any(word.lower() in text for word in keywords)

def get_article_info(link, keywords=None, retries=3, timeout=5.0):
    '''
    Download one article and build the record that is stored for it.

    The page is requested up to ``retries`` times, waiting a random 2-5
    seconds before every attempt. An attempt fails when the request raises
    (timeout, connection error, ...) or the status code is not 200.

    Args:
        link: Absolute URL of the article.
        keywords: Keywords to match against the article title. When omitted,
            the module-level ``keywords`` variable is used (this is how the
            function behaved before the parameter existed).
        retries: Maximum number of download attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        dict with the keys uuid, article_accessed, newspaper_name,
        newspaper_frontpage_url, frontpage_selector, keywords_search,
        keywords_match, article_title, article_link, article_datetime and
        encounter_datetime. ``article_accessed`` is 1 when the page was
        downloaded and 0 when every attempt failed; in the latter case the
        title, datetime and match fields are empty strings.
    '''
    if keywords is None:
        # Backward compatibility: callers that pass only the link rely on
        # the global keyword list.
        keywords = globals().get('keywords', [])

    # Start from the "download failed" record; the success path below only
    # overwrites the fields that depend on the page content.
    info = {
        'uuid': str(uuid.uuid4()),
        'article_accessed': 0,
        'newspaper_name': 'TV2 Nyheder',
        'newspaper_frontpage_url': 'https://nyheder.tv2.dk/seneste',
        'frontpage_selector': "div.o-article_wrap.g-con.g-col.g-row_l.g-gutter.g-colx.u-space_t-single",
        'keywords_search': keywords,
        'keywords_match': '',
        'article_title': '',
        'article_link': link,
        'article_datetime': '',
        'encounter_datetime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M'),
    }

    for _ in range(retries):
        # Random pause before each request to avoid hammering the site.
        time.sleep(randint(2, 5))

        try:
            response = requests.get(link, timeout=timeout)
        except requests.RequestException:
            continue
        if response.status_code != 200:
            continue

        # Parse the body of the response that was just checked instead of
        # downloading the page a second time.
        soup = bs(response.content, "html.parser")

        article_title = soup.title.get_text() if soup.title is not None else ''

        published = soup.find("meta", attrs={"name": "article:published_time"})
        article_datetime = published.get('content', '') if published is not None else ''

        title_lower = article_title.lower()
        matches = [keyword for keyword in keywords if keyword.lower() in title_lower]

        info['article_accessed'] = 1
        info['keywords_match'] = matches
        info['article_title'] = article_title
        info['article_datetime'] = article_datetime
        return info

    return info

def front_page_check(url, keywords, url_list):
    '''
    Scan the front page for headlines matching the keywords.

    Args:
        url: URL of the front page to download.
        keywords: Keywords handed to ``keyword_check`` for every headline.
        url_list: Article URLs that are already stored; these are skipped.

    Returns:
        List of article records (as built by ``get_article_info``), one per
        new matching link, in the order the links appear on the page. The
        list is empty when the front page cannot be downloaded.
    '''
    from urllib.parse import urljoin

    try:
        response = requests.get(url, timeout=5.0)
    except requests.RequestException as err:
        print("Could not download front page {}: {}".format(url, err))
        return []

    soup = bs(response.content, "html.parser")

    # Headline anchors on the front page.
    headlines = soup.find_all("a", class_="o-teaser_link")

    known_urls = set(url_list)

    # A dict is used as an insertion-ordered set: duplicates collapse while
    # the page order is kept.
    new_links = {}
    for headline in headlines:
        if not keyword_check(keywords, headline):
            continue
        href = headline.get('href')
        if not href:
            continue
        # urljoin resolves protocol-relative ("//host/path"), absolute and
        # path-relative hrefs against the front page URL.
        link = urljoin(url, href)
        if link not in known_urls:
            new_links[link] = None

    # NOTE: get_article_info is called with the link only, so it uses the
    # module-level ``keywords`` rather than the argument passed here.
    return [get_article_info(link) for link in new_links]

def headline_watch(keywords, datadir, main_url = 'https://nyheder.tv2.dk/seneste'):
    '''
    Checks the frontpage and stores info about headlines matching keywords.

    Two files are maintained below ``datadir``:
      - ``urls/tv2_article_urls.txt``: one already-seen article URL per line
      - ``tv2_articles.json``: JSON list with one record per stored article

    Args:
        keywords: Keywords passed on to ``front_page_check``.
        datadir: Directory holding the data files; created if missing.
        main_url: Front page URL to scan.

    Returns:
        None. Progress and errors are reported with ``print``.
    '''
    urllist_filename = "tv2_article_urls.txt"
    data_filename = "tv2_articles.json"

    urldir = os.path.join(datadir, "urls")
    urllist_path = os.path.join(urldir, urllist_filename)
    data_path = os.path.join(datadir, data_filename)

    # Creates datadir as well when it does not exist yet.
    os.makedirs(urldir, exist_ok=True)

    try:
        with open(urllist_path, 'r') as f:
            url_list = [line.strip() for line in f if line.strip()]
    except IOError:
        print("No existing url list. Creating new file {}".format(urllist_filename))
        url_list = []

    if not os.path.isfile(data_path):
        print("No existing data file. Creating new file {}".format(data_filename))
        with open(data_path, 'w') as f:
            json.dump([], f)

    # Probe the front page, retrying once after a short pause.
    response = None
    for _ in range(2):
        try:
            response = requests.get(main_url, timeout = 5.0)
            break
        except requests.RequestException:
            time.sleep(random.uniform(0.1, 0.2))

    # Both "no response at all" and "non-200 response" are reported; the
    # latter used to return without any message.
    if response is None or response.status_code != 200:
        print("Error retrieving TV2 front page on {time}. Skipping...".format(time = datetime.datetime.now()))
        return

    articles = front_page_check(url = main_url, keywords = keywords, url_list = url_list)

    if articles:
        with open(data_path, 'r') as f:
            heads = json.load(f)
        with open(data_path, 'w') as f:
            json.dump(heads + articles, f)

    url_list.extend(article['article_link'] for article in articles)

    # De-duplicate while keeping the order in which URLs were first seen.
    unique_urls = list(dict.fromkeys(url_list))

    with open(urllist_path, 'w') as f:
        f.writelines(url + "\n" for url in unique_urls)

    print("TV2 front page checked on {time}. {n} new articles found.".format(time = datetime.datetime.now(), n = len(articles)))
    return

In [36]:
# Front page that is scanned for headlines.
main_url = 'https://nyheder.tv2.dk/seneste'

# Substrings to look for in the headlines.
# Alternative keyword set, currently disabled:
#keywords = ['udlænding', 'asyl', 'grænse', 'indvandr', 'udland']
keywords = ['corona', 'covid']

# Directory in which the URL list and the article JSON are kept.
datadir = "../data/"

# Run a single scan of the front page.
headline_watch(
    keywords,
    datadir,
    main_url,
)

TV2 front page checked on 2020-09-30 17:01:42.824995. 0 new articles found.


In [37]:
import pandas as pd

# Load the stored article records and preview the first rows.
articles_path = "../data/tv2_articles.json"
data = pd.read_json(articles_path)
data.head()

Unnamed: 0,uuid,article_accessed,newspaper_name,newspaper_frontpage_url,frontpage_selector,keywords_search,keywords_match,article_title,article_link,article_datetime,encounter_datetime
0,6bc42258-7f50-4d8b-96f4-87b0a0feb784,1,TV2 Nyheder,https://nyheder.tv2.dk/seneste,div.o-article_wrap.g-con.g-col.g-row_l.g-gutte...,"[corona, covid]",[corona],Jægere skal aflevere ræve og mårhunde til coro...,https://nyheder.tv2.dk/samfund/2020-09-30-jaeg...,2020-09-30T12:56:33.000Z,2020-09-30 17:00
1,925edbbf-7352-4efb-8cb6-baa0f3ceb81d,1,TV2 Nyheder,https://nyheder.tv2.dk/seneste,div.o-article_wrap.g-con.g-col.g-row_l.g-gutte...,"[corona, covid]",[corona],Michelin-restaurant får bøde for at bryde coro...,https://nyheder.tv2.dk/samfund/2020-09-30-mich...,2020-09-30T12:53:12.000Z,2020-09-30 17:00
2,02d29d5b-fa91-4f55-92c9-1d26a2bde4b1,1,TV2 Nyheder,https://nyheder.tv2.dk/seneste,div.o-article_wrap.g-con.g-col.g-row_l.g-gutte...,"[corona, covid]",[corona],Stort coronaudbrud i Flensborg efter fest i Da...,https://nyheder.tv2.dk/lokalt/2020-09-30-stort...,2020-09-30T11:57:56.000Z,2020-09-30 17:00
3,caff194f-515d-4ef5-b1a9-df9abefa4326,1,TV2 Nyheder,https://nyheder.tv2.dk/seneste,div.o-article_wrap.g-con.g-col.g-row_l.g-gutte...,"[corona, covid]",[corona],534 nye coronasmittede siden i går - TV 2,https://nyheder.tv2.dk/samfund/2020-09-30-534-...,2020-09-30T12:00:49.000Z,2020-09-30 17:00
4,8e50c9d8-e1f1-4a7d-95b0-67f6c191b1bb,1,TV2 Nyheder,https://nyheder.tv2.dk/seneste,div.o-article_wrap.g-con.g-col.g-row_l.g-gutte...,"[corona, covid]",[corona],"Nyeste corona-tal: Så mange er smittede, døde ...",https://nyheder.tv2.dk/samfund/2020-02-27-nyes...,,2020-09-30 17:00
