## Politiken forside scraper

https://askubuntu.com/questions/396654/how-to-run-a-python-program-in-the-background-even-after-closing-the-terminal

https://www.programiz.com/python-programming/json

**Krav:**
- Scanne Politikens forside med jævne mellemrum
- Scanne forside-titler for indhold af bestemte nøgleord
- Lagre information om artikler, der indeholder disse nøgleord
- Tilføje til JSON
- Kontrol af, om artiklen allerede er lagret
- Skal forvente timeouts
- Evt. mailvarsel

**Udfordring:**
- Varsling hvis script kører død?

**JSON-opbygning:**
- entry[id]
    - newspaper_name: ...
    - newspaper_frontpage_url: ...
    - frontpage_selector: ...
    - keywords_search: [..., ..., ...]
    - keywords_match: [...]
    - article_title: ...
    - article_link: ...
    - article_html: ...
    - article_datetime: ...
    - encounter_datetime: ...

In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np

import scrapy
import requests
from scrapy import Selector

import os
import sys
import datetime

import re
import time
import uuid
from random import randint
from itertools import compress
import json

import pprint
pp = pprint.PrettyPrinter()

In [2]:
#define functions
def keyword_check(keywords, headline):
    """Tell whether a headline mentions at least one of the keywords.

    Parameters
    ----------
    keywords : iterable of str
        Substrings to look for. The headline text is lower-cased before the
        comparison, the keywords are used as given.
    headline : selector-like object
        Must support ``css(" ::text").getall()`` and return the text
        fragments of the headline as a list of strings.

    Returns
    -------
    bool
        True if any keyword occurs in the space-joined, lower-cased text.
    """
    fragments = headline.css(" ::text").getall()
    lowered = ' '.join(fragments).lower()

    for word in keywords:
        if word in lowered:
            return True
    return False

def get_article_info(link, retries=3, timeout=30):
    """Download one article and return a metadata record for it.

    The page is requested up to ``retries`` times, with a random pause of
    2-5 seconds before every attempt. Each attempt performs a single
    request; connection errors, timeouts and non-200 answers all count as a
    failed attempt.

    Parameters
    ----------
    link : str
        URL of the article.
    retries : int, optional
        Maximum number of download attempts (default 3).
    timeout : float, optional
        Seconds to wait for the server before giving up on one attempt
        (default 30).

    Returns
    -------
    dict
        Record with the keys ``uuid``, ``article_accessed``,
        ``newspaper_name``, ``newspaper_frontpage_url``,
        ``frontpage_selector``, ``keywords_search``, ``keywords_match``,
        ``article_title``, ``article_link``, ``article_datetime`` and
        ``encounter_datetime``. ``article_accessed`` is 1 when the page was
        downloaded and 0 when every attempt failed; in the latter case the
        title, datetime and match fields are empty strings.

    Notes
    -----
    Reads the module-level ``keywords`` list; it is not passed in.
    """
    art_uuid = str(uuid.uuid4())
    encounter_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

    def build_record(accessed, matches, title, published):
        # single place that defines the record layout for success and failure
        return {
            'uuid': art_uuid,
            'article_accessed': accessed,
            'newspaper_name': 'Politiken',
            'newspaper_frontpage_url': 'https://politiken.dk/',
            'frontpage_selector': "section.frontpage__section",
            'keywords_search': keywords,
            'keywords_match': matches,
            'article_title': title,
            'article_link': link,
            'article_datetime': published,
            'encounter_datetime': encounter_time,
        }

    datetime_xpath = '//meta[contains(@property,"article:published_time")]/@content'

    for _ in range(retries):
        # pause before every attempt so the site is not hit in quick succession
        time.sleep(randint(2, 5))

        try:
            response = requests.get(link, timeout=timeout)
        except requests.RequestException:
            # network error or timeout: treat like any other failed attempt
            continue

        if response.status_code != 200:
            continue

        # reuse the body of the response that was just checked instead of
        # downloading the page a second time
        sel = Selector(text=response.content)

        article_title = sel.css("title ::text").get()
        article_datetime = sel.xpath(datetime_xpath).get()

        # a page without a <title> yields None; match against '' in that case
        title_lower = (article_title or '').lower()
        matches = [keyword for keyword in keywords if keyword in title_lower]

        return build_record(1, matches, article_title, article_datetime)

    # every attempt failed (or retries <= 0): return a placeholder record
    return build_record(0, '', '', '')

def front_page_check(url, keywords):
    """Scan a front page and collect records for new keyword-matching articles.

    Parameters
    ----------
    url : str
        Address of the front page to download.
    keywords : iterable of str
        Keywords passed on to ``keyword_check`` for every headline.

    Returns
    -------
    list of dict
        One record from ``get_article_info`` per matching article link that
        is not yet in the module-level ``url_list``. Empty when the front
        page could not be downloaded or nothing new matched.

    Notes
    -----
    Reads the module-level ``url_list`` but does not modify it.
    """
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        # an unreachable front page means "nothing found in this scan"
        print('Could not fetch front page:', err)
        return []

    sel = Selector(text=response.content)

    #selector of top frontpage content
    front_page = sel.css("section.frontpage__section")

    #get headline selectors
    headlines = front_page.css("h2")

    #get links from the headlines that match a keyword
    links_ext = []
    for headline in headlines:
        if not keyword_check(keywords, headline):
            continue
        link = headline.css("a::attr(href)").get()
        if link:
            links_ext.append(link)

    #get article info, once per link: the same article can sit under several
    #matching headlines, and already stored links are skipped
    seen = set(url_list)
    articles = []
    for link in links_ext:
        if link in seen:
            continue
        seen.add(link)
        articles.append(get_article_info(link))

    return articles

In [4]:
#set keywords
#alternative keyword set: ['udlænding', 'asyl', 'grænse', 'indvandr', 'udland']
keywords = ['corona']

#number of front-page scans to run before this cell finishes
i = 2

#NOTE: the JSON path uses Windows-style separators, like the cell that reads it back
articles_path = '..\\data\\politiken_articles.json'
urls_path = '../data/urls/politiken_article_urls.txt'


def _load_article_records(path):
    """Return every article record stored in *path* as one flat list.

    A missing or empty file gives an empty list. Files that contain several
    JSON arrays written back to back (the result of earlier append-mode
    dumps) are read as well and merged.
    """
    if not os.path.exists(path):
        return []

    with open(path, 'r', encoding='utf-8') as fh:
        raw = fh.read()

    decoder = json.JSONDecoder()
    records = []
    pos = 0
    while True:
        # raw_decode does not accept leading whitespace, so skip it manually
        while pos < len(raw) and raw[pos].isspace():
            pos += 1
        if pos >= len(raw):
            break
        value, pos = decoder.raw_decode(raw, pos)
        if isinstance(value, list):
            records.extend(value)
        else:
            records.append(value)
    return records


#load the URLs of articles stored by earlier runs (none if the file is missing)
url_list = list()

if os.path.exists(urls_path):
    with open(urls_path, 'r') as f:
        for line in f:
            if line.strip():
                url_list.append(line.strip())

while i > 0:
    #start every scan with an empty result so a failed scan reports 0
    articles = list()

    try:
        response = requests.get('https://politiken.dk/', timeout=30)
        if response.status_code == 200:
            articles = front_page_check(url = 'https://politiken.dk/', keywords = keywords)
    except requests.RequestException as err:
        #timeouts and connection errors are expected; skip this scan
        print('Front page scan failed:', err)

    if articles:
        #keep the file a single valid JSON array: merge with what is stored
        #and rewrite it, going through a temporary file so a crash during
        #the write cannot truncate the existing data
        stored = _load_article_records(articles_path)
        stored.extend(articles)
        tmp_path = articles_path + '.tmp'
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(stored, file)
        os.replace(tmp_path, articles_path)

        for article in articles:
            url_list.append(article['article_link'])

    print(datetime.datetime.now())
    print(len(articles))

    i = i - 1
    #wait 41-62 minutes between scans, but not after the last one
    if i > 0:
        time_out = randint(41*60, 62*60)
        time.sleep(time_out)

#drop duplicates while keeping first-seen order
url_list = list(dict.fromkeys(url_list))

#rewrite the file: url_list already contains the previously stored URLs, so
#appending would duplicate them on every run
with open(urls_path, 'w') as f:
    for url in url_list:
        f.write(url + "\n")

2020-03-26 11:31:24.786353
14
2020-03-26 11:31:30.623717
0


In [None]:
#number of records currently held in `articles` (assigned by the scan loop cell)
print(len(articles))

In [None]:
#pretty-print the title of every record in `articles`
for article in articles:
    pp.pprint(article['article_title'])

In [None]:
#show the distinct URLs in `url_list`; the result is only displayed, not stored
list(set(url_list))

In [14]:
#load the stored article records of both newspapers into DataFrames
#NOTE(review): the paths use Windows-style separators - confirm the target platform
poldata = pd.read_json('..\\data\\politiken_articles.json')
berdata = pd.read_json('..\\data\\berlingske_articles.json')

In [20]:
#stack both tables into one with a fresh 0..n-1 index; pd.concat replaces
#DataFrame.append, which was removed in pandas 2.0
data = pd.concat([poldata, berdata], sort = False, ignore_index = True)

In [21]:
#display the combined DataFrame
data

Unnamed: 0,uuid,article_accessed,newspaper_name,newspaper_frontpage_url,frontpage_selector,keywords_search,keywords_match,article_title,article_link,article_datetime,encounter_datetime,frontpage_title
0,f2265dfa-e2bf-40ff-848e-fe787a056e19,1,Politiken,https://politiken.dk/,section.frontpage__section,"[udlænding, asyl, grænse, indvandr, udland]",[udland],Tidligere britisk ambassadør i Danmark: Danmar...,https://politiken.dk/debat/debatindlaeg/art772...,2020-03-24T19:28:22+01:00,2020-03-25 11:27,
1,52f883ba-420a-4532-bfa8-f48799336ccc,1,Politiken,https://politiken.dk/,section.frontpage__section,"[udlænding, asyl, grænse, indvandr, udland]",[udlænding],R og EL vil have Tesfaye til at lave 'corona-l...,https://politiken.dk/indland/art7723746/R-og-E...,2020-03-25T11:50:12+01:00,2020-03-25 12:24,
2,438d3274-4309-47f7-a4ec-e8090d713f85,1,Berlingske,https://www.berlingske.dk/,div.front.theme-berlingske,"[udlænding, asyl, grænse, indvandr, udland]",[grænse],Økonom: Der er intet alternativ til at sætte e...,https://www.berlingske.dk/kronikker/oekonom-de...,2020-03-25T08:00:00+01:00,2020-03-25 11:27,"økonom: jeg ved, det er upopulært, men vi må ..."


In [22]:
#export the combined table to CSV; with the default settings the DataFrame
#index is written as an unnamed first column
data.to_csv('..\\data\\newspaper_articles.csv')