In [5]:
import requests # Downloading webpages
from bs4 import BeautifulSoup # Extracting data from html files
from urllib.parse import urljoin # Cleaning URLS
import json # Parsing JSON
from time import sleep # Time management
#from tqdm.notebook import tqdm 
from tqdm import tqdm # Processbar
from pathlib import Path # Working with files
import random # Generating pseudo random numbers
import pkg_resources # For citing modules

Extract a list of all constituencies

In [2]:
# Landing page that links to the page of every constituency.
stemmeseddel_url = 'https://www.dr.dk/nyheder/politik/folketingsvalg/din-stemmeseddel'
# A timeout keeps the cell from hanging forever if the server stops responding.
stemmeseddel_req = requests.get(stemmeseddel_url, timeout=30)
# Fail here on a 4xx/5xx answer instead of silently parsing an error page.
stemmeseddel_req.raise_for_status()
# Name the parser explicitly; otherwise bs4 picks whichever parser happens to be
# installed (and warns), so results could differ between environments.
stemmeseddel_soup = BeautifulSoup(stemmeseddel_req.text, 'html.parser') # .text is the HTML source of the response

Extract links to each constituency

In [5]:
# Anchor tags ('a') that point at the individual constituency pages. Inspecting the
# page shows that the links of interest all carry the class 'AccordionGrid_link__cGkec'.
kreds_urls_ugly = stemmeseddel_soup.find_all('a', class_='AccordionGrid_link__cGkec')
# Resolve every href against the landing-page URL to get a clean list of constituency links.
kreds_urls = [urljoin(stemmeseddel_url, link['href']) for link in kreds_urls_ugly]

Define a function that extracts the URL key of every candidate in a constituency

In [4]:
def kreds_kandidater(kreds_url, timeout=30):
    """Return the URL keys of all candidates listed on a constituency page.

    The page is downloaded and the JSON payload embedded in its
    ``<script id="__NEXT_DATA__">`` tag is parsed. Candidates are read from
    ``props.pageProps.smallConstituencyCandidatesByPartyCode``, which is
    iterated party by party.

    Parameters
    ----------
    kreds_url : str
        URL of the constituency page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list
        The ``urlKey`` of every candidate, in page order (party by party).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page contains no ``__NEXT_DATA__`` script tag.
    """
    req = requests.get(kreds_url, timeout=timeout) # get webpage for the constituency
    req.raise_for_status() # do not try to parse an error page
    soup = BeautifulSoup(req.text, 'html.parser') # explicit parser: same result in every environment

    next_data = soup.find('script', id='__NEXT_DATA__') # the tag holding the page data as JSON
    if next_data is None:
        # Without this check the failure would be an opaque AttributeError on None.
        raise ValueError(f'No __NEXT_DATA__ script tag found on {kreds_url}')
    kandidater_json = json.loads(next_data.text) # parse the embedded JSON
    kandidater_data = kandidater_json['props']['pageProps']['smallConstituencyCandidatesByPartyCode'] # list of candidates by party

    # Each entry of kandidater_data holds the candidates of one party in the
    # constituency; the party's data is taken from the first value of that entry.
    # (Assumes each entry is a dict with the party as its only key - TODO confirm.)
    return [
        kandidat['urlKey'] # the URL key identifying the candidate
        for parti in kandidater_data # go through all parties in the constituency
        for kandidat in list(parti.values())[0]['candidates'] # every candidate in the party
    ]

In [5]:
# Gather the candidate URL keys of all constituencies into one flat list.
kandidat_urls = []
for kreds_url in tqdm(kreds_urls): # one request per constituency, with a progress bar
    kandidat_urls.extend(kreds_kandidater(kreds_url)) # append this constituency's candidates

100%|███████████████████████████████████████████| 92/92 [03:04<00:00,  2.01s/it]


Saving the URLs for every candidate

In [6]:
# Save the unique candidate URL keys to a text file, one per line.
urlfile = Path('data/kandidat_urls')
urlfile.parent.mkdir(parents=True, exist_ok=True) # opening with 'w' does not create missing directories
with urlfile.open('w', encoding='utf-8') as f: # 'with' closes the file automatically; 'w' opens it for writing
    # set() removes duplicates; sorted() keeps the line order stable from run to run
    for url in sorted(set(kandidat_urls)):
        f.write(url + '\n') # one candidate per line

Scraping data for each candidate

In [7]:
# Common first part of every candidate page URL; a candidate's URL key is appended to it.
kandidat_baseurl = 'https://www.dr.dk/nyheder/politik/folketingsvalg/din-stemmeseddel/kandidater/'

In [8]:
# Download the embedded page data of every candidate and store it as one JSON file
# per candidate. Files that already exist are skipped, so an interrupted run can
# simply be started again.
for kandidat_url in tqdm(set(kandidat_urls)):
    kandidat_fil = Path(f'data/kandidater/{kandidat_url}.json') # file path to save the candidate info in
    if kandidat_fil.exists():
        continue # already downloaded, skip the rest of the loop
    kandidat_fil.parent.mkdir(parents=True, exist_ok=True) # write_text does not create missing directories

    req = requests.get(kandidat_baseurl + kandidat_url, timeout=30) # get the page; never hang forever
    # Stop on a 4xx/5xx answer: an error page must not be saved, because the
    # exists() check above would then skip this candidate on every later run.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, 'html.parser') # explicit parser: same result in every environment
    next_data = soup.find('script', id='__NEXT_DATA__') # the tag holding the page data
    if next_data is None:
        # Without this check the failure would be an opaque AttributeError on None.
        raise ValueError(f'No __NEXT_DATA__ script tag found for candidate {kandidat_url}')
    kandidat_data = next_data.text # the content of the tag
    kandidat_fil.write_text(kandidat_data, encoding='utf-8') # fixed encoding instead of the platform default
    sleep(random.random() * 8) # random pause of up to 8 seconds between requests to go easy on the server

100%|███████████████████████████████████████| 1014/1014 [16:24<00:00,  1.03it/s]
