In [1]:
# Scrape omgevingsvisie documents from mijnomgevingsvisie.nl
import os
import requests
from bs4 import BeautifulSoup

In [69]:
# Landing page of the site hosting omgevingsvisies (i.e. policy documents from
# Dutch municipalities); the links to the municipality pages are read from it.
url = 'https://mijnomgevingsvisie.nl/omgevingsvisie'

In [70]:
# Request content of main page.
# The timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, 'html.parser')

In [72]:
# Links to the individual municipality pages: take every anchor tag with
# class 'dp-dfg-image-link' that has an href and keep only the target URL.
municipality_divs = soup.find_all('a', href=True, class_='dp-dfg-image-link')
municipality_links = [anchor['href'] for anchor in municipality_divs]

In [75]:
# Map every municipality name (text of the 'entry-title' elements) to the
# download link of its omgevingsvisie; every link starts out as ''.
downloads = dict.fromkeys((title.text for title in soup.find_all(class_='entry-title')), '')

In [20]:
# Names of all municipalities, in the insertion order of `downloads`
municipalities = [*downloads]

In [56]:
# Get download links from the individual municipality pages.
# NOTE: pairing by position assumes `municipality_links` and `municipalities`
# are listed in the same order on the main page -- verify when results look off.
for link, municipality in zip(municipality_links, municipalities):
    print(f'Getting link for {municipality}...')

    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        # One unreachable page should not abort the whole scrape
        print(f'  Could not load page for {municipality}: {error}')
        continue

    # Use a separate name so the main-page `soup` (and `url`) stay intact
    page_soup = BeautifulSoup(response.content, 'html.parser')
    found = 0

    for anchor in page_soup.find_all('a', href=True):

        # An anchor whose text mentions the municipality is taken to be the
        # download link; when several anchors match, the last one wins.
        if municipality in anchor.text:
            downloads[municipality] = anchor['href']
            found += 1

    if found == 0:
        # These municipalities need a manually retrieved link
        print(f'  No link found for {municipality}')

Getting link for Tubbergen...
Getting link for Dinkelland...
Getting link for Oldebroek...
Getting link for Raalte...
Getting link for Uitgeest...
Getting link for Voorst...
Getting link for Maastricht...
Getting link for Overbetuwe...
Getting link for Nijmegen...
Getting link for Westvoorne...
Getting link for Gieten...
Getting link for Almelo...
Getting link for Waterland...
Getting link for Oldambt...
Getting link for Staphorst...
Getting link for Brielle...
Getting link for Voerendaal...
Getting link for Winterswijk...
Getting link for Ede...
Getting link for Eindhoven...
Getting link for Terneuzen, Sas van Gent...
Getting link for Den Helder, Julianadorp...
Getting link for Hoeksche Waard...
Getting link for Oisterwijk...
Getting link for Tilburg...
Getting link for Baarle-Nassau...
Getting link for Weststellingwerf...
Getting link for Nunspeet...
Getting link for Oirschot...
Getting link for Groningen...
Getting link for Hellendoorn...
Getting link for Opsterland...
Getting link 

In [85]:
# Download links looked up by hand for municipalities the scraper found no link for.
# Bunnik, Houten and Wijk bij Duurstede all point to the same document.
_kromme_rijn_pdf = 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/05/170323-Omgevingsvisie-Kromme-Rijngebied-Bunnik-Houten-Wijk-bij-Duurstede-small-1.pdf'

manually_retrieved = {
    'Terneuzen, Sas van Gent': 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/07/20181213-omgevingsvisie-Injesasinsas-definitief.pdf',
    'Den Helder, Julianadorp': 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/07/191216-Den-Helder-Omgevingsvisie-Julianadorp.pdf',
    'Tilburg': 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/05/150900-Tilburg-Omgevingsvisie.pdf',
    'Oirschot': 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/05/170926-Oirschot-Omgevingsvisie-1.pdf',
    'Groningen': 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/05/180700-Groningen-Omgevingsvisie-The-next-city-Groningse-leefkwaliteit-voorop.pdf',
    'Opsterland': 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/05/150911-Opsterland-Omgevingsvisie-2015-2030.pdf',
    'Sittard-Geleen': 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/05/161215-Sittard-Geleen-Omgevingsvisie-2016-deel-1-en-2.pdf',
    'Alkmaar': 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/06/171005-Alkmaar-Omgevingsvisie-Alkmaar-2040.pdf',
    'Hollands Kroon': 'https://omgevingsvisie.hollandskroon.nl/omgevingsvisie-duplicate-1',
    'Bergen (L)': 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/06/190400-Bergen-L-Omgevingsvisie-Bergen-2030.pdf',
    'Bunnik': _kromme_rijn_pdf,
    'Houten': _kromme_rijn_pdf,
    'Wijk bij Duurstede': _kromme_rijn_pdf,
    'Leusden': 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/05/180201-Leusden-Omgevingsvisie-stedelijk-gebied.pdf',
}

In [94]:
downloaded = []

# Folder the PDFs are written to; created up front so open() cannot fail on a
# missing directory.
output_dir = '../visie-documents/pdf'
os.makedirs(output_dir, exist_ok=True)

# Loop over all download links and store the PDFs in `output_dir`

for plaats, link in downloads.items():

    # Resolve the link first, so the checks below see the link that is used.
    if plaats == 'Leiden':
        # Manual retrieval for Leiden (always replaces the scraped link)
        link = 'https://mijnomgevingsvisie.nl/wp-content/uploads/2020/07/190529-Leiden-omgevingsvisie-1.0-definitief.pdf'
    elif link == '':
        # Fall back to a manually retrieved link when the scraper found none
        link = manually_retrieved.get(plaats, '')

    # Ignore municipalities for which no link is known at all
    if link == '':
        continue

    # Avoid downloading duplicate documents
    if link in downloaded:
        continue

    # Only include PDF documents (extension compared case-insensitively)
    if not link.lower().endswith('pdf'):
        continue

    # ',' is not allowed in file names; str.replace returns a new string, so
    # the result has to be kept.
    file_name = plaats.replace(',', '') + '.pdf'

    # Retrieve the document; skip it on failure rather than saving an error
    # page under a .pdf name or aborting the remaining downloads.
    try:
        response = requests.get(link, timeout=60)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f'Could not download document for {plaats}: {error}')
        continue

    # Write document to designated folder
    with open(os.path.join(output_dir, file_name), 'wb') as file:
        file.write(response.content)

    downloaded.append(link)