# Scraping the AVB (Archives de la Ville de Bruxelles) website for PDFs of the municipal bulletins

In [2]:
import os
import re
import time
import sys

import requests

## Fonctions

In [8]:
def get_urls(timeout=30):
    """Retrieve all bulletin PDF URLs from the root AVB page.

    Fetches the "bulletins by date" index page, prints the HTTP status code,
    the response encoding and the length of the page text, then extracts
    every link pointing to a PDF in the bulletins document directory.

    Args:
        timeout: Number of seconds passed to ``requests.get`` as its
            ``timeout`` so that a stalled connection cannot block forever.

    Returns:
        list[str]: The PDF URLs, in the order they appear in the page.
    """
    root_url = "https://archives.bruxelles.be/bulletins/date"
    resp = requests.get(root_url, timeout=timeout)
    print(f"Status: {resp.status_code}")
    print(f"Encoding: {resp.encoding}")
    html = resp.text
    print(f"Text length: {len(html)}")

    # The dots of the host name are escaped so they only match a literal ".",
    # and the lazy ".*?" stops at the first ".pdf": two links that share a
    # line are returned as two URLs instead of one merged, unusable match.
    pattern = r"https://archief\.brussel\.be/Colossus/BulletinsCommunaux/Bulletins/Documents/.*?\.pdf"
    urls = re.findall(pattern, html)
    print(f"{len(urls)} PDF files found")
    return urls

In [15]:
def download(pdf_path, urls, offset=0, timeout=60):
    """Download the files listed in ``urls`` into ``pdf_path``, starting at ``offset``.

    Each file is saved under the last path segment of its URL. A response
    with an HTTP error status is reported and skipped instead of being
    written to disk, so an error page never ends up stored as a ``.pdf``.

    Args:
        pdf_path: Existing directory in which the files are written.
        urls: Sequence of URLs to download.
        offset: Index of the first URL to process; earlier entries are
            skipped (useful to resume an interrupted run).
        timeout: Number of seconds passed to ``requests.get`` as its
            ``timeout`` so that a stalled connection cannot block forever.

    Returns:
        None. Progress is printed for every file.
    """
    for url in urls[offset:]:
        filename = url.split("/")[-1]
        print(f"Dowloading {filename}...")
        start_time = time.time()
        response = requests.get(url, timeout=timeout)
        elapsed = time.time() - start_time
        if not response.ok:
            # Do not save the body of a 4xx/5xx answer: the file would look
            # present on disk while actually containing an error page.
            print(f"   failed with HTTP status {response.status_code}, skipped")
            continue
        print(f"   done in {elapsed:.1f} seconds")
        with open(os.path.join(pdf_path, filename), 'wb') as f:
            f.write(response.content)

In [17]:
def check(pdf_path, urls):
    """Check that every file referenced by ``urls`` is present in ``pdf_path``.

    The expected file name of a URL is its last path segment. One line is
    printed for each missing file, followed by a summary line giving the
    number of files found out of the number of URLs.

    Args:
        pdf_path: Directory that should contain the downloaded files.
        urls: Sequence of URLs whose files are expected in ``pdf_path``.

    Returns:
        None. The result is only printed.
    """
    # List the directory once and keep the names in a set, instead of
    # re-reading the directory for every URL. With no URL there is nothing
    # to look up, so the directory is not read at all in that case.
    downloads = set(os.listdir(pdf_path)) if urls else set()
    ok_count = 0
    for url in urls:
        filename = url.split("/")[-1]
        if filename not in downloads:
            print(f"{filename} is missing!")
        else:
            ok_count += 1
    print(f"{ok_count} PDFs found on {len(urls)}!")

## Obtenir les URLs des bulletins communaux 

In [9]:
# Fetch the index page once and keep the list of PDF URLs for the cells below.
urls = get_urls()

Status: 200
Encoding: utf-8
Text length: 821431
2833 PDF files found


In [12]:
# Directory where the PDFs are saved (relative path).
pdf_path = '../data/pdf'

In [13]:
from pathlib import Path

# Create the target directory, including missing parents; do nothing if it already exists.
Path(pdf_path).mkdir(parents=True, exist_ok=True)

## Télécharger les 3 premiers PDFs

In [16]:
%%time
# Only the first three URLs are passed, as a quick trial run of download().
download(pdf_path, urls[:3])

Dowloading Bxl_1847_Tome_I1_Part_1.pdf...
   done in 15.5 seconds
Dowloading Bxl_1847_Tome_I1_Part_2.pdf...
   done in 14.6 seconds
Dowloading Bxl_1847_Tome_I1_Part_3.pdf...
   done in 12.4 seconds
CPU times: user 1.83 s, sys: 668 ms, total: 2.5 s
Wall time: 42.6 s


## Vérifier si les 3 premiers PDFs ont été téléchargés

In [18]:
# Confirm that the files for the first three URLs are present in pdf_path.
check(pdf_path, urls[:3])

3 PDFs found on 3!
