**Code die scraped op de site van het Vlaams Parlement. De code scraped de vragen en interpellaties in een gegeven tijdsinterval. Extract ook het thema van de vraag, de pdf-link en de link van de webfiche. Converteert het document naar TXT files. Ook de metadata wordt geplaatst in het begin van deze TXT files.**

In [1]:
import os
import re 
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from docx import Document
import PyPDF2
import io
from io import BytesIO
import pandas as pd
import docx2txt
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

######### Functions to scrape links for downloadable PDF and HTML files #########

def scrape_combined_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    combined_links = []
    articles = soup.find_all("article")

    for article in articles:
        fiche_link = article.find("a", string=re.compile("Bekijk documentenfiche", re.IGNORECASE))
        pdf_link = article.find("a", string=re.compile("Download pdf", re.IGNORECASE))
        verslag_link = article.find("a", string=re.compile("Bekijk verslag", re.IGNORECASE))

        if fiche_link:
            fiche_url = urljoin(url, fiche_link["href"])
            combined_entry = {"document_fiche": fiche_url}

            if pdf_link:
                combined_entry["pdf_link"] = urljoin(url, pdf_link["href"])
            if verslag_link:
                combined_entry["verslag_link"] = urljoin(url, verslag_link["href"])

            combined_links.append(combined_entry)

    return combined_links

########## Function to extract title and thema from the fiche ##########

def scrape_title_and_thema_from_fiche(driver, url):
    try:
        driver.get(url)
        time.sleep(3)  # Allow time for page to load

        # Extract title from the page-subtitle class
        try:
            title_element = driver.find_element(By.CSS_SELECTOR, ".page-subtitle")
            title = title_element.text.strip()
        except:
            title = "No Title Found"

        # Extract thema (topics)
        thema_elements = driver.find_elements(By.CSS_SELECTOR, "li.meeting-details__thema a")
        themas = [element.text.strip() for element in thema_elements]

    except Exception as e:
        print(f"An error occurred while scraping {url}: {e}")
        title = "Error Retrieving Title"
        themas = []

    return title, themas

########## Function to extract text from PDFs ##########

def extract_text_from_pdf(pdf_content):
    pdf_content = io.BytesIO(pdf_content)
    reader = PyPDF2.PdfReader(pdf_content)
    text = ""

    for page in range(len(reader.pages)):
        page_text = reader.pages[page].extract_text()
        if page_text:
            text += page_text + "\n"

    return text

########## Function to download and extract PDF content ##########

def download_and_convert_file(driver, url, download_dir, thema_link):
    title, thema = scrape_title_and_thema_from_fiche(driver, thema_link)
    thema_str = "\n".join(thema) + "\n"

    file_id = url.split("=")[-1]
    response = requests.get(url)
    content = response.content

    text = extract_text_from_pdf(content) if 'application/pdf' in response.headers.get('content-type', '') else ""

    text_filename = os.path.join(download_dir, f"{file_id}.txt")
    with open(text_filename, "w", encoding="utf-8") as text_file:
        text_file.write(f"title: {title}\n")
        text_file.write("thema: " + thema_str)
        text_file.write(f"pdf link: {url}\n")
        text_file.write(f"thema link: {thema_link}\n")
        text_file.write(text)

    return file_id, text

########## Function to download and extract HTML report content ##########

def download_verslag(driver, page_url, download_dir, thema_link):
    title, thema = scrape_title_and_thema_from_fiche(driver, thema_link)
    thema_str = "\n".join(thema) + "\n"

    verslag_id = page_url.split("/")[-1]
    
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, "html.parser")

    text = "\n".join([p.get_text() for p in soup.find_all('p')])

    text_filename = os.path.join(download_dir, f"{verslag_id}.txt")
    with open(text_filename, "w", encoding="utf-8") as text_file:
        text_file.write(f"title: {title}\n")
        text_file.write("thema: " + thema_str)
        text_file.write(f"verslag link: {page_url}\n")
        text_file.write(f"thema link: {thema_link}\n")
        text_file.write(text)

########## Function to iterate over pages and scrape links ##########

def iterate_scraper_over_pages(base_url, query_params):
    download_dir = "ScrapeddocumentsCorneel1202"
    os.makedirs(download_dir, exist_ok=True)

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    try:
        page_num = query_params["page"]
        while True:
            query_params["page"] = page_num
            url = base_url + "?" + "&".join([f"{k}={v}" for k, v in query_params.items()])
            print(f"Scraping page: {url}")

            combined_links = scrape_combined_links(url)

            if combined_links:
                for entry in combined_links:
                    if "pdf_link" in entry:
                        download_and_convert_file(driver, entry["pdf_link"], download_dir, entry["document_fiche"])
                    if "verslag_link" in entry:
                        download_verslag(driver, entry["verslag_link"], download_dir, entry["document_fiche"])
            else:
                print(f"No more links found on page {page_num}.")
                break

            page_num += 1
    finally:
        driver.quit()

########## Main Execution ##########

base_url = "https://www.vlaamsparlement.be/nl/parlementaire-documenten"
query_params = {
    "page": 0,
    "period": "custom",
    "start_period": "2023-03-01",
    "end_period": "2023-05-30",
    "aggregaat[]": "Vraag of interpellatie"
}

iterate_scraper_over_pages(base_url, query_params)


Scraping page: https://www.vlaamsparlement.be/nl/parlementaire-documenten?page=0&period=custom&start_period=2023-03-01&end_period=2023-05-30&aggregaat[]=Vraag of interpellatie
Scraping page: https://www.vlaamsparlement.be/nl/parlementaire-documenten?page=1&period=custom&start_period=2023-03-01&end_period=2023-05-30&aggregaat[]=Vraag of interpellatie
Scraping page: https://www.vlaamsparlement.be/nl/parlementaire-documenten?page=2&period=custom&start_period=2023-03-01&end_period=2023-05-30&aggregaat[]=Vraag of interpellatie
Scraping page: https://www.vlaamsparlement.be/nl/parlementaire-documenten?page=3&period=custom&start_period=2023-03-01&end_period=2023-05-30&aggregaat[]=Vraag of interpellatie
Scraping page: https://www.vlaamsparlement.be/nl/parlementaire-documenten?page=4&period=custom&start_period=2023-03-01&end_period=2023-05-30&aggregaat[]=Vraag of interpellatie
Scraping page: https://www.vlaamsparlement.be/nl/parlementaire-documenten?page=5&period=custom&start_period=2023-03-01&e

**Stuk hieronder is om te proberen namen van vraagsteller en ondervraagde minister mee te nemen maar werkt niet omdat ergens in de voorbije jaren de fiche van lay-out is veranderd**


In [13]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from docx import Document
import PyPDF2
import io
from io import BytesIO
import pandas as pd
import docx2txt
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

######### Function to scrape links for downloadable PDF and HTML files #########

def scrape_combined_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    combined_links = []
    articles = soup.find_all("article")

    for article in articles:
        fiche_link = article.find("a", string=re.compile("Bekijk documentenfiche", re.IGNORECASE))
        pdf_link = article.find("a", string=re.compile("Download pdf", re.IGNORECASE))
        verslag_link = article.find("a", string=re.compile("Bekijk verslag", re.IGNORECASE))

        if fiche_link:
            fiche_url = urljoin(url, fiche_link["href"])
            combined_entry = {"document_fiche": fiche_url}

            if pdf_link:
                combined_entry["pdf_link"] = urljoin(url, pdf_link["href"])
            if verslag_link:
                combined_entry["verslag_link"] = urljoin(url, verslag_link["href"])

            combined_links.append(combined_entry)

    return combined_links

########## Function to extract title, thema, vraagsteller, and ondervraagde minister ##########

def scrape_fiche_details(driver, url):
    try:
        driver.get(url)
        time.sleep(3)  # Allow time for page to load

        # Extract title
        try:
            title_element = driver.find_element(By.CSS_SELECTOR, ".page-subtitle")
            title = title_element.text.strip()
        except:
            title = "No Title Found"

        # Extract thema
        thema_elements = driver.find_elements(By.CSS_SELECTOR, "li.meeting-details__thema a")
        themas = [element.text.strip() for element in thema_elements]

        # Extract Vraagsteller and Ondervraagde Minister
        vraagsteller, minister = "Unknown", "Unknown"
        labels = driver.find_elements(By.CSS_SELECTOR, ".meeting-details__label")
        values = driver.find_elements(By.CSS_SELECTOR, ".meeting-details__value")

        for label, value in zip(labels, values):
            label_text = label.text.strip()
            value_text = value.text.strip()

            if label_text == "Vraagsteller":
                vraagsteller = value_text
            elif label_text == "Ondervraagde minister":
                minister = value_text

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        title, themas, vraagsteller, minister = "Error", [], "Unknown", "Unknown"

    return title, themas, vraagsteller, minister

########## Function to extract text from PDFs ##########

def extract_text_from_pdf(pdf_content):
    pdf_content = io.BytesIO(pdf_content)
    reader = PyPDF2.PdfReader(pdf_content)
    text = ""

    for page in range(len(reader.pages)):
        page_text = reader.pages[page].extract_text()
        if page_text:
            text += page_text + "\n"

    return text

########## Function to download and extract PDF content ##########

def download_and_convert_file(driver, url, download_dir, thema_link):
    title, thema, vraagsteller, minister = scrape_fiche_details(driver, thema_link)
    thema_str = "\n".join(thema) + "\n"

    file_id = url.split("=")[-1]
    response = requests.get(url)
    content = response.content

    text = extract_text_from_pdf(content) if 'application/pdf' in response.headers.get('content-type', '') else ""

    text_filename = os.path.join(download_dir, f"{file_id}.txt")
    with open(text_filename, "w", encoding="utf-8") as text_file:
        text_file.write(f"title: {title}\n")
        text_file.write("thema: " + thema_str)
        text_file.write(f"pdf link: {url}\n")
        text_file.write(f"thema link: {thema_link}\n")
        text_file.write(f"Vraagsteller: {vraagsteller}\n")
        text_file.write(f"Ondervraagde minister: {minister}\n")
        text_file.write(text)

    return file_id, text

########## Function to download and extract HTML report content ##########

def download_verslag(driver, page_url, download_dir, thema_link):
    title, thema, vraagsteller, minister = scrape_fiche_details(driver, thema_link)
    thema_str = "\n".join(thema) + "\n"

    verslag_id = page_url.split("/")[-1]
    
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, "html.parser")

    text = "\n".join([p.get_text() for p in soup.find_all('p')])

    text_filename = os.path.join(download_dir, f"{verslag_id}.txt")
    with open(text_filename, "w", encoding="utf-8") as text_file:
        text_file.write(f"title: {title}\n")
        text_file.write("thema: " + thema_str)
        text_file.write(f"verslag link: {page_url}\n")
        text_file.write(f"thema link: {thema_link}\n")
        text_file.write(f"Vraagsteller: {vraagsteller}\n")
        text_file.write(f"Ondervraagde minister: {minister}\n")
        text_file.write(text)

########## Function to iterate over pages and scrape links ##########

def iterate_scraper_over_pages(base_url, query_params):
    download_dir = "ScrapeddocumentsCorneel1202"
    os.makedirs(download_dir, exist_ok=True)

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    try:
        page_num = query_params["page"]
        while True:
            query_params["page"] = page_num
            url = base_url + "?" + "&".join([f"{k}={v}" for k, v in query_params.items()])
            print(f"Scraping page: {url}")

            combined_links = scrape_combined_links(url)

            if combined_links:
                for entry in combined_links:
                    if "pdf_link" in entry:
                        download_and_convert_file(driver, entry["pdf_link"], download_dir, entry["document_fiche"])
                    if "verslag_link" in entry:
                        download_verslag(driver, entry["verslag_link"], download_dir, entry["document_fiche"])
            else:
                print(f"No more links found on page {page_num}.")
                break

            page_num += 1
    finally:
        driver.quit()

########## Main Execution ##########

base_url = "https://www.vlaamsparlement.be/nl/parlementaire-documenten"
query_params = {
    "page": 0,
    "period": "custom",
    "start_period": "2024-06-01",
    "end_period": "2024-11-30",
    "aggregaat[]": "Vraag of interpellatie"
}

iterate_scraper_over_pages(base_url, query_params)


Scraping page: https://www.vlaamsparlement.be/nl/parlementaire-documenten?page=0&period=custom&start_period=2024-06-01&end_period=2024-11-30&aggregaat[]=Vraag of interpellatie


KeyboardInterrupt: 