# Ophalen aantal vogeltellingen per jaar

Vermits er jaar na jaar meer vogelwaarnemingen worden gelogd, is het belangrijk om ook het totale aantal vogelwaarnemingen per jaar op te halen, zodat we de groei van elke soort in kaart kunnen brengen t.o.v. het totale aantal vogelwaarnemingen.

In [1]:
import bs4
import requests
import pandas as pd
from datetime import datetime
import time
import os
import re



# URL template for the waarnemingen.be species overview page. The `{}`
# placeholder in `filter_year={}` is filled with the year via str.format
# before each request.
# NOTE(review): species_group_id=1 is presumably the bird species group — confirm.

base_url = 'https://waarnemingen.be/species/?species_group_id=1&filter_month=&filter_year={}&include_exotic_and_extinct=on&own_species=all+species' 

# Function to parse a single page
def parse_page(year, retries=10, backoff_factor=2, timeout=30):
    """Fetch the species overview page for one year and extract its totals.

    The page summary is expected to contain a sentence of the form
    "X waarnemingen van Y individuen", where X and Y use '.' as the
    thousands separator.

    Args:
        year: Year substituted into the ``base_url`` template.
        retries: Maximum number of request attempts.
        backoff_factor: Base of the exponential back-off; the wait after a
            failed attempt is ``backoff_factor * 2 ** attempt`` seconds.
        timeout: Seconds to wait for the server before a request attempt is
            considered failed.

    Returns:
        A dict with the keys ``year``, ``observation_count`` and
        ``individu_count`` (counts as ints), or ``None`` when ``retries`` is
        zero or negative and no request is made.

    Raises:
        requests.exceptions.RequestException: If the last attempt still fails.
        ValueError: If the summary block or the expected sentence is missing.
    """
    # NOTE(review): the cookie below contains a hard-coded session id and CSRF
    # token. These are credentials and should not live in source control;
    # consider loading them from the environment or dropping the header.
    # NOTE(review): advertising "br"/"zstd" presumably relies on optional
    # decoder packages being installed for `requests` to decode such
    # responses — confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-encoding":"gzip, deflate, br, zstd",
        "accept-language":"nl-BE,nl;q=0.9,en-BE;q=0.8,en;q=0.7,nl-NL;q=0.6,en-US;q=0.5",
        "connection":"keep-alive",
        "cookie":"csrftoken=3JbFPYJyRC9GxhkNoW4XzF1vbbG6Fbxe; sessionid=v132os9mxwltj3ol3plhmojrjch24m9o; fundraiser_dismissed=1; cookielaw_accepted=1",
        "host":"waarnemingen.be",
        "Referer": "https://www.google.com/",
        "sec-ch-ua":'"Not A(Brand";v="8", "Chromium";v="132", "Google Chrome";v="132"',
        "sec-ch-ua-mobile":"?0",
        "sec-ch-ua-platform":"Linux",
        "sec-fetch-dest":"document",
        "sec-fetch-mode":"navigate",
        "sec-fetch-site":"same-origin",
        "sec-fetch-user":"?1",
        "upgrade-insecure-requests":"1",
    }

    for attempt in range(retries):
        # Only the network call sits inside the try, so parsing errors are
        # never mistaken for transient request failures and retried.
        try:
            res = requests.get(base_url.format(year), headers=headers, timeout=timeout)
            res.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Covers HTTP error statuses as well as connection errors and
            # timeouts, all of which may be transient.
            print(f"Request error: {e} on attempt {attempt + 1}/{retries}")
            if attempt < retries - 1:
                time.sleep(backoff_factor * (2 ** attempt))
                continue
            raise

        soup = bs4.BeautifulSoup(res.text, 'html.parser')

        callout = soup.select_one('.callout.callout-primary')
        if callout is None:
            raise ValueError(f"No summary block found on the page for year {year}")
        year_stats = callout.get_text(strip=True)
        print(f'year stats: {year_stats}')

        # Regular expression to find "X waarnemingen van Y individuen"
        matches = re.search(r'([\d\.]+) waarnemingen van ([\d\.]+) individuen', year_stats)
        if matches is None:
            raise ValueError(
                f"Could not find observation totals for year {year} in: {year_stats!r}"
            )

        observation_count = matches[1]
        individu_count = matches[2]

        print(f'year observation_count: {observation_count}')
        print(f'year individu_count: {individu_count}')

        # '.' is the thousands separator on the page; strip it before int().
        year_observations = {
            "year": year,
            "observation_count": int(observation_count.replace('.', '')),
            "individu_count": int(individu_count.replace('.', '')),
        }

        print(year_observations)
        return year_observations

    # Only reached when `retries` <= 0: no attempt was made.
    return None

def scrape(year_start = 1971, year_end = 2024):
    """Scrape the yearly totals for a range of years into a CSV file.

    Each year's result is appended to a working CSV as soon as it is
    available, so an interrupted run keeps the years already scraped. Scraping
    stops at the first year that raises an error. Afterwards the working file
    is renamed so that its name carries the range of years actually scraped.

    Args:
        year_start: First year to scrape (inclusive).
        year_end: Last year to scrape (inclusive).

    Returns:
        A DataFrame read back from the CSV. When nothing was written, an empty
        DataFrame with the columns ``year``, ``observation_count`` and
        ``individu_count`` is returned instead.
    """
    file_name = make_filename()

    # The file is opened in append mode, so the header must be written exactly
    # when the file is new or empty, whatever the starting year is. Without a
    # header, read_csv below would consume the first data row as column names.
    first_write = not os.path.exists(file_name) or os.path.getsize(file_name) == 0

    year = year_start
    while year <= year_end:
        print(f"Scraping year {year}")
        try:
            result = parse_page(year)

            # Convert the year's observations to a one-row DataFrame
            if result:
                df = pd.DataFrame([result])

                # Append the row to the CSV
                df.to_csv(
                    file_name,
                    mode='a',  # Append mode
                    index=False,
                    header=first_write  # Write header only for the first write
                )
                first_write = False  # Ensure header is only written once
        except Exception as e:
            # Best effort: keep what was scraped so far and stop here.
            print(f"An error occurred: {e}")
            break

        year += 1
        if year <= year_end:
            time.sleep(5)  # Respectful delay between requests

    last_year = year - 1

    # Nothing on disk (e.g. the very first year failed): there is no CSV to
    # read back or rename.
    if not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
        print(f"Scraped 0 years, last scraped year: {last_year}")
        return pd.DataFrame(columns=["year", "observation_count", "individu_count"])

    df = pd.read_csv(file_name)
    print(f"Scraped {len(df)} years, last scraped year: {last_year}")

    # Only rename when this run completed at least one year; otherwise the
    # year range in the file name would be inverted.
    if last_year >= year_start:
        # os.replace overwrites an existing target on every platform.
        os.replace(file_name, make_filename(year_start, last_year))
    return df

def make_filename(year_start=None, year_end_incl=None):
    """Build the CSV path for today's scrape, creating the output directory.

    Args:
        year_start: First scraped year, or None to omit the year range.
        year_end_incl: Last scraped year (inclusive), or None to omit the
            year range.

    Returns:
        ``scraped_data/observation_year_count_<YYYY-MM-DD>.csv``, with
        ``_year_<start>-<end>`` inserted before the extension when both years
        are given.
    """
    current_date = datetime.now().strftime('%Y-%m-%d')
    directory = 'scraped_data'

    # Create the directory if it does not exist yet; exist_ok avoids the
    # check-then-create race of a separate os.path.exists test.
    os.makedirs(directory, exist_ok=True)

    base_name = f'{directory}/observation_year_count_{current_date}'
    # Compare against None so that a falsy year such as 0 still counts.
    if year_start is not None and year_end_incl is not None:
        base_name = base_name + f'_year_{year_start}-{year_end_incl}'

    return base_name + '.csv'

#################################################################################################################

# NOTE(review): year_start and year_end are assigned here but are not passed
# to scrape() below, so the call runs with scrape's own default arguments.
# Confirm whether scrape(year_start, year_end) was intended.
year_start = 2022
year_end = 2024

scrape() # Scrape all pages

print("Scraping of yearly count of observations and individues is complete. Data saved to .csv file")


Scraping year 1971
year stats: ('2.242 waarnemingen van 34.987 individuen resulteerden in een lijst van207 soorten,4 verzamelsoorten,2 ondersoorten,1 forma.',)
year observation_count: 2.242
year individu_count: 34.987
{'year': 1971, 'observation_count': 2242, 'individu_count': 34987}
Scraping year 1972
year stats: ('5.281 waarnemingen van 80.669 individuen resulteerden in een lijst van239 soorten,2 verzamelsoorten,5 ondersoorten.',)
year observation_count: 5.281
year individu_count: 80.669
{'year': 1972, 'observation_count': 5281, 'individu_count': 80669}
Scraping year 1973
year stats: ('6.547 waarnemingen van 99.533 individuen resulteerden in een lijst van234 soorten,3 verzamelsoorten,7 ondersoorten.',)
year observation_count: 6.547
year individu_count: 99.533
{'year': 1973, 'observation_count': 6547, 'individu_count': 99533}
Scraping year 1974
year stats: ('9.548 waarnemingen van 147.922 individuen resulteerden in een lijst van236 soorten,4 verzamelsoorten,9 ondersoorten.',)
year obs