In [4]:
import bs4
import requests
import pandas as pd
from datetime import datetime
import time
import os
from random import randint

# Base URL for scraping a single observation page; the `{}` placeholder is
# filled with the observation id via `base_url.format(observation_id)`.
base_url = 'https://waarnemingen.be/observation/{}/'  

# Function to parse a single page

# Keys of an observation record besides "observation_id", in the order they
# are emitted. The order matters because rows are appended to a CSV without
# repeating the header.
_OBSERVATION_FIELDS = (
    "species_id", "species_name", "species_name_scientific", "validation",
    "gps_coordinates", "accuracy", "source", "date", "amount",
    "life_stage", "activity", "location_id", "location",
    "observer_id", "observer_name", "counting_method", "method",
)


def _table_cell_text(soup, label):
    """Return the stripped text of the <td> following the <th> containing *label*, or None."""
    cell = soup.select_one(f'th:-soup-contains("{label}") + td')
    return cell.getText().strip() if cell is not None else None


def _table_cell_link(soup, label):
    """Return the first <a> inside the <td> following the <th> containing *label*, or None."""
    return soup.select_one(f'th:-soup-contains("{label}") + td a')


def _labelled_span_text(soup, label):
    """Return the stripped text of the node right after the <span> whose string is *label*, or None."""
    span = soup.find('span', string=label)
    return span.next_sibling.getText().strip() if span is not None else None


def _extract_observation(observation_id, soup):
    """Build the observation record from a parsed observation page.

    Species and validation elements are accessed unconditionally, so a page
    without them raises AttributeError; every other field falls back to None.
    """
    coords = soup.select_one('span[title="WGS 84"] .teramap-coordinates-coords')
    location_link = _table_cell_link(soup, "Locatie")
    observer_link = _table_cell_link(soup, "Waarnemer")

    return {
        "observation_id": observation_id,
        "species_id": soup.select_one("h1 > a").get('href').split('/')[2],
        "species_name": soup.select_one(".species-common-name").getText().strip(),
        "species_name_scientific": soup.select_one(".species-scientific-name").getText().strip(),
        "validation": soup.select_one(".validation-status-text").getText().strip(),
        "gps_coordinates": coords.getText().strip() if coords is not None else None,
        "accuracy": _labelled_span_text(soup, "Nauwkeurigheid"),
        "source": _labelled_span_text(soup, "Bron"),
        "date": _table_cell_text(soup, "Datum"),
        "amount": _table_cell_text(soup, "Aantal"),
        "life_stage": _table_cell_text(soup, "Levensstadium"),
        "activity": _table_cell_text(soup, "Activiteit"),
        # The id is the third '/'-separated piece of the link's href.
        "location_id": location_link.get('href').split('/')[2] if location_link is not None else None,
        "location": location_link.getText().strip() if location_link is not None else None,
        "observer_id": observer_link.get('href').split('/')[2] if observer_link is not None else None,
        "observer_name": observer_link.getText().strip() if observer_link is not None else None,
        "counting_method": _table_cell_text(soup, "Telmethode"),
        "method": _table_cell_text(soup, "Methode"),
    }


def parse_observation(observation_id, retries=10, backoff_factor=2):
    """Fetch one observation page and return its fields as a dict.

    The page is requested up to *retries* times. Between failed attempts the
    function sleeps ``backoff_factor * 2 ** attempt`` seconds.

    Args:
        observation_id: Id substituted into ``base_url``.
        retries: Maximum number of request attempts.
        backoff_factor: Multiplier for the exponential sleep between attempts.

    Returns:
        A dict with "observation_id" plus the keys in ``_OBSERVATION_FIELDS``.
        On HTTP 404 all fields except "observation_id" are None. Returns None
        when *retries* is 0 or negative (no attempt is made).

    Raises:
        requests.exceptions.RequestException: The error of the last attempt,
            once all attempts have failed.
        AttributeError: If the fetched page lacks one of the elements that are
            read unconditionally (species link/names, validation status).
    """
    # NOTE(review): a session cookie and CSRF token are hard-coded below.
    # Credentials should not live in source; consider loading them from the
    # environment or a config file that is kept out of version control.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-encoding":"gzip, deflate, br, zstd",
        "accept-language":"nl-BE,nl;q=0.9,en-BE;q=0.8,en;q=0.7,nl-NL;q=0.6,en-US;q=0.5",
        "connection":"keep-alive",
        "cookie":"csrftoken=3JbFPYJyRC9GxhkNoW4XzF1vbbG6Fbxe; sessionid=v132os9mxwltj3ol3plhmojrjch24m9o; fundraiser_dismissed=1; cookielaw_accepted=1",
        "host":"waarnemingen.be",
        "Referer": "https://www.google.com/",
        "sec-ch-ua":'"Not A(Brand";v="8", "Chromium";v="132", "Google Chrome";v="132"',
        "sec-ch-ua-mobile":"?0",
        "sec-ch-ua-platform":"Linux",
        "sec-fetch-dest":"document",
        "sec-fetch-mode":"navigate",
        "sec-fetch-site":"same-origin",
        "sec-fetch-user":"?1",
        "upgrade-insecure-requests":"1"
    }
    url = base_url.format(observation_id)

    # The name bound by `except ... as e` is deleted when its clause ends, so
    # the failure is kept here to be re-raised after the final attempt.
    last_error = None

    for attempt in range(retries):
        try:
            # Log before sending so a hanging or failing request is still visible.
            print(f"Requesting {url} (Attempt {attempt + 1})")
            # Without a timeout a stalled connection would block forever.
            res = requests.get(url, headers=headers, timeout=30)
            res.raise_for_status()
        except requests.exceptions.HTTPError as e:
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 404:
                print(f"Observation {observation_id} not found (404). Returning empty observation.")
                return {"observation_id": observation_id, **{key: None for key in _OBSERVATION_FIELDS}}
            print(f"HTTP error {status_code} on attempt {attempt + 1}/{retries}: {e}")
            last_error = e
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e} on attempt {attempt + 1}/{retries}")
            last_error = e
        else:
            # Parsing happens outside the try body so that parse failures are
            # never mistaken for (or retried as) request failures.
            soup = bs4.BeautifulSoup(res.text, 'html.parser')
            return _extract_observation(observation_id, soup)

        if attempt < retries - 1:
            time.sleep(backoff_factor * (2 ** attempt))
        else:
            print(f"Failed to fetch observation {observation_id} after {retries} attempts.")
            raise last_error
            
def scrape(observations, species_id, folder_path, sleep_min = 2, sleep_max = 10):
    """Scrape each observation id and append its record to the in-progress CSV.

    One row is written per observation, immediately after it is parsed. The
    header row is written only with the first observation of this call, and
    only when the target file does not exist yet. After each row the function
    sleeps a random whole number of seconds in [sleep_min, sleep_max].

    Any exception stops the loop; it is printed rather than re-raised.
    """
    csv_path = make_filename(species_id, folder_path)

    try:
        for index, observation_id in enumerate(observations):
            print(f"Scraping observation {observation_id} | ({index+1:_}/{len(observations):_})")

            # Only the very first write into a brand-new file carries the header.
            needs_header = index == 0 and not os.path.isfile(csv_path)

            record = parse_observation(observation_id)
            row = pd.DataFrame([record])
            row.to_csv(csv_path, mode='a', index=False, header=needs_header)

            # Variable, respectful pause before the next request.
            pause_seconds = randint(sleep_min, sleep_max)
            time.sleep(pause_seconds)
    except Exception as err:
        print(f"An error occurred: {err}")
    
def make_filename(species_id, folder_path, complete=False):
    """Return the path of the details CSV for *species_id* inside *folder_path*.

    The directory *folder_path* is created if it does not exist, so the
    returned path can be written to right away.

    Args:
        species_id: Species id embedded in the file name.
        folder_path: Directory that holds the CSV.
        complete: When False (default) the name ends in ``_in_progress.csv``;
            when True it ends in ``.csv``.

    Returns:
        The file path as a string.
    """
    # Create the directory the file actually lives in, not a fixed one.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    base_name = f'{folder_path}/observation_details_{species_id}_clean'
    suffix = '.csv' if complete else '_in_progress.csv'
    return base_name + suffix

###########################################################################################################################################
# --- Run configuration ---
species_id = 70
folder_path_clean = "./scraped_data/cleaned/"
scrape_amount = 20000  # maximum number of observations scraped in this run
sleep_min = 1
sleep_max = 4

start_time = datetime.now()

# The general file lists every observation of the species (column 'id').
df_general = pd.read_csv(folder_path_clean + "observations_general_" + str(species_id) + "_clean" + ".csv")

# Details files written by earlier runs (finished or still in progress).
details = [f for f in os.listdir(folder_path_clean) if f"observation_details_{species_id}_clean" in f and f.endswith(".csv")]

# NOTE(review): only the first match is read and os.listdir order is
# arbitrary; if both a finished and an in-progress file exist, which one is
# used is not deterministic.
df_details = pd.read_csv(folder_path_clean + details[0]) if len(details) > 0 else None

if df_details is not None:
    # Find ids in df_general that are not in df_details
    already_scraped = df_general['id'].isin(df_details['observation_id'])
    not_in_details = df_general[~already_scraped]['id'].tolist()
else:
    not_in_details = df_general['id'].tolist()

if len(not_in_details) > 0:
    print(f"Start scraping: {scrape_amount:_} this batch, but {len(not_in_details):_} observations to scrape")
    scrape(not_in_details[0:scrape_amount], species_id, folder_path_clean, sleep_min, sleep_max)
else:
    print("All observations have been scraped.")
    in_progress_file = make_filename(species_id, folder_path_clean, complete=False)
    # After a previous run already renamed the file there is nothing left to
    # rename; without this guard a rerun fails with FileNotFoundError.
    if os.path.isfile(in_progress_file):
        os.rename(in_progress_file, make_filename(species_id, folder_path_clean, complete=True))

# Report the wall-clock duration split into days/hours/minutes/seconds.
end_time = datetime.now()
execution_time = end_time - start_time
days, rem = divmod(execution_time.total_seconds(), 86400)
hours, rem = divmod(rem, 3600)
minutes, rem = divmod(rem, 60)
seconds, _ = divmod(rem, 1)
print(f"Execution time: {int(days)} days, {int(hours)} hours, {int(minutes)} minutes, {int(seconds)} seconds")







            

Start scraping: 20_000 this batch, but 98_132 observations to scrape
Scraping observation 150291595 | (1/20_000)
Requesting https://waarnemingen.be/observation/150291595/ (Attempt 1)
Scraping observation 150285263 | (2/20_000)
Requesting https://waarnemingen.be/observation/150285263/ (Attempt 1)
Scraping observation 150284990 | (3/20_000)
Requesting https://waarnemingen.be/observation/150284990/ (Attempt 1)
Scraping observation 150284988 | (4/20_000)
Requesting https://waarnemingen.be/observation/150284988/ (Attempt 1)
Scraping observation 150290991 | (5/20_000)
Requesting https://waarnemingen.be/observation/150290991/ (Attempt 1)
Scraping observation 150341188 | (6/20_000)
Requesting https://waarnemingen.be/observation/150341188/ (Attempt 1)
Scraping observation 150341182 | (7/20_000)
Requesting https://waarnemingen.be/observation/150341182/ (Attempt 1)
Scraping observation 150284823 | (8/20_000)
Requesting https://waarnemingen.be/observation/150284823/ (Attempt 1)
Scraping observatio

KeyboardInterrupt: 