In [3]:
import bs4
import requests
import pandas as pd
from datetime import datetime

# Species IDs of interest, each paired with a (presumably Dutch) common name.
# Not referenced by the rest of the visible code; the ID actually scraped is
# set separately via `species_id` further down.
# NOTE(review): "halsbandpakiet" looks like a typo for "halsbandparkiet" — confirm.
species = {70: "boomklever", 116: "halsbandpakiet"}

# URL template for a species' observation listing. The two `{}` placeholders
# are filled, in order, with the species ID and the page number. The date
# window (1970-01-15 through 2025-01-14) is fixed in the query string.
base_url = 'https://waarnemingen.be/species/{}/observations/?date_after=1970-01-15&date_before=2025-01-14&page={}'

def _link_href(cell):
    """Return the ``href`` of the first ``<a>`` inside *cell*, or None if absent."""
    anchor = cell.a
    # `.get` avoids a KeyError when an anchor exists but carries no href.
    return anchor.get('href') if anchor is not None else None


# Function to parse a single page
def parse_page(species_id, page_number, timeout=30):
    """Fetch one observation-listing page and return its table rows as dicts.

    Args:
        species_id: Value substituted into the first placeholder of
            ``base_url``.
        page_number: Value substituted into the second placeholder of
            ``base_url``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list with one dict per ``table tbody tr`` row that has at least
        five cells. Each dict has the keys ``date``, ``observation_id``,
        ``amount_manner``, ``location``, ``location_id``, ``observer_name``,
        ``observer_id`` and ``validation``. The ``*_id`` values are the raw
        ``href`` of the first link in the corresponding cell, or None when
        the cell has no link (or the link has no ``href``). The list is empty
        when the page contains no matching rows.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    res = requests.get(base_url.format(species_id, page_number), timeout=timeout)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    observations = []
    for row in soup.select('table tbody tr'):
        cols = row.find_all('td')
        # Rows with fewer than five cells do not carry a full observation.
        if len(cols) < 5:
            continue
        date_cell, amount_cell, location_cell, observer_cell, validation_cell = cols[:5]
        observations.append({
            'date': date_cell.get_text().strip(),
            'observation_id': _link_href(date_cell),
            'amount_manner': amount_cell.get_text().strip(),
            'location': location_cell.get_text().strip(),
            'location_id': _link_href(location_cell),
            'observer_name': observer_cell.get_text().strip(),
            'observer_id': _link_href(observer_cell),
            'validation': validation_cell.get_text().strip(),
        })

    return observations

def scrape_all_pages(species_id):
    """Collect observations from every listing page of one species.

    Pages are requested in order starting at page 1 via ``parse_page``; a
    progress line is printed before each request. Scraping ends as soon as a
    page yields no observations.

    Args:
        species_id: Passed through to ``parse_page``.

    Returns:
        A single list holding the observation dicts of all non-empty pages,
        in page order.
    """
    collected = []
    page_number = 0
    more_pages = True
    while more_pages:
        page_number += 1
        print(f"Scraping page {page_number}...")
        batch = parse_page(species_id, page_number)
        # An empty batch marks the end of the listing and ends the loop.
        more_pages = bool(batch)
        collected += batch
    return collected

def scrape_multiple_pages(species_id, page_start, page_end_excl):
    """Collect observations from a half-open range of listing pages.

    Pages ``page_start`` up to, but not including, ``page_end_excl`` are
    requested in order via ``parse_page``; a progress line is printed before
    each request. Scraping ends early if a page yields no observations.

    Args:
        species_id: Passed through to ``parse_page``.
        page_start: First page number to request.
        page_end_excl: Page number at which to stop (exclusive).

    Returns:
        A single list holding the observation dicts of the scraped pages,
        in page order.
    """
    collected = []
    for page_number in range(page_start, page_end_excl):
        print(f"Scraping page {page_number}...")
        batch = parse_page(species_id, page_number)
        if not batch:
            # Nothing on this page: later pages are not requested.
            return collected
        collected += batch
    return collected


# Scrape a small page range for one species and store the result as CSV.
species_id = 116
page_start = 1
page_end_excl = 3  # exclusive upper bound: pages 1 and 2 are scraped

observations = scrape_multiple_pages(species_id, page_start, page_end_excl)

# Convert to DataFrame
df = pd.DataFrame(observations)

# Save to CSV. The file name records the species, the requested page range
# and the date of the run.
current_date = datetime.now().strftime('%Y-%m-%d')
output_path = f'observations_{species_id}_{page_start}-{page_end_excl}_{current_date}.csv'
df.to_csv(output_path, index=False)

# Report the file that was actually written rather than a fixed name.
print(f"Scraping complete. Data saved to '{output_path}'")
print(df.head())


Scraping page 1...
Scraping page 2...
Scraping complete. Data saved to 'observations.csv'
               date           observation_id                amount_manner  \
0  2025-01-14 16:33  /observation/337053960/                           10   
1        2025-01-14  /observation/337050655/                            1   
2  2025-01-14 14:35  /observation/337054880/               1  foeragerend   
3  2025-01-14 13:38  /observation/337053880/         3  gezien en gehoord   
4  2025-01-14 13:31  /observation/337043515/  6  adult, gezien en gehoord   

                                            location        location_id  \
0                   Diest - Centrum (Vlaams-Brabant)  /locations/30991/   
1  Harelbeke/Deerlijk - De Gavers (Provinciedomei...  /locations/31857/   
2                Diest - Halve Maan (Vlaams-Brabant)  /locations/92164/   
3           Hofstade - BLOSO-Domein (Vlaams-Brabant)  /locations/30871/   
4                       Beauvechain (Brabant Wallon)  /locations/43443/ 