In [None]:
import bs4
import requests
import pandas as pd
from datetime import datetime
import time
import os

# Base URL for scraping: detail page of a single observation (`{}` = observation id).
# The previous value was the paginated species-list URL, which has two placeholders
# and therefore raised IndexError when formatted with only the observation id.
base_url = 'https://waarnemingen.be/observation/{}/'


def _text_or_none(node):
    """Return the stripped text of a parsed node, or None when the node is missing."""
    return node.getText().strip() if node is not None else None


def _segment_after(href, marker):
    """Return the URL path segment that follows `marker` in `href`, or None if absent."""
    if not href:
        return None
    parts = [part for part in href.split('/') if part]
    # Look for the marker everywhere except the last slot so `index + 1` is always valid.
    if marker in parts[:-1]:
        return parts[parts.index(marker) + 1]
    return None


def _value_after_label(soup, tag, label):
    """Return the text of the node directly after the `tag` element whose string is `label`."""
    label_node = soup.find(tag, string=label)
    return _text_or_none(label_node.next_sibling) if label_node is not None else None


def _detail_table_value(soup, label):
    """Return the text of the <td> beside the <th> labelled `label` in #observation_details."""
    details = soup.select_one("#observation_details")
    header = details.find("th", string=label) if details is not None else None
    # NOTE(review): assumes the value lives in the <td> sibling of the label <th>
    # -- TODO confirm against the live page markup.
    return _text_or_none(header.find_next_sibling("td")) if header is not None else None


# Function to parse a single observation detail page
def parse_observation(observation_id, retries=10, backoff_factor=2, timeout=30):
    """Download one observation detail page and extract its fields.

    Args:
        observation_id: Id inserted into `base_url`.
        retries: Maximum number of download attempts.
        backoff_factor: Base of the exponential backoff; the wait before the
            next attempt is `backoff_factor * 2 ** attempt` seconds.
        timeout: Seconds passed to `requests.get` so a stalled connection
            cannot block forever.

    Returns:
        A dict with the keys species_id, species_name, species_name_scientific,
        validation, gps_coordinates, accuracy, source and date. A field whose
        element is not found on the page is None. Returns None when
        `retries` is 0 or negative (no attempt is made).

    Raises:
        requests.exceptions.RequestException: when the last attempt fails.
    """
    url = base_url.format(observation_id)

    for attempt in range(retries):
        try:
            res = requests.get(url, timeout=timeout)
            res.raise_for_status()
        except requests.exceptions.RequestException as e:
            # RequestException also covers connection errors and timeouts,
            # not just HTTP status errors, so those are retried as well.
            print(f"Request error: {e} on attempt {attempt + 1}/{retries}")
            if attempt < retries - 1:
                time.sleep(backoff_factor * (2 ** attempt))
                continue
            raise

        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        species_link = soup.select_one("h1 > a")
        species_href = species_link.get('href') if species_link is not None else None

        # TODO: still to extract (see https://waarnemingen.be/observation/336664701/):
        # amount, life stage, activity, location id, location name,
        # observer name, observer id, counting method, method.
        return {
            # str.strip() removes a *set of characters*, not a prefix, so the id
            # is taken as the path segment that follows "species" instead.
            "species_id": _segment_after(species_href, 'species'),
            "species_name": _text_or_none(soup.select_one(".species-common-name")),
            "species_name_scientific": _text_or_none(soup.select_one(".species-scientific-name")),

            "validation": _text_or_none(soup.select_one(".validation-status-text")),

            "gps_coordinates": _text_or_none(
                soup.select_one('span[title="WGS 84"] .teramap-coordinates-coords')
            ),
            "accuracy": _value_after_label(soup, 'span', "Nauwkeurigheid"),
            "source": _value_after_label(soup, 'span', "Bron"),

            # find() takes a tag name, not a CSS selector; the old call
            # find("#observation_details th", ...) could never match.
            "date": _detail_table_value(soup, "Datum"),
        }

    return None
            
def scrape(observations=None, species_id=10041, delay=5):
    """Scrape observation detail pages and append each one as a row to a CSV file.

    Args:
        observations: Iterable of observation ids. Defaults to the single
            sample id "336664701".
        species_id: Value passed to `make_filename` to build the output path.
        delay: Seconds to sleep after each request.

    Scraping is best-effort: the first exception is printed and stops the run,
    and rows already appended to the file are kept.
    """
    if observations is None:
        observations = ["336664701"]

    file_name = make_filename(species_id)

    try:
        # to_csv does not create missing parent directories.
        os.makedirs(os.path.dirname(file_name) or '.', exist_ok=True)

        for observation_id in observations:
            observation = parse_observation(observation_id)

            # Write the header only when this write creates the file.
            write_header = not os.path.isfile(file_name)

            # Wrap the dict in a list: a dict of scalar values alone cannot be
            # turned into a DataFrame without an explicit index.
            pd.DataFrame([observation]).to_csv(
                file_name,
                mode='a',  # Append mode
                index=False,
                header=write_header,
            )

            time.sleep(delay)  # Respectful delay between requests

    except Exception as e:
        # Leaving the try block already ends the loop; the former `break` here
        # sat outside any loop and was a SyntaxError.
        print(f"An error occurred: {e}")


def make_filename(species_id):
    """Return the relative CSV path for today's scrape of `species_id`.

    The path has the form
    `scraped_data/observation_details_<species_id>_<YYYY-MM-DD>.csv`,
    where the date is the current local date.
    """
    stamp = datetime.now().date().isoformat()
    return f'scraped_data/observation_details_{species_id}_{stamp}.csv'
            

In [12]:
import bs4
import requests
import pandas as pd
from datetime import datetime
import time
import os

# Base URL for scraping: one page of a species' observation list.
# First `{}` is the species id, second `{}` is the page number.
# NOTE(review): no date filter is applied here; add date_after/date_before
# query parameters if a fixed date window is wanted.
base_url = 'https://waarnemingen.be/species/{}/observations/?page={}'


def _last_path_segment(href):
    """Return the final non-empty path segment of `href`, e.g. '/users/40457/' -> '40457'."""
    if not href:
        return None
    path = href.split('?', 1)[0].rstrip('/')
    return path.rsplit('/', 1)[-1] or None


def _link_id(cell):
    """Return the id at the end of the href of the first link in `cell`, or None."""
    link = cell.a
    return _last_path_segment(link.get('href')) if link is not None else None


# Function to parse a single page of the observation list
def parse_observation(observation_id, page=1, timeout=30):
    """Download one page of an observation list and return its table rows.

    Args:
        observation_id: Id inserted into the first placeholder of `base_url`.
            The callers in this notebook pass a species id here; the parameter
            name is kept for backward compatibility.
        page: Page number inserted into the second placeholder of `base_url`.
        timeout: Seconds passed to `requests.get` so a stalled connection
            cannot block forever.

    Returns:
        A list with one dict per table row that has at least five cells, with
        the keys date, id, amount_manner, location, location_id,
        observer_name, observer_id and validation. The list is empty when the
        page has no such rows.

    Raises:
        requests.exceptions.RequestException: on network failure, timeout or a
            non-success HTTP status.
    """
    res = requests.get(base_url.format(observation_id, page), timeout=timeout)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    observations = []
    for row in soup.select('table tbody tr'):
        cols = row.find_all('td')
        if len(cols) < 5:
            # Skip rows that do not have all five expected cells.
            continue

        status_icon = cols[4].i
        status_title = status_icon.get('title') if status_icon is not None else None

        observations.append({
            'date': cols[0].getText().strip(),
            # Ids come from the last path segment of each link; str.strip()
            # removes a set of characters, not a prefix, so it is not used.
            'id': _link_id(cols[0]),
            'amount_manner': cols[1].getText().strip(),
            'location': cols[2].getText().strip(),
            'location_id': _link_id(cols[2]),
            'observer_name': cols[3].getText().strip(),
            'observer_id': _link_id(cols[3]),
            'validation': status_title.strip() if status_title else None,
        })

    return observations

def scrape_all_pages(species_id):
    """Collect observations page by page, starting at page 1.

    Pages are requested through `parse_observation(species_id, page)` until a
    page yields no observations; everything gathered so far is then returned
    as one flat list.
    """
    collected = []
    page_number = 1
    while True:
        print(f"Scraping page {page_number}...")
        batch = parse_observation(species_id, page_number)
        if batch:
            collected += batch
            page_number += 1
            continue
        # An empty page marks the end of the data.
        return collected

def scrape_multiple_pages(species_id, page_start, page_end_excl):
    """Collect observations from pages `page_start` up to, not including, `page_end_excl`.

    Each page is fetched with `parse_observation(species_id, page)`. The walk
    stops early at the first page that yields no observations. Returns one
    flat list of everything gathered.
    """
    gathered = []
    for page_number in range(page_start, page_end_excl):
        print(f"Scraping page {page_number}...")
        rows = parse_observation(species_id, page_number)
        if rows:
            gathered += rows
        else:
            # Stop if no more data
            break
    return gathered

def scrape(species_id, page_start=0, page_end_excl=0):
    """Scrape observations for `species_id`, either every page or a page range.

    When `page_end_excl` equals 0 the whole list is scraped with
    `scrape_all_pages` and `page_start` is not used. Otherwise the range
    `page_start` .. `page_end_excl` (exclusive) is scraped with
    `scrape_multiple_pages`.
    """
    return (
        scrape_all_pages(species_id)
        if page_end_excl == 0
        else scrape_multiple_pages(species_id, page_start, page_end_excl)
    )


# Run configuration
species_id = 116
page_start = 1
page_end_excl = 4  # exclusive upper bound; 0 makes scrape() fetch every page

observations = scrape(species_id, page_start, page_end_excl)

# Convert to DataFrame
df = pd.DataFrame(observations)

# Save to CSV. scrape() performs a full scrape whenever page_end_excl is 0
# (page_start is ignored in that case), so the "_full" suffix follows the
# same condition.
current_date = datetime.now().strftime('%Y-%m-%d')
if page_end_excl == 0:
    output_path = f'observations_{species_id}_{current_date}_full.csv'
else:
    output_path = f'observations_{species_id}_{current_date}_{page_start}_{page_end_excl}.csv'
df.to_csv(output_path, index=False)

# Report the file that was actually written.
print(f"Scraping complete. Data saved to '{output_path}'")
df.head()


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping complete. Data saved to 'observations.csv'
               date         id              amount_manner  \
0  2025-01-14 16:57  337060073  2  overvliegend noordwest   
1  2025-01-14 16:33  337053960                         10   
2        2025-01-14  337050655                          1   
3  2025-01-14 14:35  337054880             1  foeragerend   
4  2025-01-14 13:38  337053880       3  gezien en gehoord   

                                            location location_id  \
0           Deerlijk - De Bonte Os (West-Vlaanderen)      623755   
1                   Diest - Centrum (Vlaams-Brabant)       30991   
2  Harelbeke/Deerlijk - De Gavers (Provinciedomei...       31857   
3                Diest - Halve Maan (Vlaams-Brabant)       92164   
4           Hofstade - BLOSO-Domein (Vlaams-Brabant)       30871   

  observer_name observer_id                            validation  
0    yann feryn       40457                    