In [None]:
import bs4
import requests
import pandas as pd
from datetime import datetime
import time
import os

# Species ids (as used in the site's URLs) mapped to their species names
species = {
    70: "boomklever",
    77: "bosuil",
    116: "halsbandparkiet",
    300: "oehoe",
    362: "zwarte ooievaar",
    10041: "gewone wasbeer",
}

# URL template for the observation listing; the two placeholders are filled
# with the species id and the page number (expected to run up to page 3879).
base_url = (
    'https://waarnemingen.be/species/{}/observations/'
    '?date_after=1900-01-01&date_before=2024-12-31&page={}'
)

def _link_id(cell, prefix):
    """Return the id embedded in the first link of a table cell, or None.

    The link's href is expected to look like ``<prefix><id>/``. The prefix is
    removed as a whole string (``str.strip(prefix)`` would instead strip any
    of the prefix's *characters* from both ends and could eat into the id),
    then surrounding slashes are dropped.
    """
    link = cell.a
    if link is None:
        return None
    href = link.get('href')
    if href is None:
        return None
    if href.startswith(prefix):
        href = href[len(prefix):]
    return href.strip('/')


def _parse_observations(html, page_number):
    """Extract observation rows and the has-next-page flag from page HTML."""
    soup = bs4.BeautifulSoup(html, 'html.parser')

    observations = []
    for row in soup.select('table tbody tr'):
        cols = row.find_all('td')
        # Rows with fewer than five cells are not observation rows
        if len(cols) < 5:
            continue
        icon = cols[4].i
        title = icon.get('title') if icon else None
        observations.append({
            'date': cols[0].getText().strip(),
            'id': _link_id(cols[0], '/observation/'),
            'amount_manner': cols[1].getText().strip(),
            'location': cols[2].getText().strip(),
            'location_id': _link_id(cols[2], '/locations/'),
            'observer_name': cols[3].getText().strip(),
            'observer_id': _link_id(cols[3], '/users/'),
            'validation': title.strip() if title is not None else None,
            'page': page_number,
        })

    # A further page exists when any pagination entry is a number greater
    # than the current page; non-numeric entries are ignored.
    has_next_page = False
    for item in soup.select('.pagination li'):
        try:
            if int(item.getText()) > page_number:
                has_next_page = True
                break
        except ValueError:
            continue
    return observations, has_next_page


# Function to parse a single page
def parse_page(species_id, page_number, retries=10, backoff_factor=2, timeout=30):
    """Download and parse one page of observations for a species.

    Args:
        species_id: Species id substituted into ``base_url``.
        page_number: Page number substituted into ``base_url``.
        retries: Maximum number of request attempts; must be at least 1.
        backoff_factor: Base of the exponential delay between attempts; the
            wait after attempt ``n`` (0-based) is ``backoff_factor * 2**n``
            seconds.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            connection cannot block the scraper indefinitely.

    Returns:
        A tuple ``(observations, has_next_page)`` where ``observations`` is a
        list of dicts (one per table row with at least five cells) and
        ``has_next_page`` tells whether the pagination lists a page number
        higher than ``page_number``.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        requests.exceptions.RequestException: If the last attempt still fails
            (HTTP error status, connection problem or timeout).
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    url = base_url.format(species_id, page_number)
    for attempt in range(retries):
        try:
            res = requests.get(url, timeout=timeout)
            res.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Connection errors and timeouts are retried as well as HTTP
            # error statuses (HTTPError is a RequestException subclass).
            print(f"HTTP error: {e} on attempt {attempt + 1}/{retries}")
            if attempt == retries - 1:
                raise
            time.sleep(backoff_factor * (2 ** attempt))
            continue
        return _parse_observations(res.text, page_number)

def scrape(species_id, page_start = 1, page_end_incl = 100_000):
    """Scrape observation pages for a species and store them in a CSV file.

    Pages are fetched one at a time with ``parse_page`` starting at
    ``page_start`` and appended to a working file named by
    ``make_filename(species_id)``. Scraping stops when a page has no
    observations, when no next page is reported, when ``page_end_incl`` is
    reached, or when an error occurs (the error is printed, not raised).
    Afterwards the working file is renamed to include the scraped page range.

    Args:
        species_id: Species id passed to ``parse_page`` and ``make_filename``.
        page_start: First page to scrape.
        page_end_incl: Last page to scrape (inclusive).

    Returns:
        A DataFrame read back from the CSV file, or an empty DataFrame when
        no file was written.
    """
    file_name = make_filename(species_id)

    # to_csv does not create missing directories, so make sure it exists
    out_dir = os.path.dirname(file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write the header whenever the working file is new or empty. An existing
    # non-empty file (e.g. left by an interrupted run) is continued without a
    # second header, so the read-back below always sees exactly one.
    first_write = not os.path.exists(file_name) or os.path.getsize(file_name) == 0

    page = page_start
    keep_scraping = True

    while keep_scraping:
        print(f"Scraping page {page}...")
        try:
            observations, has_next_page = parse_page(species_id, page)
            if not observations or not has_next_page or page >= page_end_incl:
                keep_scraping = False

            if observations:
                # Append this page's observations to the CSV
                pd.DataFrame(observations).to_csv(
                    file_name,
                    mode='a',
                    index=False,
                    header=first_write,
                )
                first_write = False  # Ensure header is only written once

            if keep_scraping:
                time.sleep(5)  # Respectful delay between requests
            page += 1
        except Exception as e:
            # Best effort: report the problem and keep what was scraped so far
            print(f"An error occurred: {e}")
            break

    # `page` is only advanced after a page was handled successfully
    last_page = page - 1

    if not os.path.exists(file_name):
        # Nothing was written, so there is nothing to read back or rename
        print(f"Scraped 0 observations, last scraped page: {last_page}")
        return pd.DataFrame()

    df = pd.read_csv(file_name)
    print(f"Scraped {len(df)} observations, last scraped page: {last_page}")
    if last_page >= page_start:
        # os.replace overwrites an existing target on every platform
        os.replace(file_name, make_filename(species_id, page_start, last_page))
    return df

def make_filename(species_id, page_start = None, page_end_incl = None):
    """Build the CSV path for a species' scraped observations.

    The name contains the species id and today's date (``YYYY-MM-DD``). When
    both page bounds are given, a ``_pages_<start>-<end>`` suffix is added.

    Args:
        species_id: Species id embedded in the file name.
        page_start: First scraped page, or None to omit the page range.
        page_end_incl: Last scraped page (inclusive), or None to omit the
            page range.

    Returns:
        A relative path inside the ``scraped_data`` directory ending in
        ``.csv``.
    """
    current_date = datetime.now().strftime('%Y-%m-%d')
    base_name = f'scraped_data/observations_{species_id}_{current_date}'
    # Compare against None so that a bound of 0 still yields the range suffix
    if page_start is not None and page_end_incl is not None:
        base_name += f'_pages_{page_start}-{page_end_incl}'
    return base_name + '.csv'

#################################################################################################################

# Run settings: the species to scrape and an optional page window
species_id = 70
page_start, page_end_incl = 1, 2

# Scrape every available page; use the commented call below instead to
# scrape only the page window defined above.
scrape(species_id)
# scrape(species_id, page_start, page_end_incl)

print("Scraping complete. Data saved to .csv file")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
