# Scrapen van de individuele locaties

- Doel: het verkrijgen van de oppervlakte van elke locatie
- Scrapen
  - Een variabele, maar respectvolle tijd voorzien tussen 2 aanvragen
  - Voldoende tijd voorzien
- Op basis van de lijst met alle locatie-id's per soort kunnen we ons csv-bestand telkens aanvullen met de ontbrekende oppervlaktes. Zo kunnen we de data in verschillende sessies ophalen en bij elke scrape-sessie aanvullen met de observaties die nog ontbreken. Zodra alle observaties opgehaald zijn, verandert de bestandsnaam van _in_progress.csv naar _complete.csv.

In [3]:
import bs4
import requests
import pandas as pd
from datetime import datetime
import time
import os
from random import randint

# Base URL for scraping; the `{}` placeholder is filled with a location id
# via `base_url.format(location_id)`.
base_url = 'https://waarnemingen.be/locations/{}/'

# Function to parse a single location
def parse_location(location_id, retries=10, backoff_factor=2):
    """Fetch the page of one location and return its area as text.

    The page at ``base_url.format(location_id)`` is requested and the text of
    the ``<td>`` that directly follows a ``<th>`` containing "Oppervlakte" is
    returned.

    Args:
        location_id: Identifier substituted into ``base_url``.
        retries: Maximum number of request attempts.
        backoff_factor: Base (in seconds) of the exponential wait between
            attempts; attempt ``n`` (0-based) waits ``backoff_factor * 2**n``.

    Returns:
        The stripped cell text, or ``None`` when the server answers 404 or
        the page has no "Oppervlakte" row.

    Raises:
        requests.exceptions.RequestException: The error of the last attempt,
            once all ``retries`` attempts have failed.
    """
    # NOTE(review): the cookie below embeds a hard-coded session id and CSRF
    # token. Consider loading it from configuration/environment instead of
    # keeping it in source.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-encoding":"gzip, deflate, br, zstd",
        "accept-language":"nl-BE,nl;q=0.9,en-BE;q=0.8,en;q=0.7,nl-NL;q=0.6,en-US;q=0.5",
        "connection":"keep-alive",
        "cookie":"csrftoken=3JbFPYJyRC9GxhkNoW4XzF1vbbG6Fbxe; sessionid=v132os9mxwltj3ol3plhmojrjch24m9o; fundraiser_dismissed=1; cookielaw_accepted=1",
        "host":"waarnemingen.be",
        "Referer": "https://www.google.com/",
        "sec-ch-ua":'"Not A(Brand";v="8", "Chromium";v="132", "Google Chrome";v="132"',
        "sec-ch-ua-mobile":"?0",
        "sec-ch-ua-platform":"Linux",
        "sec-fetch-dest":"document",
        "sec-fetch-mode":"navigate",
        "sec-fetch-site":"same-origin",
        "sec-fetch-user":"?1",
        "upgrade-insecure-requests":"1"
    }
    url = base_url.format(location_id)
    area_selector = 'th:-soup-contains("Oppervlakte") + td'
    # The name bound by `except ... as e` is deleted when the except clause
    # ends, so the error is kept here to be re-raised after the last attempt.
    last_error = None

    for attempt in range(retries):
        print(f"Requesting {url} (Attempt {attempt + 1})")
        try:
            # A timeout prevents a stalled connection from blocking the whole
            # scrape; a timeout error is retried like any other request error.
            res = requests.get(url, headers=headers, timeout=30)
            res.raise_for_status()
        except requests.exceptions.HTTPError as e:
            if res.status_code == 404:
                print(f"Location {location_id} not found (404). Returning empty location.")
                return None
            print(f"HTTP error {res.status_code} on attempt {attempt + 1}/{retries}: {e}")
            last_error = e
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e} on attempt {attempt + 1}/{retries}")
            last_error = e
        else:
            soup = bs4.BeautifulSoup(res.text, 'html.parser')
            area_cell = soup.select_one(area_selector)  # oppervlakte (area)
            return area_cell.get_text(strip=True) if area_cell is not None else None

        if attempt < retries - 1:
            # Exponential backoff before the next attempt.
            time.sleep(backoff_factor * (2 ** attempt))
        else:
            print(f"Failed to fetch location {location_id} after {retries} attempts.")
            raise last_error
            
def scrape(locations, folder_path, sleep_min=2, sleep_max=10):
    """Scrape the area of every location and append each row to a CSV file.

    Each row of ``locations`` gets an extra ``area`` value obtained from
    ``parse_location`` and is appended immediately to the in-progress file
    returned by ``make_filename(folder_path)``, so that an interrupted run
    keeps everything scraped so far.

    Args:
        locations: DataFrame with at least a ``location_id`` column.
        folder_path: Folder passed to ``make_filename`` to locate the output.
        sleep_min: Lower bound (whole seconds) of the random pause between
            requests.
        sleep_max: Upper bound (whole seconds) of the random pause between
            requests.

    Any exception raised while scraping is printed and ends the loop; it is
    not propagated to the caller.
    """
    file_name = make_filename(folder_path)
    total = locations.shape[0]
    print(f"Scraping {total} locations to {file_name}")

    try:
        # `iterrows()` yields index *labels*, which are not 0-based positions
        # on a filtered frame, so the progress counter comes from enumerate.
        for position, (_, location) in enumerate(locations.iterrows(), start=1):
            location_id = location["location_id"]
            print(f"Scraping location {location_id} | ({position:_}/{total:_})")

            location['area'] = parse_location(location_id)

            # Write the header only when the target file does not hold any
            # data yet, independent of the row's index label.
            write_header = (
                not os.path.isfile(file_name)
                or os.path.getsize(file_name) == 0
            )
            pd.DataFrame([location]).to_csv(
                file_name,
                mode='a',  # Append mode
                index=False,
                header=write_header,
            )

            # Variable, respectful delay between requests.
            time.sleep(randint(sleep_min, sleep_max))

    except Exception as e:
        # Best-effort run: rows already written stay on disk, the next
        # session picks up the missing locations.
        print(f"An error occurred: {e}")
def make_filename(folder_path, complete=False):
    """Return the path of the location-details CSV inside ``folder_path``.

    The folder is created when it does not exist yet, so the returned path
    can be written to directly.

    Args:
        folder_path: Folder that holds the CSV file. An empty string means
            the current working directory.
        complete: When ``False`` (default) the ``_in_progress.csv`` name is
            returned, otherwise the ``_complete.csv`` name.

    Returns:
        The file path as a string.
    """
    # Create the folder that is actually used instead of a hard-coded one.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)
    # os.path.join avoids a doubled separator when folder_path ends in "/".
    base_name = os.path.join(folder_path, 'location_details')
    suffix = '_complete.csv' if complete else '_in_progress.csv'
    return base_name + suffix

###########################################################################################################################################
# --- Run configuration ---
folder_path_clean = "./scraped_data/cleaned/"  # folder with the cleaned source list
folder_path_raw = "./scraped_data/"  # folder where the scraped details are stored
scrape_amount = 100000  # maximum number of locations handled in this session
sleep_min = 1  # lower bound (seconds) of the pause between requests
sleep_max = 4  # upper bound (seconds) of the pause between requests

start_time = datetime.now()

df_general = pd.read_csv(folder_path_clean + "locations_general_clean.csv") # source
# Existing details files (in progress or complete); only the first match is used.
details = [f for f in os.listdir(folder_path_raw) if "location_details" in f and f.endswith(".csv")] # target in progress

df_details = pd.read_csv(folder_path_raw + details[0]) if len(details) > 0 else None

if df_details is not None:
    # Find ids in df_general that are not in df_details
    df_to_scrape = df_general[~df_general['location_id'].isin(df_details['location_id'])]
else:
    df_to_scrape = df_general


if df_to_scrape.shape[0] > 0:
    print(f"Start scraping: {scrape_amount:_} this batch, {df_to_scrape.shape[0]:_} locations to scrape")
    scrape(df_to_scrape.iloc[0:scrape_amount,:], folder_path_raw, sleep_min, sleep_max)
else:
    print("All locations have been scraped.")
    # `details` is empty when no details file exists yet (e.g. an empty source
    # list), so guard the lookup before marking the file as complete.
    if details and 'in_progress' in details[0]:
        os.rename(make_filename(folder_path_raw, complete=False), make_filename(folder_path_raw, complete=True))

# Report the wall-clock duration of this session.
end_time = datetime.now()
execution_time = end_time - start_time
days, rem = divmod(execution_time.total_seconds(), 86400)
hours, rem = divmod(rem, 3600)
minutes, rem = divmod(rem, 60)
seconds, _ = divmod(rem, 1)
print(f"Execution time: {int(days)} days, {int(hours)} hours, {int(minutes)} minutes, {int(seconds)} seconds")


All locations have been scraped.
Execution time: 0 days, 0 hours, 0 minutes, 0 seconds
