In [37]:
import os
import datetime
import requests
import zipfile
import pandas as pd
import geopandas as gpd
import h3
from shapely.geometry import shape
import fiona
from multiprocessing import Pool
from bs4 import BeautifulSoup

In [38]:
def scrape_urls(base_url):
    """
    Scrapes all URLs from a webpage.

    Args:
        base_url (str): The URL of the webpage to scrape.

    Returns:
        list: A list of URLs found on the webpage.
    """
    # Send a GET request to the webpage
    response = requests.get(base_url)

    # If the GET request is successful, the status code will be 200
    if response.status_code == 200:
        # Get the content of the response
        page_content = response.content

        # Create a BeautifulSoup object and specify the parser
        soup = BeautifulSoup(page_content, 'html.parser')

        # Find all the anchor tags in the HTML
        # Extract the href attribute and add it to a list
        urls = [a['href'] for a in soup.find_all('a', href=True) if 'cbs_vk100' in a['href']]

        return urls

    else:
        print(f"Failed to scrape {base_url}")
        return []

In [39]:
def download_and_extract(url, year, download_path, extract_path):
    """
    Downloads a ZIP file from a URL and extracts its contents.

    Args:
        year (int): The year of the data.
        download_path (str): Path to save the downloaded ZIP file.
        extract_path (str): Path to extract the contents of the ZIP file.

    Returns:
        bool: True if the file was successfully downloaded and extracted, False otherwise.
    """    
    try:
        # Download the file
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception if the request was unsuccessful
    	
        # Save the file
        with open(download_path, 'wb') as f:
            f.write(response.content)

        # Extract the file
        with zipfile.ZipFile(download_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        
        return True  # The file was successfully downloaded and extracted

    except Exception as e:
        print(f"Error downloading or extracting file: {e}")
        return False  # An error occurred

In [40]:
def process_geojson(url_gpkg, resolution, output_filepath):
  """
  ... (function docstring)
  """
  try:
    if not os.path.exists(url_gpkg):
      raise FileNotFoundError(f"Input file not found: {url_gpkg}")

    with fiona.open(url_gpkg, 'r') as src:
      crs = src.crs
      population_data = gpd.GeoDataFrame()

      for feat in src:
        # Process each feature individually
        df = gpd.GeoDataFrame([feat], crs=crs)
        # Select columns and explode
        selected_data = df[['properties', 'geometry']]
        selected_data['aantal_inwoners'] = selected_data['properties'].apply(lambda x: x['aantal_inwoners'])
        selected_data.drop(columns=['properties'], inplace=True)
        centroid_points = selected_data.explode(index_parts=True)
        # Handle missing values (replace with 0 for this example)
        centroid_points.fillna(0, inplace=True)
        # Set CRS
        centroid_points = centroid_points.set_crs(4326, allow_override=True)

        # Aggregate to H3 cells (consider vectorization for efficiency)
        h3_cells = h3.geo_to_h3(centroid_points['geometry'].tolist())

        # Aggregate to H3 cells (consider vectorization for efficiency)
        h3_cells = []
        for geom in centroid_points['geometry'].tolist():
            lat, lng = geom.y, geom.x  # Extract latitude and longitude
            h3_cell = h3.geo_to_h3(lat, lng, resolution)  # Convert to H3 cell
            h3_cells.append(h3_cell)

        # Append H3 data to population_data
        population_data = population_data.append(pd.DataFrame(h3_cells, columns=['h3']), ignore_index=True)


    # Save H3 data to CSV after processing all features
    population_data.to_csv(output_filepath)

  except Exception as e:
    print(f"Error processing data: {e}")


In [41]:
def main():
    # Get the current year
    current_year = datetime.datetime.now().year -1

    # Scrape the URLs from the website
    base_url = 'https://www.cbs.nl/nl-nl/dossier/nederland-regionaal/geografische-data/kaart-van-100-meter-bij-100-meter-met-statistieken#:~:text=In%20deze%20kaart%20met%20vierkanten,en%20nabijheid%20van%20voorzieningen%20samengesteld.'
    urls = scrape_urls(base_url)
    # print(urls)

    # Try downloading the file for the current year and previous years
    for year in range(current_year - 1, current_year - 12, -1):  # Try for the last 10 years
        # Construct the file paths using the year
        download_path = f'./cbs_{year}.zip'
        extract_path = f'./cbs_{year}'
        url_gpkg = os.path.join(extract_path, f'cbs_vk100_{year}_v1.gpkg')
        output_filepath = f'./cbs_{year}_h3.csv'

        # Find the URL for the current year
        url = next((u for u in urls if str(year) in u), None)

        if url:
            # Download and extract the file
            if download_and_extract(url, year, download_path, extract_path):

                # If successful, process the data and break the loop
                process_geojson(url_gpkg, 9, output_filepath)
                break
        else:
            print(f"No URL available for {year}")

In [42]:
if __name__ == "__main__":
    main()

Error processing data: _API_FUNCTIONS.geo_to_h3() missing 2 required positional arguments: 'lng' and 'resolution'
