In [17]:
import os
import datetime
import requests
import zipfile
import pandas as pd
import geopandas as gpd
import h3
from shapely.geometry import Point, shape
from multiprocessing import Pool
from bs4 import BeautifulSoup
import logging

In [18]:
def scrape_urls(base_url, timeout=30):
    """
    Collects the CBS 100m-grid download links found on a webpage.

    Only anchors whose ``href`` contains ``'cbs_vk100'`` are kept. Relative
    links are resolved against ``base_url`` so every returned entry can be
    passed straight to ``requests.get``.

    Args:
        base_url (str): The URL of the webpage to scrape.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list: Absolute URLs of the matching links, in page order. Empty if the
        page could not be fetched or parsed (the error is printed).
    """
    # Local import so this cell does not depend on an edit to the imports cell.
    from urllib.parse import urljoin

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(base_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return [
            # urljoin leaves absolute hrefs untouched and expands relative ones.
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if 'cbs_vk100' in anchor['href']
        ]
    except Exception as e:
        print(f"Failed to scrape {base_url} due to {e}")
        return []

In [19]:
def clip_gpkg(gpkg_path, shapefile_path, output_path):
    """
    Cuts a GeoPackage layer down to the geometries of a shapefile and writes GeoJSON.

    Both inputs are reprojected to EPSG:4326 before clipping. Any failure is
    printed and swallowed, so the caller gets no exception and no output file.

    Args:
        gpkg_path (str): Path to the GeoPackage to clip.
        shapefile_path (str): Path to the shapefile used as the clip mask.
        output_path (str): Destination path of the clipped GeoJSON.
    """
    try:
        # Read and reproject the source first, then the mask.
        source_layer = gpd.read_file(gpkg_path).to_crs(epsg=4326)
        mask_layer = gpd.read_file(shapefile_path).to_crs(epsg=4326)
        gpd.clip(source_layer, mask_layer).to_file(output_path, driver='GeoJSON')
    except Exception as e:
        print(f"Error clipping GeoPackage: {e}")

In [20]:
def download_and_extract(url, year, download_path, extract_path, timeout=60):
    """
    Downloads a ZIP file from a URL and extracts its contents.

    The archive is streamed to disk in chunks rather than held in memory.

    Args:
        url (str): Location of the ZIP archive to download.
        year (int): The year of the data (not used by the download itself).
        download_path (str): Path to save the downloaded ZIP file.
        extract_path (str): Path to extract the contents of the ZIP file.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        bool: True if the file was successfully downloaded and extracted, False otherwise
        (the error is printed).
    """
    try:
        # Make sure the folder that will hold the archive exists.
        target_dir = os.path.dirname(download_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # stream=True + iter_content writes the archive piece by piece; the
        # timeout keeps a stalled server from blocking the notebook forever.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(download_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        with zipfile.ZipFile(download_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        return True
    except Exception as e:
        print(f"Error downloading or extracting file: {e}")
        return False

In [21]:
def process_feature(feature, resolution):
  """
  Maps one (value, geometry) row to the H3 cell containing the geometry's centroid.

  Works with both h3-py 3.x (``geo_to_h3``) and 4.x (``latlng_to_cell``).

  Args:
    feature (tuple): A ``(value, geometry)`` pair. The centroid's ``y`` is
      passed as latitude and ``x`` as longitude, so the geometry is assumed
      to be in lon/lat degrees -- TODO confirm against the input file's CRS.
    resolution (int): H3 resolution (0-15).

  Returns:
    tuple: ``(h3_cell, value)``, or ``(None, None)`` if the row could not be
    processed (the error is printed).
  """
  try:
    value, geometry = feature
    centroid = geometry.centroid
    # h3-py 4.x renamed geo_to_h3 to latlng_to_cell; use whichever exists.
    to_cell = getattr(h3, 'latlng_to_cell', None) or h3.geo_to_h3
    h3_cell = to_cell(centroid.y, centroid.x, resolution)
    return h3_cell, value
  except Exception as e:
    print(f"Error processing feature: {e}")
    return None, None

In [22]:
def process_geojson(clip_gdf, resolution, output_filepath):
  """
  Reads a GeoJSON file and writes its ``aantal_woningen`` values summed per H3 cell.

  Rows with ``aantal_woningen <= 0`` are dropped, each remaining geometry is
  mapped to an H3 cell by ``process_feature``, rows that failed conversion are
  discarded, and values falling in the same cell are added together. Any
  error is printed and swallowed.

  Args:
    clip_gdf (str): URL or path to the GeoJSON file (despite the name, not a GeoDataFrame).
    resolution (int): H3 resolution (0-15).
    output_filepath (str): Path to save the resulting CSV file. Columns are
      ``hex9`` (H3 cell; the name is fixed regardless of ``resolution``) and
      ``value`` (summed ``aantal_woningen``).
  """
  try:
    gdf = gpd.read_file(clip_gdf)
    gdf = gdf[['aantal_woningen', 'geometry']]
    gdf = gdf[gdf['aantal_woningen'] > 0]  # Drop zero and negative placeholder values
    features = list(gdf.itertuples(index=False, name=None))

    if features:
      # NOTE(review): with the "spawn" start method the workers must be able to
      # import process_feature; confirm this works when run from a notebook kernel.
      with Pool(processes=os.cpu_count()) as pool:
        results = pool.starmap(process_feature, [(feature, resolution) for feature in features])
    else:
      # Nothing to convert: still emit a header-only CSV instead of failing on zip(*[]).
      results = []

    # process_feature signals failure with (None, None); keep those out of the CSV.
    rows = [(cell, value) for cell, value in results if cell is not None]
    df = pd.DataFrame(rows, columns=['hex9', 'value'])
    if not df.empty:
      # Several input geometries can share one H3 cell; add them up so each
      # cell appears once, keeping first-seen order.
      df = df.groupby('hex9', sort=False, as_index=False)['value'].sum()
    df.to_csv(output_filepath, index=False)
  except Exception as e:
    print(f"Error processing data: {e}")


In [23]:
def _locate_gpkg(extract_path, year):
    """Returns the path of the extracted GeoPackage for ``year``, tolerating other version suffixes than ``_v1``."""
    expected = os.path.join(extract_path, f'cbs_vk100_{year}_v1.gpkg')
    if os.path.exists(expected) or not os.path.isdir(extract_path):
        return expected
    candidates = sorted(
        name for name in os.listdir(extract_path) if name.lower().endswith('.gpkg')
    )
    return os.path.join(extract_path, candidates[0]) if candidates else expected


def main():
    """
    Runs the pipeline for the most recent data year that has a download link.

    Years are tried from two years before the current year, going back eleven
    years. For the first year with a matching link, the archive is downloaded,
    clipped with ``../shapefiles/zh_poly.shp`` and converted to H3 resolution 9
    in ``./cbs2_<year>_h3.csv``. An existing ``./clipped.geojson`` is reused
    and the download/clip steps are skipped.
    """
    newest_year = datetime.datetime.now().year - 2
    base_url = 'https://www.cbs.nl/nl-nl/dossier/nederland-regionaal/geografische-data/kaart-van-100-meter-bij-100-meter-met-statistieken#:~:text=In%20deze%20kaart%20met%20vierkanten,en%20nabijheid%20van%20voorzieningen%20samengesteld.'
    # NOTE(review): the cached clip is not year-specific; delete it to force a
    # fresh download, otherwise older data may be written under a newer year.
    clipped_path = './clipped.geojson'
    urls = scrape_urls(base_url)
    for year in range(newest_year, newest_year - 11, -1):
        download_path = f'./cbs_{year}.zip'
        extract_path = f'./cbs_{year}'
        output_filepath = f'./cbs2_{year}_h3.csv'
        # Match the data-year token, not any occurrence of the year: a link
        # such as "2023-cbs_vk100_2022" also contains "2023".
        url = next((u for u in urls if f'cbs_vk100_{year}' in u), None)
        if not url:
            print(f"No URL available for {year}")
            continue
        if not os.path.exists(clipped_path):
            if not download_and_extract(url, year, download_path, extract_path):
                continue
            clip_gpkg(_locate_gpkg(extract_path, year), '../shapefiles/zh_poly.shp', clipped_path)
            if not os.path.exists(clipped_path):
                # clip_gpkg reports its own error; fall back to an older year.
                print(f"Clipping failed for {year}")
                continue
        process_geojson(clipped_path, 9, output_filepath)
        break

In [24]:
# Entry point: run the pipeline only when this code is executed as the
# top-level module, not when it is imported.
if __name__ == "__main__":
    main()