# Collect, filter and merge region data

In [None]:
import json
import os
import time

import cartopy.crs as ccrs
import geopandas as gpd
import numpy as np
import pandas as pd
import requests
import shapely

In [None]:
# Directory (relative to the working directory) that receives the downloaded
# NUTS region files
DOWNLOAD_DIR = "nuts"

# Create the directory if needed; an already existing one is not an error
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

## Collect NUTS files

In [None]:
def get_nuts_units_files(year):
    """Fetch the GISCO "units" listing of the NUTS release of a given year.

    Parameters
    ----------
    year
        Release year inserted into the listing URL (e.g. 2021 or 2024).

    Returns
    -------
    The decoded JSON document. The rest of this notebook uses it as a
    mapping from NUTS ID to the file names available for that unit.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond in time.
    """
    url = f"https://gisco-services.ec.europa.eu/distribution/v2/nuts/nuts-{year}-units.json"
    # A timeout keeps the notebook from hanging forever on a stalled connection
    response = requests.get(url, timeout=60)
    # Fail with a clear HTTP error instead of a confusing JSON decode error
    # when the server returns an error page
    response.raise_for_status()
    return response.json()

# Fetch the unit listings of both NUTS releases used below (2021 first)
nuts_2021_units, nuts_2024_units = (
    get_nuts_units_files(release_year) for release_year in (2021, 2024)
)

In [None]:
def get_level_ids(d):
    """Return the keys of `d` that are exactly four characters long.

    The notebook treats these keys as level 2 NUTS IDs. The order of the
    keys in `d` is preserved.
    """
    four_char_ids = []
    for key in d:
        if len(key) == 4:
            four_char_ids.append(key)
    return four_char_ids

def get_level_ids_uk(d):
    """Return the four-character keys of `d` that start with "UK".

    Same selection as for the general level IDs, restricted to the UK
    prefix. The order of the keys in `d` is preserved.
    """
    def is_uk_level_id(key):
        return len(key) == 4 and key.startswith("UK")

    return list(filter(is_uk_level_id, d))

# Combine 2024 NUTS regions, fill in UK based on 2021 regions.
# Collecting into a set removes IDs that appear in both selections.
ids_2024 = get_level_ids(nuts_2024_units)
ids_uk_2021 = get_level_ids_uk(nuts_2021_units)
level2_ids = set(ids_2024).union(ids_uk_2021)

In [None]:
def select_file(files):
    """Pick the first entry of `files` containing "region-10m-4326-".

    Raises
    ------
    ValueError
        If no entry matches; the exception carries `files` as its argument.
    """
    no_match = object()
    chosen = next((name for name in files if "region-10m-4326-" in name), no_match)
    if chosen is no_match:
        raise ValueError(files)
    return chosen


# Download the region file of every selected level 2 unit into DOWNLOAD_DIR
for nuts_id in sorted(level2_ids):
    # Prefer 2024 records, fall back to 2021 if not available
    if nuts_id in nuts_2024_units:
        files = nuts_2024_units[nuts_id]
    elif nuts_id in nuts_2021_units:
        files = nuts_2021_units[nuts_id]
    else:
        raise KeyError(nuts_id)

    file = select_file(files)
    url = "https://gisco-services.ec.europa.eu/distribution/v2/nuts/distribution/" + file

    # Fetch and validate before opening the target file, so that a failed
    # request neither leaves an empty file behind nor stores an HTTP error
    # page under a .geojson name (which would break loading later on)
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    with open(os.path.join(DOWNLOAD_DIR, file), "wb") as f:
        f.write(response.content)

    # Don't spam the server too much
    time.sleep(0.02)

## Load and merge data

In [None]:
# Paths of all GeoJSON files found in the download directory
files = [
    os.path.join(DOWNLOAD_DIR, name)
    for name in os.listdir(DOWNLOAD_DIR)
    if name.endswith(".geojson")
]

# Read every file into its own frame, stack the frames and order the rows by
# NUTS ID. The row labels coming from the individual files are kept as they are.
region_frames = [gpd.read_file(path) for path in files]
df = pd.concat(region_frames).sort_values("NUTS_ID")

## Only keep regions covered by the EURO-CORDEX domain

In [None]:
# Plain longitude/latitude coordinates
crs_latlon = ccrs.PlateCarree()

# Usual EURO-CORDEX projection: a rotated-pole system
crs_cordex = ccrs.RotatedPole(
    pole_longitude=-162,
    pole_latitude=39.25,
)

In [None]:
# Corner points of the domain in longitude/latitude, see
# https://cordex.org/domains/cordex-region-euro-cordex/
corner_lons = np.asarray([315.86, 64.4, 36.30, 350.01])
corner_lats = np.asarray([60.21, 66.65, 25.36, 22.20])

# Express the corners in the rotated EURO-CORDEX coordinate system
cordex_corners_rot = crs_cordex.transform_points(crs_latlon, corner_lons, corner_lats)

# Build the domain outline from the first two columns (x, y) of the
# transformed points, then increase the polygon resolution for accurate
# reprojection of edges
cordex_domain_rot = shapely.segmentize(
    shapely.Polygon(cordex_corners_rot[:, :2]),
    max_segment_length=2.,
)

Output: EURO-CORDEX domain

In [None]:
# Wrap the rotated-pole polygon in a one-row GeoDataFrame and reproject it to
# the CRS of the region data
domain_rot_df = gpd.GeoDataFrame(None, geometry=[cordex_domain_rot], crs=crs_cordex)
cordex_domain_df = domain_rot_df.to_crs(df.crs)

# Write the domain outline to disk with coordinates snapped to a 0.0001 grid
domain_rounded = cordex_domain_df.set_precision(0.0001)
domain_rounded.to_file("../data/eurocordex.geojson")

In [None]:
# Test every region against the single domain polygon itself. Passing the
# one-row GeoDataFrame instead makes geopandas match both sides by index label
# (with a warning), so the outcome would depend on the row labels of df rather
# than on the geometries alone.
cordex_domain_geom = cordex_domain_df.geometry.iloc[0]
df_cordex_only = df.loc[df.within(cordex_domain_geom)]

## Manual edits

In [None]:
# Continue with a copy so that the edits below leave df_cordex_only untouched
df = df_cordex_only.copy()

### Remove regions

In [None]:
# No temperature data for Malta in E-OBS (removal currently disabled)
# df = df.drop(["MT00"], axis=0)

### Add regions

In [None]:
# Country outlines from GISCO, reprojected to the CRS of the region data
countries_url = "https://gisco-services.ec.europa.eu/distribution/v2/countries/geojson/CNTR_RG_10M_2024_4326.geojson"
countries_raw = gpd.read_file(countries_url)
countries_df = countries_raw.to_crs(df.crs)

# Country codes that already occur among the selected NUTS regions
codes_in_regions = df["CNTR_CODE"].unique()
nuts_country_codes = set(codes_in_regions)

def select_additional_countries(row):
    """Tell whether a country row is a candidate for being added manually.

    A row qualifies when its geometry lies properly inside the EURO-CORDEX
    domain and its "CNTR_ID" is not among the country codes already present
    in the NUTS regions.
    """
    # The domain frame holds a single geometry, so take the first result
    inside_domain = cordex_domain_df.geometry.contains_properly(row.geometry).values[0]
    already_covered = row["CNTR_ID"] in nuts_country_codes
    if not inside_domain:
        return inside_domain
    return not already_covered

# Row-wise evaluation: one boolean per country, True for countries inside the
# domain that have no NUTS regions in df
mask = countries_df.apply(select_additional_countries, axis=1)

In [None]:
# Add Moldova
# Look the country up by its code rather than by a hard-coded row label, which
# depends on the row order of the downloaded countries file
moldova_rows = countries_df.loc[countries_df["CNTR_ID"] == "MD"]
if len(moldova_rows) != 1:
    raise LookupError(
        f"Expected exactly one country with CNTR_ID 'MD', found {len(moldova_rows)}"
    )
moldova = moldova_rows.iloc[0]

# Single-row frame with the same columns as the NUTS region data; attributes
# that only exist for NUTS regions are left empty
df_extra = gpd.GeoDataFrame.from_dict({
    'COAST_TYPE': [None],
    'MOUNT_TYPE': [None],
    'NAME_LATN': [moldova["CNTR_NAME"]],
    'CNTR_CODE': [moldova["CNTR_ID"]],
    'NUTS_ID': [moldova["CNTR_ID"]],
    'NUTS_NAME': [moldova["CNTR_NAME"]],
    'LEVL_CODE': [0],
    'URBN_TYPE': [None],
    "geometry": [moldova.geometry]
}).set_crs(countries_df.crs)

# Append the extra row and restore the ordering by NUTS ID
df = pd.concat([df, df_extra]).sort_values("NUTS_ID")

## Output

In [None]:
# Drop the attributes that are not written out, then use the NUTS ID as the
# index under the name "id"
unused_columns = ["COAST_TYPE", "MOUNT_TYPE", "URBN_TYPE", "CNTR_CODE", "LEVL_CODE"]
df = (
    df.drop(columns=unused_columns)
    .rename(columns={"NUTS_ID": "id"})
    .set_index("id")
)

# Snap the coordinates to a 0.0001 grid
rounded_geometry = df.set_precision(0.0001)
df.geometry = rounded_geometry

In [None]:
# Serialize the regions through their GeoJSON-like mapping
with open("../data/regions.geojson", "w") as out_file:
    feature_collection = df.__geo_interface__
    json.dump(feature_collection, out_file)