# Transform coordinates

In this notebook, coordinates of all federal states are transformed to EPSG 3035 (ETRS89-extended / LAEA Europe).

In [22]:
import os
import re

from pyproj import Transformer
import pandas as pd
import numpy as np
from tqdm import tqdm

Build a lookup dictionary which contains the source CRS, column names of X and Y coordinates, the name of the area column and the name of the ID column in the raw metadata table.

In [25]:
# Lookup table describing, per Bundesland (keyed by NUTS level-2 ID), where the
# location information lives in the raw metadata table.
from camelsp.util import _NUTS_LVL2_NAMES

nuts_ids = list(_NUTS_LVL2_NAMES.keys())

# Fields of every entry:
#   'epsg' : source CRS of the raw coordinates
#   'xy'   : column names of the coordinates, ordered [x, y]
#   'area' : column name holding the catchment area
#   'id'   : column name holding the station ID
_LOCATION_DICT = {
    'DE1': {
        'epsg': 'EPSG:25832',
        'xy': ['Ost (UTM ETRS89)', 'Nord (UTM ETRS89)'],
        'area': 'Einzugsgebiet in km²',
        'id': 'Messstellennummer',
    },
    'DE2': {
        'epsg': 'EPSG:25832',
        'xy': ['Ostwert', 'Nordwert'],
        'area': 'EZG km²',
        'id': 'Stationsnummer',
    },
    'DE4': {
        'epsg': 'EPSG:32633',
        'xy': ['Ost/RW', 'Nord/HW'],
        'area': 'CATCHMENT_SIZE',
        'id': 'Messstellennummer',
    },
    'DE7': {
        'epsg': 'EPSG:25832',
        'xy': ['Koordinaten X', 'Koordinaten Y'],
        'area': 'Größe des Einzugsge-biets [km²]',
        'id': 'Messstellen Nr.',
    },
    'DE8': {
        'epsg': 'EPSG:32633',
        'xy': ['rechtswert', 'hochwert'],
        'area': 'einzugsgebiet',
        'id': 'pegelkennzahl',
    },
    'DE9': {
        'epsg': 'EPSG:31467',
        'xy': ['RECHTS3', 'HOCH3'],
        'area': 'SHAPE_STAr',
        'id': 'MESSSTELLE_NR',
    },
    'DEA': {
        'epsg': 'EPSG:32632',
        'xy': ['KOORDX', 'KOORDY'],
        'area': 'Einzugsgebiet',
        'id': 'Stationsnummer',
    },
    'DEB': {
        'epsg': 'EPSG:31466',
        'xy': ['RW', 'HW'],
        'area': 'Aeo',
        'id': 'Nummer',
    },
    'DEC': {
        'epsg': 'EPSG:31466',
        'xy': ['RW', 'HW'],
        'area': 'EZG_Gr',
        'id': 'MSTNR',
    },
    'DED': {
        'epsg': 'EPSG:32633',
        'xy': ['OSTWERT', 'NORDWERT'],
        'area': 'AE',
        'id': 'Pegelkennziffer',
    },
    'DEE': {
        'epsg': 'EPSG:25832',
        'xy': ['Easting', 'Northing'],
        'area': 'AREA_KM2',
        'id': 'SANR',
    },
    'DEF': {
        'epsg': 'EPSG:4647',
        'xy': ['x', 'y'],
        'area': 'area',
        'id': 'id',
    },
    # NOTE(review): the column names mention "GK 4" (Gauss-Krueger zone 4) while
    # the declared CRS is EPSG:25832 (UTM 32N) - confirm against the raw data.
    'DEG': {
        'epsg': 'EPSG:25832',
        'xy': ['RW (GK 4)', 'HW (GK 4)'],
        'area': 'EZG',
        'id': 'Pegelnr',
    },
}


The following function transforms the coordinates of a given Bundesland to EPSG:3035, using the entry for that Bundesland in `_LOCATION_DICT`.

In [61]:
def transform_coords(nuts_id: str, location_dict: dict) -> pd.DataFrame:
    """
    Transforms coordinates from raw metadata to EPSG:3035 and saves them as CSV.

    The raw metadata of the Bundesland is read from
    ``../output_data/raw_metadata/{nuts_id}_raw_metadata.csv``; the cleaned result
    is written to ``../output_data/locations/{nuts_id}_Locations.csv``.

    Rows are dropped if the area is missing, not parseable or not strictly
    positive, if X or Y is NaN or infinite after the transformation, or if the
    ID already occurred in an earlier row.

    Arguments:
    --------
    nuts_id : str
        NUTS ID of Bundesland.
    location_dict : dict
        Dictionary containing information about the location data in the raw metadata 
        files of each Bundesland. Each entry needs the keys 'epsg', 'xy', 'area'
        and 'id'.

    Returns:
    --------
    df_location : pd.DataFrame
        DataFrame with the columns 'ID', 'Area', 'X' and 'Y' containing the
        transformed coordinates of the Bundesland, sorted by 'ID'.

    Raises:
    --------
    ValueError
        If nuts_id is not a key of location_dict.

    """
    # validate first, so that an invalid nuts_id has no side effects on disk
    if nuts_id not in location_dict:
        raise ValueError(f"[{nuts_id}]: nuts_id {nuts_id} not contained in given location_dict.")

    # create locations directory if it does not exist
    os.makedirs("../output_data/locations", exist_ok=True)

    # read raw metadata file for selected Bundesland
    raw_meta = pd.read_csv(f"../output_data/raw_metadata/{nuts_id}_raw_metadata.csv")

    # source CRS and column names for this Bundesland
    entry = location_dict[nuts_id]
    x_column, y_column = entry['xy'][:2]
    area_column = entry['area']
    id_column = entry['id']

    # transformer from source epsg to 3035; always_xy=True fixes the axis order to (x, y)
    transformer = Transformer.from_crs(entry['epsg'], "EPSG:3035", always_xy=True)

    # transform all coordinates in one vectorized call instead of row by row
    x_transformed, y_transformed = transformer.transform(
        raw_meta[x_column].to_numpy(dtype=float),
        raw_meta[y_column].to_numpy(dtype=float),
    )

    # build dataframe
    df_location = pd.DataFrame({
        'ID': raw_meta[id_column],
        'Area': raw_meta[area_column],
        'X': x_transformed,
        'Y': y_transformed,
    })

    def _parse_area(value):
        # keep the text before the first blank / 'km' (drops units such as ' km²')
        # and turn a decimal comma into a decimal point
        text = str(value)
        if text.strip() == 'nan':
            return np.nan
        return re.sub(',', '.', text.split(' ')[0].split('km')[0])

    # anything that still is not a number becomes NaN
    df_location['Area'] = pd.to_numeric(df_location['Area'].apply(_parse_area), errors='coerce')

    # keep only rows with a strictly positive area; this also removes NaN areas
    # and the -999 missing value used by Schleswig-Holstein
    df_location = df_location[df_location['Area'] > 0]

    # drop rows where X or Y is NaN or +/-inf (both columns are checked)
    finite_xy = np.isfinite(df_location[['X', 'Y']]).all(axis=1)
    df_location = df_location[finite_xy]

    # drop possible duplicates, the first occurrence of an ID is kept
    df_location = df_location.drop_duplicates(subset='ID')

    # sort by ID column
    df_location = df_location.sort_values(by='ID', ignore_index=True)

    # write locations dataframe to folder locations
    df_location.to_csv(f"../output_data/locations/{nuts_id}_Locations.csv", index=False)

    return df_location

In [62]:
# transform the coordinates of every Bundesland in the lookup dict;
# transform_coords writes one locations CSV per Bundesland
for nuts_id in tqdm(_LOCATION_DICT):
    transform_coords(nuts_id, _LOCATION_DICT)

100%|██████████| 13/13 [00:00<00:00, 20.81it/s]
