In [1]:
import os,sys
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import shapely
import pandas as pd
import numpy as np
import xarray as xr
import dask_geopandas
from tqdm import tqdm
sys.path.append('c://projects//osm-flex/src') 

from rasterstats import point_query

In [2]:
data_path = 'c://data//CEED'
input_data = os.path.join(data_path,'input_data')
bucco_path = os.path.join(data_path,'coastal_bucco_exact')
osm_path = os.path.join(data_path,'coastal_osm_exact')

In [3]:
def raster_to_vector(xr_raster):
    """
    Convert a raster to a vector representation.

    Args:
        xr_raster (xarray.DataArray): Input raster data as xarray.DataArray.

    Returns:
        gpd.GeoDataFrame: Vector representation of the input raster.
    """

    # Convert xarray raster to pandas DataFrame
    df = xr_raster.to_dataframe()

    # Filter DataFrame to select rows where band_data > 0
    df_1 = df.loc[df.band_data > 0].reset_index()

    # Create a Shapely Point geometry column from x and y values
    df_1['geometry'] = shapely.points(df_1.x.values, df_1.y.values)

    # Remove unnecessary columns from the DataFrame
    df_1 = df_1.drop(['x', 'y', 'band', 'spatial_ref'], axis=1)

    # Calculate the resolution of the raster
    resolution = xr_raster.x[1].values - xr_raster.x[0].values

    # Buffer the Point geometries by half of the resolution with square caps
    df_1.geometry = shapely.buffer(df_1.geometry, distance=resolution/2, cap_style='square').values

    # Convert the DataFrame to a GeoDataFrame
    return gpd.GeoDataFrame(df_1)      

def zonal_stats(vector_in, raster_in):
    """
    Calculate zonal statistics of a raster dataset based on a vector dataset.
    
    Parameters:
    - vector_in (str): Path to the vector dataset file (in Parquet format).
    - raster_in (str): Path to the raster dataset file (in NetCDF format).
    
    Returns:
    - pandas.Series: A series containing the zonal statistics values corresponding to each centroid point in the vector dataset.
    """
    
    # Read the vector dataset from the given path
    vector = gpd.read_parquet(vector_in)
    
    # Open the raster dataset using the xarray library
    raster = xr.open_dataset(raster_in, engine="rasterio")
    
    # Progress bar setup for obtaining values
    tqdm.pandas(desc='obtain values')
    
    # Clip the raster dataset to the bounding box of the vector dataset
    raster_clip = raster.rio.clip_box(vector.total_bounds[0], vector.total_bounds[1], vector.total_bounds[2], vector.total_bounds[3])
    
    # Convert the clipped raster dataset to a vector representation
    raster_vector = raster_to_vector(raster_clip)
    
    # Create a dictionary mapping each index to its corresponding band data value
    band_data_dict = dict(zip(list(raster_vector.index), raster_vector['band_data'].values))
    
    # Construct an STRtree from the vector geometry values
    tree = shapely.STRtree(raster_vector.geometry.values)
    
    # Apply a function to calculate zonal statistics for each centroid point in the vector dataset
    return vector.centroid.progress_apply(lambda x: band_data_dict[tree.query(x, predicate='intersects')[0]])


def vector_point_query(x, coastal_CLC_tree, band_data_dict):
    """
    Perform a point query on a vector dataset based on specific conditions.

    Parameters:
    - x (GeoDataFrame): A GeoDataFrame representing a single feature.
    - coastal_CLC_tree (shapely.STRtree): STRtree object constructed from the coastal CLC vector geometry values.
    - band_data_dict (dict): A dictionary mapping indices to their corresponding band data values.

    Returns:
    - int: The band data value corresponding to the point query, or -9999 if the conditions are not met.
    """

    if x.land_use == 5:
        try:
            # Perform an intersection query using the centroid of the feature
            match = coastal_CLC_tree.query(x.centroid, predicate='intersects')
            return band_data_dict[match[0]]
        except:
            # Return -9999 if no intersection is found
            return -9999
    else:
        # Return -9999 if the land use condition is not met
        return -9999
    
def final_land_use(x):
    """
    Determine the final land use based on the coastal land use and land use values of a feature.

    Parameters:
    - x (pandas.Series): A pandas Series representing a single feature.

    Returns:
    - int: The final land use value, which is either the coastal land use or the land use value of the feature.
    """

    if x.coastal_land_use == -9999:
        return x.land_use
    else:
        return x.coastal_land_use  

In [4]:
country_code = 'FRA'

In [5]:
bucco_file = os.path.join(input_data,'..','coastal_bucco_exact','{}_bucco.parquet').format(country_code)
CLC_path = os.path.join(input_data,'u2018_clc2018_v2020_20u1_raster100m','DATA','U2018_CLC2018_V2020_20u1.tif')
coastal_CLC_path = os.path.join(input_data,'CZ_2018_DU004_3035_V010.parquet')

### Read Data

In [6]:
coastal_bucco = gpd.read_parquet(bucco_file)    
coastal_CLC = gpd.read_parquet(coastal_CLC_path)

### Find land cover information from CLC full layer

In [7]:
%%time
coastal_bucco['land_use'] = zonal_stats(bucco_file,CLC_path)

obtain values: 100%|██████████████████████████████████████████████████████| 8038809/8038809 [03:33<00:00, 37701.26it/s]


### Find additional land cover information from CLC coastal zones 

In [8]:
coastal_CLC_tree = shapely.STRtree(coastal_CLC.geometry.values)
band_data_dict = dict(zip(list(coastal_CLC.index), coastal_CLC['CODE_4_18'].values))

In [9]:
coastal_bucco['centroid'] = coastal_bucco.centroid

tqdm.pandas(desc='obtain port values')

coastal_land_use = coastal_bucco.progress_apply(lambda x: vector_point_query(x,coastal_CLC_tree,band_data_dict),axis=1)

obtain port values: 100%|██████████████████████████████████████████████████| 8038809/8038809 [15:10<00:00, 8829.70it/s]


In [10]:
coastal_bucco['coastal_land_use'] = coastal_land_use

### Merge into a single column

In [11]:
%%time
coastal_bucco['use_type'] = coastal_bucco.progress_apply(lambda x: final_land_use(x),axis=1)

obtain port values: 100%|█████████████████████████████████████████████████| 8038809/8038809 [02:37<00:00, 50945.84it/s]

CPU times: total: 2min 37s
Wall time: 2min 37s





In [12]:
country_code = 'FRA'
df = gpd.read_parquet(os.path.join(data_path,'..','CIS_EU','{}_cis.parquet').format(country_code))

In [13]:
coastal_bucco.head()

Unnamed: 0,id,height,age,type,id_source,type_source,geometry,land_use,centroid,coastal_land_use,use_type
0,v0.1-FRA.3.1.1.2_1-3121,2.0,,,BATIMENT0000000288524932,Indifférencié,"POLYGON ((3432113.151 2875938.944, 3432116.966...",20.0,POINT (3432115.175 2875937.581),-9999,20.0
1,v0.1-FRA.3.1.1.2_1-3135,4.5,1993.0,residential,BATIMENT0000000288524916,Résidentiel,"POLYGON ((3431783.459 2875481.094, 3431790.522...",20.0,POINT (3431779.426 2875469.016),-9999,20.0
2,v0.1-FRA.3.1.1.2_1-3136,5.3,1940.0,residential,BATIMENT0000000288524919,Résidentiel,"POLYGON ((3432037.123 2875440.726, 3432032.999...",12.0,POINT (3432024.373 2875443.073),-9999,12.0
3,v0.1-FRA.3.1.1.2_1-3137,5.2,,,BATIMENT0000000288524917,Indifférencié,"POLYGON ((3432095.501 2875941.111, 3432094.483...",12.0,POINT (3432084.106 2875939.529),-9999,12.0
4,v0.1-FRA.3.1.1.2_1-3138,4.4,,,BATIMENT0000000288524918,Indifférencié,"POLYGON ((3431991.551 2875421.613, 3432002.326...",12.0,POINT (3431999.556 2875429.695),-9999,12.0
