In [1]:
import os,sys
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import shapely
import pandas as pd
import numpy as np
import xarray as xr
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

sys.path.append('c://projects//osm-flex/src') 

from rasterstats import point_query

pd.options.mode.chained_assignment = None

In [2]:
# Root of the CEED data directory. Set the CEED_DATA_PATH environment variable
# to point the notebook at another location; the original hard-coded path
# remains the default so existing setups keep working.
data_path = os.environ.get('CEED_DATA_PATH', 'c://data//CEED')
input_data = os.path.join(data_path,'input_data')
# Directory holding the per-country '<code>_cis.parquet' files (sibling of the data root).
osm_path = os.path.join(data_path,'..','CIS_EU')

In [3]:
def raster_to_vector(xr_raster):
    """
    Turn the positive cells of a raster into square polygons.

    Args:
        xr_raster: xarray object whose ``to_dataframe()`` output has a
            ``band_data`` column and, once the index is reset, ``x``, ``y``,
            ``band`` and ``spatial_ref`` columns. ``zonal_stats`` passes the
            clipped result of ``xr.open_dataset(..., engine="rasterio")``.

    Returns:
        gpd.GeoDataFrame: One row per cell with ``band_data > 0``, holding the
        cell value and a square ``geometry`` centred on the cell coordinate.
        No CRS is assigned to the result.
    """
    # Half the spacing between the first two x coordinates. The same value is
    # used as half-width in both directions, i.e. cells are treated as squares.
    half_cell = (xr_raster.x[1].values - xr_raster.x[0].values) / 2

    # Flatten to one row per cell and keep only the strictly positive ones.
    table = xr_raster.to_dataframe()
    cells = table.loc[table.band_data > 0].reset_index()

    # One point per cell coordinate, grown into a square of the cell size.
    centres = shapely.points(cells.x.values, cells.y.values)
    squares = shapely.buffer(centres, distance=half_cell, cap_style='square')

    # Only the raster value and the geometry are kept.
    cells = cells.drop(['x', 'y', 'band', 'spatial_ref'], axis=1)
    cells['geometry'] = squares

    return gpd.GeoDataFrame(cells)

def zonal_stats(vector, raster_in):
    """
    Look up the raster value under the centroid of every geometry in ``vector``.

    Parameters:
    - vector (gpd.GeoDataFrame): Geometries to sample. Their coordinates are
      compared directly with the raster's x/y coordinates, so both must
      already be in the same CRS (nothing is reprojected here).
    - raster_in (str): Path to a raster file readable through the rasterio engine.

    Returns:
    - pandas.Series: Indexed like ``vector``. Holds the ``band_data`` value of
      the raster cell intersecting each centroid, or NaN when the centroid
      intersects no cell with a positive value (``raster_to_vector`` only
      keeps cells where ``band_data > 0``).
    """
    # Registers ``progress_apply`` on pandas objects; this is a kernel-wide
    # side effect that later cells may rely on.
    tqdm.pandas(desc='obtain values')

    min_x, min_y, max_x, max_y = vector.total_bounds

    # The context manager closes the file handle once the clipped cells have
    # been materialised into an in-memory GeoDataFrame.
    with xr.open_dataset(raster_in, engine="rasterio") as raster:
        # Restrict the raster to the bounding box of the vector data.
        raster_clip = raster.rio.clip_box(min_x, min_y, max_x, max_y)
        raster_vector = raster_to_vector(raster_clip)

    # STRtree.query returns positions into this geometry array, so the cell
    # values can be looked up by the same position.
    cell_values = raster_vector['band_data'].values
    tree = shapely.STRtree(raster_vector.geometry.values)

    def _value_at(point):
        """Value of the first cell intersecting ``point``, or NaN if none does."""
        hits = tree.query(point, predicate='intersects')
        if len(hits) == 0:
            # Previously an IndexError: centroid outside every positive cell.
            return np.nan
        return cell_values[hits[0]]

    return vector.centroid.progress_apply(_value_at)

In [4]:
# Country to process; substituted into the per-country parquet file names in
# the following cells. Presumably an ISO 3166-1 alpha-3 code — confirm.
country_code = 'HRV'

In [5]:
# Per-country '<code>_bucco.parquet' file. Only the file name is formatted, so
# braces elsewhere in the path cannot be mistaken for format fields.
bucco_file = os.path.join(input_data,'..','coastal_bucco_exact','{}_bucco.parquet'.format(country_code))
# CLC 2018 raster (the directory name indicates a 100 m grid).
CLC_path = os.path.join(input_data,'u2018_clc2018_v2020_20u1_raster100m','DATA','U2018_CLC2018_V2020_20u1.tif')
# Slope raster; the file name suggests EU-DEM in EPSG:3035 — confirm.
slope_path = os.path.join(input_data,'eudem_slop_3035_europe.tif')
coastal_CLC_path = os.path.join(input_data,'CZ_2018_DU004_3035_V010.parquet')

In [6]:
# Load the per-country '<code>_cis.parquet' file from the OSM directory.
country_osm = gpd.read_parquet(
    os.path.join(osm_path, '{}_cis.parquet'.format(country_code))
)

In [7]:
# Select the rows labelled 'road', keep the attributes used downstream and
# reproject to EPSG:3035 in one expression.
roads = (
    gpd.GeoDataFrame(
        country_osm.loc['road'][['geometry','highway','maxspeed','lanes','surface']]
    )
    .to_crs(3035)
)

In [8]:
%%time
# Value of the CLC raster at the centroid of every road geometry.
land_use = zonal_stats(roads,CLC_path)

obtain values: 100%|████████████████████████████████████████████████████████| 152059/152059 [00:05<00:00, 29105.71it/s]


CPU times: total: 4min 5s
Wall time: 4min 8s


In [None]:
%%time
# Value of the slope raster at the centroid of every road geometry.
slope = zonal_stats(roads,slope_path)

In [None]:
# Attach both sampled raster values as columns (aligned on the roads index).
roads = roads.assign(landuse=land_use, slope=slope)

In [None]:
# Scratch variable: geometry of the last road row. No later cell visible here
# reads this global; the `geom` inside `sinuosity` is that function's own
# parameter. NOTE(review): looks like leftover debugging — confirm before removing.
geom = roads.iloc[-1].geometry

In [None]:
def sinuosity(geom):
    """
    Ratio between the length of a line and the straight-line distance
    between its first and last point.

    Args:
        geom: Geometry object exposing ``geom_type``.

    Returns:
        - ``1`` for a ``MultiPolygon``;
        - for a ``LineString``: length divided by end-point distance. When the
          end points coincide (closed loop) the result is ``inf``, or NaN if
          the length is zero as well;
        - ``None`` for every other geometry type.
    """
    if geom.geom_type == 'MultiPolygon':
        return 1
    if geom.geom_type != 'LineString':
        # Made explicit: unsupported geometry types yield a missing value.
        return None

    path_length = shapely.length(geom)
    chord = shapely.distance(shapely.get_point(geom, 0), shapely.get_point(geom, -1))

    if chord == 0:
        # Closed or degenerate line: avoid dividing by zero while keeping the
        # values the plain division would have produced (inf, or NaN for 0/0).
        return float('inf') if path_length > 0 else float('nan')
    return path_length / chord

In [None]:
# Register `progress_apply` here as well, so this cell does not depend on
# `zonal_stats` having been called earlier in the session.
tqdm.pandas(desc='sinuosity')
roads['sinuosity'] = roads.geometry.progress_apply(sinuosity)

# Cap extreme values at the 99th percentile. Written as one `.loc` assignment
# on the frame itself: assigning through `roads.sinuosity.loc[...]` is chained
# assignment, which does not update `roads` under pandas Copy-on-Write.
sinuosity_cap = roads['sinuosity'].quantile(.99)
roads.loc[roads['sinuosity'] > sinuosity_cap, 'sinuosity'] = sinuosity_cap

In [None]:
# Keep only roads for which every attribute is known.
full_data = roads.dropna()

# Lump rare categories together: any value whose share of rows in its column
# is below the given threshold is replaced by the fallback label.
for column, min_share, fallback in (
    ('surface', 0.001, 'other'),
    ('highway', 0.01, 'other'),
    ('lanes', 0.01, 'other'),
    ('landuse', 0.01, 0),
):
    shares = full_data[column].value_counts(normalize=True)
    is_rare = full_data[column].map(shares.lt(min_share))
    full_data.loc[is_rare, column] = fallback

# Store land-use codes as objects so the model functions one-hot encode them
# (they select columns by `dtype == object`).
full_data['landuse'] = full_data['landuse'].astype('object')

In [None]:
def develop_predictor(data,y_col='maxspeed',x_cols=['land_use','highway','lanes','surface'],
                      test_size=0.66,random_state=None):
    """
    Fit a random-forest classifier that predicts ``y_col`` from ``x_cols``.

    Args:
        data (pd.DataFrame): Table containing the target and feature columns.
        y_col (str): Name of the target column.
        x_cols (list of str): Names of the feature columns. Note that the
            default lists 'land_use', while this notebook stores that feature
            as 'landuse'; pass ``x_cols`` explicitly.
        test_size (float): Share of rows held out for the accuracy check.
            The default (0.66) keeps the original behaviour, which trains on
            roughly one third of the rows.
        random_state (int or None): Seed for both the train/test split and
            the forest. ``None`` (default) keeps the original unseeded behaviour.

    Returns:
        sklearn.pipeline.Pipeline: Fitted pipeline (one-hot encoding of the
        object-typed columns, other columns passed through, then the forest).

    Side effects:
        Prints the accuracy on the held-out rows.
    """
    y = data[y_col]
    X = data[x_cols]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Only object-typed columns are treated as categorical.
    features_to_encode = X_train.columns[X_train.dtypes==object].tolist()

    # handle_unknown='ignore': a category that appears only in the held-out
    # rows (or at prediction time) is encoded as all zeros instead of raising.
    col_trans = make_column_transformer(
                        (OneHotEncoder(handle_unknown='ignore'),features_to_encode),
                        remainder = "passthrough"
                        )

    # Hyper-parameters are fixed. A RandomizedSearchCV over n_estimators,
    # max_features, max_depth, min_samples_split, min_samples_leaf and
    # bootstrap was sketched here earlier but never wired in.
    rf_classifier = RandomForestClassifier(
                      criterion='gini',
                      min_samples_leaf=25,
                      n_estimators=50,
                      bootstrap=True,
                      oob_score=True,
                      n_jobs=-1,
                      random_state=random_state)

    pipe = make_pipeline(col_trans, rf_classifier)
    pipe.fit(X_train, y_train)

    # Score once and format to one decimal, avoiding float artefacts such as
    # 43.300000000000004 from round(acc, 3) * 100.
    accuracy = accuracy_score(y_test, pipe.predict(X_test))
    print(f"The accuracy of the model is {accuracy * 100:.1f} %")

    return pipe

In [None]:
%%time
# Random forest predicting 'lanes' from land use, road class, sinuosity and slope.
pipe = develop_predictor(full_data,y_col='lanes',x_cols=['landuse','highway','sinuosity','slope'])

In [17]:
def develop_ann(data,y_col='maxspeed',x_cols=['land_use','highway','lanes','surface'],
                test_size=0.333,random_state=None):
    """
    Fit a multi-layer perceptron classifier that predicts ``y_col`` from ``x_cols``.

    Args:
        data (pd.DataFrame): Table containing the target and feature columns.
        y_col (str): Name of the target column.
        x_cols (list of str): Names of the feature columns. Note that the
            default lists 'land_use', while this notebook stores that feature
            as 'landuse'; pass ``x_cols`` explicitly.
        test_size (float): Share of rows held out for the accuracy check
            (default 0.333, the original value).
        random_state (int or None): Seed for the train/test split only; the
            network itself keeps its fixed ``random_state=1``. ``None``
            (default) keeps the original unseeded split.

    Returns:
        sklearn.pipeline.Pipeline: Fitted pipeline (one-hot encoding of the
        object-typed columns, other columns passed through, then the MLP).

    Side effects:
        Prints the accuracy on the held-out rows.
    """
    y = data[y_col]
    X = data[x_cols]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Only object-typed columns are treated as categorical.
    features_to_encode = X_train.columns[X_train.dtypes==object].tolist()

    # handle_unknown='ignore': a category that appears only in the held-out
    # rows (or at prediction time) is encoded as all zeros instead of raising.
    col_trans = make_column_transformer(
                        (OneHotEncoder(handle_unknown='ignore'),features_to_encode),
                        remainder = "passthrough"
                        )

    clf = MLPClassifier(solver='adam', alpha=1e-5,activation='relu',learning_rate_init=0.001,
                     hidden_layer_sizes=(50, 4), random_state=1,max_iter=1000)

    pipe = make_pipeline(col_trans, clf)
    pipe.fit(X_train, y_train)

    # Score once and format to one decimal, avoiding float artefacts such as
    # 43.300000000000004 from round(acc, 3) * 100.
    accuracy = accuracy_score(y_test, pipe.predict(X_test))
    print(f"The accuracy of the model is {accuracy * 100:.1f} %")

    return pipe

In [18]:
%%time
# MLP predicting 'maxspeed' from land use, road class and sinuosity.
pipe_nn = develop_ann(full_data,y_col='maxspeed',x_cols=['landuse','highway','sinuosity'])

The accuracy of the model is 43.3 %
CPU times: total: 11.9 s
Wall time: 12 s
