In [1]:
import sys
sys.path.append('..')

import utils.data_structures as ds
import utils.helpers as hp
import utils.readers as rs

import os
import geopandas as gpd
import pandas as pd
import numpy as np
import fiona.crs as fcrs
from shapely.strtree import STRtree
import rtree

import scipy.spatial as spatial
import scipy.ndimage as ndimage

import bokeh
from bokeh.models.renderers import GlyphRenderer
from bokeh.models import Range1d, LinearAxis

import geoviews as gv
gv.extension('bokeh')

import holoviews as hv
hv.notebook_extension('bokeh')

import hvplot.xarray
import hvplot.pandas

from bokeh.models import HoverTool
from holoviews.operation.datashader import regrid, datashade, rasterize
from holoviews.operation import histogram, decimate
from holoviews import opts
import datashader.transfer_functions as tf

  PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)


In [2]:
data = '/Volumes/HADDOCK 460GB/swiss_project/data/'
query_dir = '/Volumes/HADDOCK 460GB/swiss_project/query_dir'
query_dir_name = 'sa3d_WGS84_10m'

In [3]:
size = dict(width=800, height=600)
plot_opts = {'width':800, 'height':800}
shade_defaults = dict(x_sampling=1, y_sampling=1, width=1200, height=682, cmap='white')
height_diff_range = (-30, 30)

## Load Data from Query

In [4]:
saved_query_dir = os.path.join(query_dir, query_dir_name)

sm = ds.SWISSMap(load_dir=saved_query_dir, calc_dir=query_dir, mission=2, prod_nr=6)
qmap, snow_data, ice_data, slf_data, bbox = sm.load()

Using  /Volumes/HADDOCK 460GB/swiss_project/query_dir/sa3d_WGS84_10m as query_dir...
Query ICESat data ...


  0%|          | 1/319 [00:00<00:38,  8.36it/s]

Query snow cover data ...


100%|██████████| 319/319 [00:10<00:00, 30.85it/s]
100%|██████████| 3/3 [00:00<00:00, 17.67it/s]

Query background rasters ...





Query SLF data ...


In [5]:
ice_data = gpd.GeoDataFrame(ice_data.loc[~np.isnan(ice_data['dem']) & ~np.isnan(ice_data['snow_cover'])], 
                            crs=fcrs.from_epsg(4326))
ice_data = ice_data.query(str(height_diff_range[0]) + '< height_diff < ' + str(height_diff_range[1]))
ice_data['time'] = ice_data['time'].astype(np.datetime64)

In [6]:
ice_data.loc[:, 'height_diff'] = ice_data.loc[:, 'height'] - ice_data.loc[:, 'dem']

### Show points

In [7]:
size = dict(width=800, height=600)
opts.defaults(opts.Image(invert_axes=True, **size))
    
shade_defaults = dict(x_sampling=1, y_sampling=1, width=1200, height=682, cmap='white')
hover = HoverTool(tooltips=[('snow_cover', "@snow_cover")])#, formatters={'t_iso': 'datetime'})
pts = gv.Points(ice_data[['x', 'y', 'time']]).opts(size=3)
#slf_station_pts = gv.Points(sm.slf_reader.slf_stations[['x', 'y', 'time']]).opts(size=3)
#snow = gv.QuadMesh(snow_data, kdims=['x', 'y'], ).opts(cmap='viridis', colorbar=True)


In [8]:
plot_opts = {'width':600, 'height':600}
plot = gv.tile_sources.ESRI * decimate(pts) #* slf_station_pts
plot.opts(**plot_opts)

## Statistics of Mean Height

Multitemporal data subsets are given by ground tracks at different dates. The measurement points are considerably shifted. Denoting by $d(x, t)$ ICESat measurements, by $h(x)$ the DEM data, by $l(x, t)$ true height change above the DEM and by $\sigma(x, t)$ an error term, such that $ d(x, t) = h(x) + l(x, t) + \sigma(x, t)$, we use the following conceptual model

$$\begin{align*} 
    s(x, \tilde{t}, t)
    &\approx l(x, \tilde{t}) - l(x, t) \\
    &\approx l(\tilde{x}, \tilde{t}) - l(x, t) \\
    &= d(\tilde{x}, \tilde{t}) - d(x, t) - \Delta\sigma - \Delta h,
\end{align*}$$

where the first approximation is due to the assumption that any height change is caused by snow, and the second approximation follows from the assumption that we have reasonable snow height consistency within a region $A$ spanning $x$ and $\tilde{x}$.

In this setting, we can determine $s(x)$ easily, if we can show that $\Delta\sigma \approx 0$. In the following, we want to look at the mean height difference under a window function along ground tracks

$$\begin{align*}
    \langle d(\cdot, t) - h(\cdot)\rangle_w 
    &= \langle l(\cdot, t) + \sigma(\cdot, t)\rangle_w \\
    &\approx \langle \sigma(\cdot, t') \rangle_w
\end{align*}$$

where the last equality is the main idea: we want to determine $\sigma$ in regions and times with $l(x, t') \approx 0$ and making the assumption that the change in $\langle \sigma(\cdot, t') \rangle_w$ over time $t' \to t$ is not significant. The error, that neglecting $\Delta \sigma$ will introduce, is bounded if

$$
\langle \sigma(\cdot, t') \rangle_w < c_w = \mathrm{const.}
$$

Furthermore, for small $c_w$ (i.e. small distances/window_sizes), we can approximate $\Delta \sigma \approx \sigma(\tilde{x}, \tilde{t}) -  \langle \sigma(\cdot, t') \rangle_w \approx \langle \sigma(\cdot, t') \rangle_w  - \langle \sigma(\cdot, t') \rangle_w = \mathcal{O}(c_w)$, where the first approximate equality is given if $\sigma(x, t') \approx c_w$ and the second approximate equality follows from the assumption of spatial isotropy in a region $B$, such that $A \subset B$. 

In order to measure the closeness to constance $\langle \sigma(\cdot, t') \rangle_w  \to c_w$, the standard deviation can be used. With

$$
d(v, w) = \mathrm{std}_w\left[d(\cdot, t) - h(\cdot) \right] \approx \mathrm{std}_w\left[\sigma(\cdot, t) \right],
$$

i.e. the std as a local filter over window size $w$, we can introspect the assumption of constance of the mean height difference over window size $w$. Other approaches might work better. That was just my first idea. Maybe Autonomous RLS is the better choice? From this, we can derive a growth factor $r(w)$ which measures the speed with which the assumption of local constance is decays with increasing window sizes. 

The snow height consistency region $A$ could be perceived as lower bounds on the size of $B$ that has to be assumed. $r(w)$ determines the trustworthiness that can be placed in the size of A. Thus, larger growth rates and large distances between $x$ and $\tilde{x}$ make assumption $B$ less trustworthy.

In a further attempt, we could try put a generative model on $s(x)$ by defining a distribution over $\sigma(x)$. Let's see first how $\mathrm{std}(\langle \sigma(\cdot, t) \rangle_w)$ behaves, thouhgh, ...

#### Definitions (in distance)

In [1]:
def ll2masked_array(ll):
    lens = [len(l) for l in ll] 
    maxlen = max(lens)
    arr = np.zeros((len(ll), maxlen), dtype=type(ll[0][0]))
    mask = np.arange(maxlen) < np.array(lens)[:, None]
    arr[mask] = np.concatenate(ll)
    return np.ma.array(arr, mask=~mask)

In [2]:
def get_neighbours_in_radius(tree, pts, radius):

    # find neigbours in window_size distance
    windowed_neighbours_idx = tree.query_ball_point(pts, r=radius)
            
    # return masked array
    return ll2masked_array(windowed_neighbours_idx)

In [129]:
def create_passes(df, rgt=True, ground_track_id=True, cycle_number=True):
    """
    Identify different passes by conditioning on rgt, ground_track_id and cycle_number. If one of those
    is False, do not take it into account for creating an individual pass.
    """
    df = df.copy()
    
    # create data frame for each (rgt, cycle_number, ground_track_id) triple
    conditionals = []
    if rgt:
        conditionals.append('rgt')
    
    if cycle_number:
        conditionals.append('cycle_number')
        
    if ground_track_id:
        ground_track_id_map = {gt_id: i for i, gt_id in enumerate(np.unique(df[['ground_track_id']]))}
        df['ground_track_id'] = df['ground_track_id'].map(ground_track_id_map)
        conditionals.append('ground_track_id')
        
    passes = np.unique(df[conditionals].values.astype(np.int), axis=0)
    
    # binarize data frame into different passes
    dfs_per_pass = [] 
    for conditionals in passes:
        if len(conditionals) == 1:
            rgt = conditionals
            query_str = 'rgt == ' + str(rgt)
            
        elif len(conditionals) == 2:
            rgt, cycle_number = conditionals
            query_str = 'rgt == ' + str(rgt) + \
                        ' & cycle_number == ' + str(cycle_number)
                
        elif len(conditionals) == 3:
            rgt, cycle_number, gt_id = conditionals
            query_str = 'rgt == ' + str(rgt) + \
                        ' & cycle_number == ' + str(cycle_number) + \
                        ' & ground_track_id == ' + str(gt_id)
                
        dfs_per_pass.append(df.query(query_str).copy())
    
    # also, sasve pass id in a column
    df.loc[:, 'passes'] = np.array([[pass_idx for pass_idx, df_for_pass in zip(range(len(passes)), dfs_per_pass) 
                            if idx in df_for_pass.index][0] for idx in df.index])
    
    for pass_idx, df_for_pass in zip(range(len(passes)), dfs_per_pass):
        df_for_pass.loc[:, 'passes'] = pass_idx 
    
    return df, dfs_per_pass, passes

In [130]:
def running_mean_series_ball(window_sizes, dframes, epsg=3857, lc=None, qty='height_diff', eps=0.5, min_samples=5,
                            add_cond=''):
    """
    Calculate the running means of qty over window sizes (given in the unit of epsg) in all pd.DataFrames in dframes.
    If lc is supplied, points with land_cover in lc are queried and clustered. Querying and clustering is independent
    for each data frame in dframes.
    eps determines the looseness of clusters (look up DBSCAN). However, this parameter shouldn't really matter 
    in the present case as clusters are well separated. Clusters with nr_samples > min_samples are made individual 
    pd.DataFrames and the running means computation is run on each separately. 
    
    Parameters:
    -----------
    window_sizes (array): 
    dframes (list):
    epsg: determines unit in eps
    lc: land cover classes, if None, dframes are considered clusters
    eps:
    min_samples:
    add_cond (str) : query not only for land_cover types but also for a query string
    
    Returns:
    --------
    running_means (list of list of np.ma.arrays): each masked array in the list containes 
                                                  the running means for each cluster for each window_size
    running_std (list of np.ma.arrays):
    info (dict of dicts): indices of neighbours for each point for each window_size
    dfs (list of pd.DataFrame): list of data frames for each cluster, if None, dframes is returned
    """
    running_means = []
    running_std = []
    trees = {}
    info = {}
    dfs = []
    
    # condition on land_cover classes
    if lc is not None:
        query_str = ' | '.join(['land_cover == ' + str(l) for l in lc])
        for i, df in enumerate(dframes):
            # condition on land_covers
            df = df.query(query_str)
            if add_cond != '':
                df = df.query(add_cond)
            df = df.copy()
            if df.empty:
                continue
            
            # cluster points
            kms_per_radian = 6371.0088
            epsilon = eps / kms_per_radian
            labels, core_idxs = hp.cluster_points(df, epsilon=epsilon, min_samples=min_samples, 
                                                  algorithm='ball_tree', metric='haversine') 
            df.loc[:, 'label'] = labels
            
            # drop if cluster is too small
            clusters, counts = np.unique(labels, return_counts=True)
            too_small_clusters = clusters[counts < min_samples]
            df = df.drop(index=df.index[np.where(np.isin(df['label'], too_small_clusters))[0]])
            
            # add each cluster as separate pass
            if not df.empty:
                dfs += [v for k, v in df.groupby('label')]
    else:
        dfs = [df.copy() for df in dframes]
    
    # construct trees
    for i, df in enumerate(dfs):
        df = hp.to_crs(df, epsg=epsg, inplace=True)        
        pts = df[['x', 'y']].values
        trees[i] = spatial.cKDTree(pts)
    
    # iterate over window_sizes
    for window_size in window_sizes:
        info[window_size] = {}
        running_mean_ws = []
        running_std_ws = []
        for i, df in enumerate(dfs):
            windowed_neighbours_idx = get_neighbours_in_radius(trees[i], 
                                                               df[['x', 'y']].values.copy(), 
                                                               window_size)            
            #log neighbours
            info[window_size][i] = windowed_neighbours_idx
            
            # calculate mean / std height_diff
            windowed_height_diff = np.ma.array(df[qty].values[windowed_neighbours_idx], 
                                               mask=windowed_neighbours_idx.mask)
            running_mean_ws.append(np.ma.mean(windowed_height_diff, axis=1))     
            running_std_ws.append(np.ma.std(windowed_height_diff, axis=1))
            
        running_means.append(np.array(running_mean_ws))
        running_std.append(np.array(running_std_ws))

    return running_means, running_std, info, dfs

In [131]:
# take only long enough data frames, there is no point in using larger windows than the data frames length
min_points = 500

dframe, dfs_per_pass, passes = create_passes(ice_data)

idxs = [len(df) > min_points for df in dfs_per_pass]
dfs_per_pass = [df for i, df in enumerate(dfs_per_pass) if idxs[i]]
passes = passes[idxs]

In [132]:
# make sure the triples are ordered in time
for df in dfs_per_pass:
    df.sort_values(by='time', inplace=True)

### Compute $\Big\langle\mathrm{std}_w\left[ d(\cdot, t) - h(\cdot)\right] \Big\rangle$

#### Select land cover classes on which to condition on

In [133]:
lc = None # list [17]

#### Calculate heat map for window sizes in [m]

In [134]:
window_sizes = range(30, 1000, 30)
running_means, running_std, info, dfs = running_mean_series_ball(window_sizes, dfs_per_pass, lc=lc)

In [135]:
heat_map = np.zeros((len(window_sizes), len(dfs)))
for ij, rs in np.ndenumerate(running_std):
    heat_map[ij] = np.mean(rs)

In [136]:
dataset = hv.Dataset((range(len(dfs)), 
                      window_sizes,
                      heat_map),
                     ['pass', 'window_sizes [m]'], 'std_height_diff_w')

### Plot $\Big\langle\mathrm{std}_w\left[d(\cdot, t) - h(\cdot)\right] \Big\rangle$

#### in [m]

The plot shows the mean standard deviation over a window size of $v$ (in [pts]) when smoothing the height difference over a window size $w$ (in [m]) for all passes and different window sizes $w$. Clearly, $r(w)$ is quite different for different passes. Up to now, we didn't condition on anything, that might explain this different behaviour.

In [137]:
dataset.to(hv.Image).opts(title='ball', colorbar=True, **plot_opts)

### Show passes explicitly

In [213]:
# Set id of pass and the window size (index wrt window_sizes)
df_id = 34 #pass id
window_size_idx = 30

# get rgt, gt_id, cycle_nr
passes[df_id]

array([389,   2,   2])

In [215]:
def apply_formatter(plot, element):
    p = plot.state
    
    # create secondary range and axis
    p.extra_y_ranges = {"twiny": Range1d(start=-10, end=10)}
    p.add_layout(LinearAxis(y_range_name="twiny"), 'right')

    # set glyph y_range_name to the one we've just created
    glyph = p.select(dict(type=GlyphRenderer))[0]
    glyph.y_range_name = 'twiny'

In [216]:
hover = HoverTool(tooltips=[('rgt', "@rgt"), ('land_cover', '@land_cover'), ('index', '@index')])

In [220]:
df = hp.to_crs(dfs_per_pass[df_id], epsg=4326)
df['index'] = df.index
df['running_mean'] = running_means[window_size_idx][df_id]

red_df = df[['x', 'y', 'rgt', 'land_cover', 'index', 'dem', 'running_mean']].copy()
red_df.loc[:, 'running_mean'] = running_means[window_size_idx][df_id]

layout =  red_df.hvplot('y', 'dem') * \
          red_df.hvplot('y', 'running_mean').opts(hooks=[apply_formatter])
layout.opts(opts.Curve(height=500, width=500))

#red_df = red_df.rename(columns={'x': 'Longitude', 'y':'Latitude'})
pts = decimate(gv.Points(red_df, kdims=['x', 'y']).opts(size=3))
plot = gv.tile_sources.ESRI * pts.opts(color='rgt', tools=[hover])

plot.opts(height=500, width=300) + layout

## Land Cover Sensitivity

In [221]:
land_covers = [6]
min_samples = 50

In [222]:
window_sizes = np.arange(50, 1000, 50)

In [223]:
running_means, running_std, info, dfs = running_mean_series_ball(window_sizes, dfs_per_pass, #[ice_data.query('passes == ' + str(pass_id))]
                                                                 lc=land_covers, qty='height_diff',
                                                                 min_samples=min_samples, add_cond='snow_cover < 50')

In [83]:
heat_map = np.zeros((len(window_sizes), len(dfs)))
for ij, rs in np.ndenumerate(running_std):
    heat_map[ij] = np.mean(rs)

In [84]:
dataset_lc = hv.Dataset((range(len(dfs)), 
                           window_sizes,
                           heat_map),
                          ['cluster', 'window_sizes [m]'], 'std_height_diff_w')

In [85]:
dataset_lc.to(hv.Image).opts(title='land cover', colorbar=True, **plot_opts) 

In [86]:
keys = ['x', 'y', 'rgt', 'land_cover', 'snow_cover', 'height_diff', 'geometry', 'label', 'passes']

lc_pts = pd.concat(dfs)[keys]
lc_pts = hp.to_crs(lc_pts, epsg=4326)
lc_pts.loc[:, 'overall_label'] = lc_pts.loc[:, 'passes'].map(str) + '_' + lc_pts.loc[:, 'label'].map(str)

hover = HoverTool(tooltips=[('rgt', "@rgt"), ('height_diff', "@height_diff"), 
                            ('land_cover', '@land_cover'), ('snow_cover', '@snow_cover'),
                            ('overall_label', '@overall_label')])

dset_pts = hv.Dataset(lc_pts.values, keys + ['overall_label'])
pts = gv.Points(dset_pts, kdims=['x', 'y'])
plot = gv.tile_sources.ESRI * pts.opts(size=3)

plot.opts(height=500, width=700)

### Compare Land Covers

In [32]:
snow_cover_thr = 50
min_samples = 50
local_consistency_length = 10
window_sizes = np.arange(30, 1000, 10)

In [33]:
lcovers = np.unique(ice_data['land_cover'])
thr = pd.DataFrame(columns=['land_cover', 'window_size', 'cluster', 'std_w',])

for lc in lcovers:
    # get running means for land_cover
    running_means, running_std, info, dfs = running_mean_series_ball(window_sizes, dfs_per_pass, lc=[lc], 
                                                                     qty='height_diff', min_samples=min_samples, 
                                                                     add_cond='snow_cover < ' + str(snow_cover_thr))
    
    # get at least two clusters per land cover type
    if len(dfs) <= 1:
        continue
        
    # compute std_v
    heat_map = np.zeros((len(window_sizes), len(dfs)))
    for ij, rm in np.ndenumerate(running_means):
        i, j = ij
        thr = thr.append(pd.Series([lc, window_sizes[i], j, 
                                    np.mean(running_std[i][j])],
                                   index=thr.columns), ignore_index=True)
    
        

In [76]:
dset = hv.Dataset(thr[['land_cover', 'window_size', 'cluster', 'std_w']], 
                  kdims=['land_cover', 'window_size'])

The following plot shows the distributions of $\mathrm{std}_w$ over the clusters of single land cover classes at different window_sizes $w$. We can trace $r(w)$ as well as the spread of $\mathrm{std}_w$ as a function of land cover class. Also note that we conditioned on small snow probabilities (snow_cover_thr) and a sufficient amount of measurement points for a cluster to be recognised (min_samples).

In [77]:
dset.to(hv.BoxWhisker, ['window_size'], 'std_w').opts(**plot_opts)

In [78]:
# show number of clusters for each land_cover class
from collections import Counter
from operator import itemgetter

tuples = list(zip(thr['cluster'], thr['land_cover']))
res = Counter(map(itemgetter(1), tuples))
print({k:v // 19 for k, v in res.items()})

{2.0: 301, 3.0: 45, 6.0: 10, 11.0: 10, 12.0: 255, 15.0: 86, 16.0: 66, 18.0: 403, 20.0: 71, 21.0: 66, 23.0: 61, 24.0: 142, 25.0: 97, 26.0: 102, 27.0: 20, 31.0: 35, 32.0: 30, 41.0: 275}


## Using Cross Overs

With the ansatz from above 

$$ s(x) = d(\tilde{x}, \tilde{t}) - d(x, t) - \Delta\sigma - \Delta h$$

and suitable constraints on the distance of $x$ and $\tilde{x}$, we can measure $s(x)$ accross passes that parametrize trajectories $(x, t)$. A special case is when $x \approx \tilde{x}$ such that $\Delta h \to 0$ and assuming that in this case $\Delta \sigma \to 0$ (i.e. assuming $\sigma(x, t) = \sigma(x)$), in which case we can make use of ICESat measurements only as if they were perfectly aligned. We can find such instances by looking for cross overs in the passes in regions with low slope values.

First, let's have a look at what we got. How many points are there that are near enough? And how many if we condition on the slope such that we can assume that $h$ doesn't differ that much?

### find neighbours within distance

In [36]:
dframe = ice_data.copy()

In [37]:
radius = 60

In [38]:
dframe = hp.to_crs(dframe, epsg=3857, inplace=True)        
pts = dframe[['x', 'y']].values
tree = spatial.cKDTree(pts)

In [39]:
neigh_idxes = get_neighbours_in_radius(tree, pts.copy(), radius)

In [40]:
# map idx in idxes to pass and filter out instances in idxes with different passes
dframe, dfs_per_pass, passes = create_passes(dframe, rgt=True, ground_track_id=False, cycle_number=True)

In [41]:
def extract_neighbour_info(neigh_idxes, cond):
    info = neigh_idxes.copy()
    for ij, v in np.ndenumerate(neigh_idxes):
        if not neigh_idxes.mask[ij]:
            info[ij] = cond(v)
    return info

In [42]:
cond = lambda idx: dframe['passes'].iloc[idx]
passes_of_neighbours = extract_neighbour_info(neigh_idxes, cond)

In [43]:
diff_passes = [len(np.unique(passes_per_i[~passes_of_neighbours.mask[i, :]]))
               for i, passes_per_i in enumerate(passes_of_neighbours)]
diff_passes_idxes = np.where(np.array(diff_passes) > 1)

In [44]:
near_pts = dframe.iloc[diff_passes_idxes]

In [48]:
pd.DataFrame(diff_passes).hvplot.hist()

### Cluster Cross Overs

In [49]:
kms_per_radian = 6371.0088
epsilon = 0.1 / kms_per_radian
hp.to_crs(near_pts, inplace=True, epsg=4326)

clusters, core_idxs = hp.cluster_points(near_pts.copy(), epsilon=epsilon, min_samples=5, 
                                        algorithm='ball_tree', metric='haversine')

near_pts.loc[:, 'cluster'] = clusters.copy()
near_pts = near_pts.query('cluster != -1')


In [50]:
hover = HoverTool(tooltips=[('rgt', "@rgt"), ('land_cover', '@land_cover'), 
                            ('snow_cover', '@snow_cover'), ('cluster', '@cluster'), ('height_diff', '@height_diff')])
pts = decimate(gv.Points(near_pts[['x', 'y', 'rgt', 'land_cover', 'snow_cover', 'cluster', 'height_diff']], kdims=['x', 'y']).opts(size=3))
plot = gv.tile_sources.ESRI * pts.opts(tools=[hover], color='cluster', cmap='tab20')

plot.opts(height=500, width=700)

### Look at cluster properties

#### Intra-cluster height differences

In [115]:
# create two data frames : cluster_stats - mean values / differences between passes per cluster
#                          intra_cluster_stats - specific values for all clusters in cluster_stats

intra_cluster_stats = near_pts.copy()
intra_cluster_stats.loc[:, 'order'] = np.nan 

cluster_stats = pd.DataFrame(np.unique(near_pts['cluster']), columns=['cluster'])

for i, cluster in enumerate(cluster_stats.cluster):
    cluster_pts = intra_cluster_stats.query('cluster == ' + str(cluster))
    passes_of_neigh_in_cluster = np.unique(cluster_pts['passes'])
    
    # note : this works because all clusters consist of points of exactly two passes
    if len(passes_of_neigh_in_cluster) == 2:
        pass0 = cluster_pts.query('passes == ' + str(passes_of_neigh_in_cluster[0]))
        pass1 = cluster_pts.query('passes == ' + str(passes_of_neigh_in_cluster[1]))
        
        # order passes
        def sort_key(val):
            #return val['time'].iloc[0]
            return np.mean(val['snow_cover'])
        passes_in_cluster = [pass0, pass1]
        passes_in_cluster.sort(key=sort_key)
        pass0, pass1 = passes_in_cluster
        
        cluster_stats.loc[cluster, 'height_diff'] = np.mean(pass1['height_diff']) \
                                                            - np.mean(pass0['height_diff'])
        
        cluster_stats.loc[cluster, 'snow_cover_diff'] = np.mean(pass1['snow_cover']) \
                                                                - np.mean(pass0['snow_cover'])
        cluster_stats.loc[cluster, 'snow_cover_0'] = np.mean(pass0['snow_cover'])
        cluster_stats.loc[cluster, 'snow_cover_1'] = np.mean(pass1['snow_cover'])
           
        cluster_stats.loc[cluster, 'slope'] = np.mean(cluster_pts['slope'])
        
        cluster_stats.loc[cluster, 'land_covers'] = ' '.join(np.unique(cluster_pts.land_cover).astype(str))
        cluster_stats.loc[cluster, 'size'] = len(cluster_pts)
        
        cluster_stats.loc[cluster, 'height_pass0'] = np.mean(pass0['height'])
        cluster_stats.loc[cluster, 'height_pass1'] = np.mean(pass1['height'])
        
        cluster_stats.loc[cluster, 'cross_over_height_diff'] = cluster_stats.loc[cluster, 'height_pass0'] - \
                                                                cluster_stats.loc[cluster, 'height_pass1']
        
        # save some statstics back into intra_cluster_stats
        cluster_idx = np.concatenate((pass0.index.values, pass1.index.values))
        intra_cluster_stats.loc[cluster_idx, 'cross_over_height_diff'] = cluster_stats.loc[cluster,
                                                                                           'cross_over_height_diff']
        
        intra_cluster_stats.loc[pass0.index, 'order'] = 0
        intra_cluster_stats.loc[pass1.index, 'order'] = 1

In [116]:
cluster_stats = cluster_stats.iloc[np.where(~np.isnan(cluster_stats['height_diff']))[0]].copy()
cluster_stats['quantile_height_diff'] = pd.cut(cluster_stats['height_diff'], 2, labels=False)

height_diff_scatter_plt = cluster_stats.hvplot.scatter('snow_cover_0', 'snow_cover_1', 
                                                  color='cross_over_height_diff').opts(**plot_opts)

pts_qsnow = [hv.Points(cluster_stats.query('quantile_height_diff == ' + str(i))) 
             for i in np.unique(cluster_stats['quantile_height_diff'])]

xhist, yhist = (hv.Overlay([histogram(pts, dimension=dim, num_bins=40) for pts in pts_qsnow])
                for dim in ['snow_cover_0', 'snow_cover_1'])

composition = decimate(height_diff_scatter_plt) << yhist.opts(width=125) << xhist.opts(height=125)
composition.opts(opts.Histogram(alpha=0.3))

In [119]:
lc = [32]
slope_thr = 10

# condition dataset
# only cross overs with exactly one land_cover type
nr_lc = lambda st: len(st.split(' '))
cond = cluster_stats.iloc[np.where(cluster_stats['land_covers'].map(nr_lc) == 1)].copy()
cond.loc[:, 'land_covers'] = cond['land_covers'].values.astype('float')

# small slope
cond = cond.query('slope < ' + str(slope_thr))

# specific land_cover type 
# query_str = ' | '.join(['land_covers == ' + str(l) for l in lc])
# cond = cond.query(query_str)

cond.hvplot.scatter('snow_cover_diff', 'cross_over_height_diff', color='land_covers', cmap='tab20').opts(**plot_opts)

The following plot shows the distribution of height_diff inside each cluster with a specific land cover_type.

In [228]:
# condition on clusters with only one land_cover class
nr_lc = lambda st: len(st.split(' '))
dframe = intra_cluster_stats.iloc[np.isin(intra_cluster_stats['cluster'], 
                                          np.unique(cluster_stats.iloc[np.where(cluster_stats['land_covers']\
                                                                                .map(nr_lc) == 1)]['cluster']))]
dset = hv.Dataset(dframe[['cluster', 'order', 'height_diff', 'land_cover', 'snow_cover']], 
                  kdims=['cluster', 'order', 'land_cover'])
dset.to(hv.BoxWhisker, ['cluster', 'order'], 'height_diff').opts(ylim=(-2, 5), **plot_opts)

In [104]:
cluster_id = 9
cluster_stats.query('cluster == ' + str(cluster_id))

Unnamed: 0,cluster,height_diff,snow_cover_diff,snow_cover_0,snow_cover_1,slope,land_covers,size,height_pass0,height_pass1,cross_over_height_diff
9,9,-0.687908,152.0,0.0,152.0,11.218017,18.0,9.0,1536.691067,1539.4879,-2.796833


In [122]:
hover = HoverTool(tooltips=[('rgt', "@rgt"), ('land_cover', '@land_cover'), 
                            ('snow_cover', '@snow_cover'), ('cluster', '@cluster'), ('height_diff', '@height_diff'),
                            ('cross_over_height_diff', '@cross_over_height_diff')])

pts = gv.Points(intra_cluster_stats.query('cluster == ' + str(cluster_id))[['x', 'y', 'rgt', 'land_cover', 
                                                                            'snow_cover', 'cluster', 'height_diff',
                                                                            'cross_over_height_diff']],
                kdims=['x', 'y']).opts(size=3)

plot = gv.tile_sources.ESRI * pts.opts(tools=[hover])
plot.opts(**plot_opts)