# Currentness of buildings
(pyiceberg -> **duckdb** -> lonboard map)
In this notebook we demonstrate how to analyze and visualize the up-to-date-ness or currentness of the latest OSM data.

These are the steps you see further down:

* Set the query params
* Set the connection params to Iceberg Rest Catalog and Minio S3 Storage
* Prepare the data in 3 steps
    * Do an iceberg table scan with a pre-filter
    * Fine filter the data in a Dataframe after download
    * Transform the columns into the format that we need for mapping the features with `lonboard`
* Create a **Map**, an **interactive Slider** to filter the map data and display the data as a **currentness chart**   

In [24]:
import os
import datetime

import duckdb
import polars as pl
import pandas as pd
import geopandas as gpd

from pyiceberg.catalog.rest import RestCatalog

import ipywidgets as widgets
from lonboard.layer_extension import DataFilterExtension
from lonboard import Map, ScatterplotLayer, PolygonLayer, SolidPolygonLayer, basemap
from lonboard.colormap import apply_continuous_cmap
from palettable.matplotlib import Viridis_20
from ipywidgets import FloatRangeSlider, jsdlink, Layout
from IPython.display import display, HTML

## Prepare the Iceberg connection

In [25]:
s3_user = os.environ["S3_ACCESS_KEY_ID"]  # add your user here
s3_password = os.environ["S3_SECRET_ACCESS_KEY"]  # add your password here

In [26]:
catalog = RestCatalog(
    name="default",
    **{
        "uri": "https://sotm2024.iceberg.ohsome.org",
        "s3.endpoint": "https://sotm2024.minio.heigit.org",
        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        "s3.access-key-id": s3_user,
        "s3.secret-access-key": s3_password,
        "s3.region": "eu-central-1"
    }
)

## Prepare DuckDB

In [28]:
con = duckdb.connect(
    config={
        'threads': 8,
        'max_memory': '16GB'
    }
)
con.install_extension("spatial")
con.load_extension("spatial")

## Prepare the input params for your map

In [13]:
bboxes = {
    'heidelberg': (8.629761, 49.379556, 8.742371, 49.437890),  #  ~20.000 buildings
    'nairobi': (36.650938, -1.444471, 37.103887, -1.163522),
    'mannheim': (8.41416, 49.410362, 8.58999, 49.590489),  #  ~100.000 buildings
    'berlin': (13.088345, 52.338271, 13.761161, 52.675509)  # ~700.000 buildings
}

# select your input params

# bbox
(xmin, ymin, xmax, ymax) = bboxes['heidelberg']

# date only, do not appent timepart
(min_valid_from, max_valid_from) = ('2007-01-01', '2025-01-01')

# iceberg table
namespace = 'geo_sort'
tablename = 'contributions'
icebergtable = catalog.load_table((namespace, tablename))

## Get the data

### 1st Filter step: Define a pre-filter for the iceberg table scan

Based in this pre-filter pyIceberg will use the Iceberg Tables Metadata to minimize the number of parquet files which have to be touched.  

In [23]:
ice2 = icebergtable.scan(
    row_filter=f"""
        status = 'latest'
        and geometry_type = 'Polygon'
        and valid_from < '{max_valid_from}T00:00:00'
        and valid_from >= '{min_valid_from}T00:00:00' 
        and (bbox.xmax >= {xmin} and bbox.xmin <= {xmax})
        and (bbox.ymax >= {ymin} and bbox.ymin <= {ymax})   
    """,
    selected_fields=(
        "valid_from",
        "tags",
        "geometry"
    ),
    #limit=100
)


ice2 = icebergtable.scan(
    row_filter=f"""
        not map_features.building
    """,
    selected_fields=(
        "valid_from",
        "tags",
        "geometry"
    ),
    #limit=100
)

ParseException: Expected expr, found end of text  (at char 39), (line:3, col:5)

### 2nd filter step: Further filter the data 

Some properties are not reflected in the Iceberg Metadata and must be fine filtered after fetching the parquet data.

In [19]:
con = ice2.to_duckdb('buildings',connection=con)

query = """
SELECT
    valid_from,
    tags,
    geometry
FROM buildings
WHERE 1=1
    and tags['building'][1] is not null
    and tags['building'][1] != 'no'
--LIMIT 10
;
"""

duckdb_table = con.sql(query)
num_features = duckdb_table.count('*').fetchnumpy().get('count_star()').item()
display(HTML(
    f"""<h2>Attention</h2><p style="font-size:large">You are going to download <strong>{num_features}</strong> features.</p>"""))

ValueError: Could not find field with name xmax, case_sensitive=True

### Data preparation: Do some column projection and transformation

In [8]:
query = """
SELECT
    epoch(valid_from) as valid_from_epoch_s,
    --STRFTIME("valid_from", '%Y-%m-01')::date as valid_from,
    valid_from::TIMESTAMP_S as valid_from,
    list_transform(map_entries(tags), tag -> tag.key || '=' || tag.value) as tags,
    --tags,
    st_AsText(ST_GeomFromWKB(geometry)) as geometry
FROM duckdb_table
;
"""
buildings_df = con.sql(query).df()

# convert the data to geodata
gdf = gpd.GeoDataFrame(
    buildings_df,
    geometry=gpd.GeoSeries.from_wkt(buildings_df['geometry'])
).set_crs('epsg:4326')

## Plot Chart

In [10]:
chart_query = """
SELECT
    STRFTIME("valid_from", '%Y-%m-01')::date as month,
    count(*) as n_features
    FROM duckdb_table
    GROUP BY month
"""

chart_data = con.sql(chart_query).pl()
chart = chart_data.plot.step(x="month", y="n_features", xaxis='top', width=700, responsive=False)
display(chart)

## Prepare the Map

In [14]:
# compute lonboard color style for contious color map
min_valid_from_epoch_s = gdf["valid_from_epoch_s"].min()
max_valid_from_epoch_s = gdf["valid_from_epoch_s"].max()

# normalized color values from 0 to 1
valid_from_style = gdf["valid_from_epoch_s"].apply(
    lambda x: (x - min_valid_from_epoch_s) / (max_valid_from_epoch_s - min_valid_from_epoch_s))

# lonboard gpu filtering
filter_values = gdf["valid_from_epoch_s"]
initial_filter_range = [
    datetime.datetime.fromisoformat(min_valid_from).timestamp(),
    datetime.datetime.fromisoformat(max_valid_from).timestamp()
]

# the lonboard map definition
extension = DataFilterExtension(filter_size=1)
layer = SolidPolygonLayer.from_geopandas(
    gdf,
    extensions=[extension],
    get_filter_value=filter_values,  # replace with desired column
    filter_range=initial_filter_range,  # replace with desired filter range
    get_fill_color=apply_continuous_cmap(valid_from_style, Viridis_20, alpha=1)

)
currentness_map = Map(
    basemap_style=basemap.CartoBasemap.DarkMatter,
    layers=[layer]
)
# the interactive time slider
slider = FloatRangeSlider(
    value=(
        min_valid_from_epoch_s,
        max_valid_from_epoch_s
    ),
    min=min_valid_from_epoch_s,
    max=max_valid_from_epoch_s,
    step=3600 * 24,  #days
    #description="Valid from: ",
    layout=Layout(width='615px', left='calc(50% - 290px)'),
    readout=False,
)

# link the slider to the map
link = jsdlink(
    (slider, "value"),
    (layer, "filter_range")
)

## Display the Map
The Map shows the current OSM data color coded by the date the objects were edited for the last time.

In [15]:
display(currentness_map, slider)

Map(basemap_style=<CartoBasemap.DarkMatter: 'https://basemaps.cartocdn.com/gl/dark-matter-gl-style/style.json'…

FloatRangeSlider(value=(1217964804.0, 1722247253.0), layout=Layout(left='calc(50% - 290px)', width='615px'), m…