In [1]:
import duckdb

# Open the on-disk DuckDB database shared by every cell below
# (the file is created on first connect if it is not there yet).
ARGO_DB_PATH = "../../LOCAL/Resources/argo.db"
con = duckdb.connect(ARGO_DB_PATH)

In [2]:
# Stage the 2023 Argo CSV export as-is in `argo2023_raw`.
#   * IF NOT EXISTS: the (slow) CSV read is skipped when the table is already present.
#   * Every listed column is read as VARCHAR; casting to real types is done
#     when `argo2023_slim` is built.
#   * A field consisting of a single space is stored as NULL.
#   * NOTE(review): `skip = 1` together with `header = true` discards one line
#     and then consumes the next as the header — presumably the file carries a
#     second header-like line (e.g. units); confirm against the CSV.
ingest_raw_sql = """
    CREATE TABLE IF NOT EXISTS argo2023_raw AS
    SELECT *
    FROM read_csv(
        'data/ArgoFloats2023.csv',
        header = true,
        skip = 1,
        nullstr = ' ',
        columns = {
            'platform_number': 'VARCHAR',
            'time': 'VARCHAR',
            'latitude': 'VARCHAR',
            'longitude': 'VARCHAR',
            'pres': 'VARCHAR',
            'pres_qc': 'VARCHAR',
            'temp': 'VARCHAR',
            'temp_qc': 'VARCHAR',
            'psal': 'VARCHAR',
            'psal_qc': 'VARCHAR'
        }
    )
"""
con.execute(ingest_raw_sql)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [3]:
# Derive the typed working table `argo2023_slim` from the raw VARCHAR staging table.
#   * Measurement columns are cast and renamed (platform_id, date, lat, lon, ...).
#   * QC flags: the pattern '^[0-9]$' only matches a value that is exactly one
#     digit 0-9; a non-matching value yields '' which NULLIF turns into NULL
#     before the TINYINT cast.
#   * NOTE(review): `pres` is exposed as `depth_m` with no conversion applied —
#     confirm that treating pressure as depth is acceptable downstream.
build_slim_sql = """
CREATE OR REPLACE TABLE argo2023_slim AS
SELECT
    CAST(platform_number AS INT) AS platform_id,
    CAST(time AS TIMESTAMPTZ) AS date,
    CAST(latitude AS DOUBLE) AS lat,
    CAST(longitude AS DOUBLE) AS lon,
    CAST(pres AS DOUBLE) AS depth_m,
    CAST(temp AS DOUBLE) AS temp_c,
    CAST(psal AS DOUBLE) AS sal_psu,

    -- Only keep digits 09, else NULL
    CAST(NULLIF(regexp_extract(pres_qc, '^[0-9]$', 0), '') AS TINYINT) AS pres_qc,
    CAST(NULLIF(regexp_extract(temp_qc, '^[0-9]$', 0), '') AS TINYINT) AS temp_qc,
    CAST(NULLIF(regexp_extract(psal_qc, '^[0-9]$', 0), '') AS TINYINT) AS psal_qc

FROM argo2023_raw;

"""
con.execute(build_slim_sql)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [4]:
# Purge rows whose latitude or longitude lies outside the valid geographic range.
# A NULL lat or lon makes its own comparison NULL, so such a row is only
# removed when the other coordinate is out of range.
drop_invalid_coords_sql = """delete from argo2023_slim where 
lat NOT BETWEEN -90 AND 90
OR lon NOT BETWEEN -180 AND 180;"""
con.execute(drop_invalid_coords_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [5]:
# Install (if needed) and load DuckDB's `spatial` extension on this connection;
# the ST_* functions and ST_Read used in the cells below come from it.
con.execute("INSTALL spatial; LOAD spatial;")

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [6]:
# Build one row per unique float position, with a point geometry for each.
#   * CREATE OR REPLACE keeps the cell re-runnable against the persistent
#     database (a plain CREATE TABLE fails if the table is still around).
#   * (lat, lon) pairs are de-duplicated *before* ST_Point is called, so a
#     geometry is built once per position instead of once per measurement row.
#     The result is the same because geom is a pure function of (lon, lat).
con.execute("""
CREATE OR REPLACE TABLE argo2023_positions AS
SELECT
    lat,
    lon,
    ST_Point(lon, lat) AS geom  -- x = longitude, y = latitude
FROM (
    SELECT DISTINCT lat, lon
    FROM argo2023_slim
) AS unique_positions;""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [7]:
# Import the IHO world-seas shapefile into the database.
# This table is never dropped by the notebook, so a plain CREATE TABLE makes a
# second "Run All" against the same .db file fail here; CREATE OR REPLACE
# rebuilds it from the shapefile instead.
# LOAD spatial is repeated so the cell also works on a fresh connection where
# the extension is installed but not yet loaded.
con.execute("""
LOAD spatial;
CREATE OR REPLACE TABLE world_seas_iho_v3 AS
SELECT *
FROM ST_Read('World_Seas_IHO_v3/World_Seas_IHO_v3.shp');""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [8]:
# Add a simplified copy of each sea polygon (tolerance 0.01, in the units of
# the source geometry) to make the spatial predicates below cheaper.
#   * ADD COLUMN IF NOT EXISTS lets the cell be re-run after the column has
#     already been added; the UPDATE then simply recomputes it.
#   * NOTE(review): ST_Simplify does not promise valid output polygons;
#     consider ST_SimplifyPreserveTopology if intersection results look off.
con.execute("""
ALTER TABLE world_seas_iho_v3
ADD COLUMN IF NOT EXISTS geom_simple GEOMETRY;

UPDATE world_seas_iho_v3
SET geom_simple = ST_Simplify(geom, 0.01);""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [9]:
# Create R-tree indexes on both geometry columns (no-ops when they already
# exist) and refresh table statistics with ANALYZE.
# NOTE(review): whether the planner uses these indexes for the ST_Intersects
# join below has not been verified here.
index_and_analyze_sql = """
CREATE INDEX IF NOT EXISTS idx_world_seas_geom ON world_seas_iho_v3 USING rtree(geom_simple);
CREATE INDEX IF NOT EXISTS idx_positions_geom ON argo2023_positions USING rtree(geom);
ANALYZE world_seas_iho_v3;
ANALYZE argo2023_positions;
"""
con.execute(index_and_analyze_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [10]:
# Step 1 of region assignment: tag each position with the sea polygon it
# intersects (region_name stays NULL when nothing intersects).
#
# A point can intersect more than one polygon (shared borders, or overlaps
# introduced by simplification). Without de-duplication that produces several
# rows for the same (lat, lon), and the later join back onto the measurements
# would multiply measurement rows. QUALIFY keeps exactly one row per position,
# choosing the alphabetically first region so the result is deterministic.
# CREATE OR REPLACE keeps the cell re-runnable.
con.execute("""
CREATE OR REPLACE TABLE positions_with_region AS
SELECT
    p.lat,
    p.lon,
    o.name AS region_name,
    p.geom
FROM argo2023_positions p
LEFT JOIN world_seas_iho_v3 o
    ON ST_Intersects(p.geom, o.geom_simple)
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY p.lat, p.lon
    ORDER BY o.name
) = 1;
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [11]:
# Step 2 of region assignment: for positions that intersected no polygon
# (region_name IS NULL), pick the closest sea polygon instead.
#   * Every unmatched position is paired with every sea (CROSS JOIN) and the
#     pair with the smallest ST_Distance wins.
#   * The sea name is a secondary sort key: when two seas are equally close
#     (e.g. the nearest point lies on their shared boundary) the choice is
#     otherwise arbitrary and can change between runs.
#   * CREATE OR REPLACE keeps the cell re-runnable.
con.execute("""
CREATE OR REPLACE TABLE nearest_region AS
SELECT
    p.lat,
    p.lon,
    s.name AS region_name
FROM positions_with_region p
CROSS JOIN world_seas_iho_v3 s
WHERE p.region_name IS NULL
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY p.lat, p.lon
    ORDER BY ST_Distance(p.geom, s.geom_simple), s.name
) = 1;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [12]:
# Step 3 of region assignment: stack the directly matched positions and the
# nearest-region fallbacks into a single (lat, lon) -> region_name lookup.
# The lookup table is kept in the database after the notebook finishes, so a
# plain CREATE TABLE fails on the next run; CREATE OR REPLACE rebuilds it.
con.execute("""
CREATE OR REPLACE TABLE argo2023_positions_region AS
SELECT lat, lon, region_name
FROM positions_with_region
WHERE region_name IS NOT NULL

UNION ALL

SELECT lat, lon, region_name
FROM nearest_region
WHERE region_name IS NOT NULL;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [13]:
# Remove the staging/intermediate tables that are no longer needed.
# IF EXISTS makes the clean-up safe to repeat: without it, the first missing
# table aborts the batch and leaves the remaining tables behind.
con.execute("""
DROP TABLE IF EXISTS argo2023_positions;
DROP TABLE IF EXISTS argo2023_raw;
DROP TABLE IF EXISTS nearest_region;
DROP TABLE IF EXISTS positions_with_region;""").fetch_df()

Unnamed: 0,Success


In [14]:
# Attach the region name to every measurement row by matching on the exact
# (lat, lon) pair; measurements without a lookup entry keep region_name NULL.
# NOTE(review): this assumes argo2023_positions_region holds at most one row
# per (lat, lon) — duplicates there would duplicate measurement rows here.
attach_region_sql = """
CREATE TABLE argo2023_with_region AS
SELECT a.*, r.region_name
FROM argo2023_slim a
LEFT JOIN argo2023_positions_region r
ON a.lat = r.lat AND a.lon = r.lon;
"""
con.execute(attach_region_sql)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x8d399b0>

In [15]:
# The slim table has been superseded by the region-annotated copy; drop it.
# IF EXISTS keeps the cell from raising when it is re-run after the drop.
con.execute("DROP TABLE IF EXISTS argo2023_slim").fetch_df()

Unnamed: 0,Success


In [2]:
# Give the region-annotated table its final name, `argo2023`.
# NOTE(review): the recorded execution counter restarts here and the connection
# address differs from the cells above, so this ran in a later kernel session —
# `con` has to be re-created (first cell) before this cell will work.
# The rename is one-shot: it raises if argo2023 already exists.
con.execute("ALTER TABLE argo2023_with_region RENAME TO argo2023;")

<duckdb.duckdb.DuckDBPyConnection at 0x8cadcb0>

In [4]:
# One row per distinct (float, timestamp, position) combination, i.e. the
# measurement table collapsed over its per-measurement columns.
# CREATE OR REPLACE keeps the cell re-runnable; fetch_df() returns the row
# count reported by the CREATE statement.
con.execute("""
CREATE OR REPLACE TABLE distinct_float_positions AS
SELECT DISTINCT platform_id, date, lat, lon
FROM argo2023;""").fetch_df()

Unnamed: 0,Count
0,170839


In [5]:
# Most recent known position of every float.
#
# The row with the latest timestamp is selected as a whole, so lat, lon and
# date are guaranteed to belong to the same observation. Two independent
# ARGMAX(lat, date) / ARGMAX(lon, date) aggregates do not give that guarantee:
# with tied timestamps, or a NULL in only one coordinate at the latest
# timestamp, they can return a lat and a lon taken from different rows.
#   * NULLS LAST: rows without a timestamp are only chosen when a float has no
#     dated rows at all.
#   * lat, lon act as tie-breakers so the result is deterministic.
#   * CREATE OR REPLACE keeps the cell re-runnable.
con.execute("""
CREATE OR REPLACE TABLE latest_float_positions AS
SELECT
    platform_id,
    lat,
    lon,
    date
FROM
    distinct_float_positions
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY platform_id
    ORDER BY date DESC NULLS LAST, lat, lon
) = 1;""").fetch_df()

Unnamed: 0,Count
0,4663


In [9]:
# Close the DuckDB connection opened at the top of the notebook.
con.close()