In [1]:
import os
import pickle
import shutil
import shapely
import cartopy.crs as ccrs
import cartopy.geodesic as cgeo
import cartopy.vector_transform as cvt
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
from shapely import prepare, Point, Polygon
from shapely.validation import make_valid
from skimage import measure, morphology
import sqlite3
from oceannavigator.dataset_config import DatasetConfig
from data import open_dataset
from data.sqlite_database import SQLiteDatabase

In [None]:
dataset_keys= DatasetConfig.get_datasets()
for dataset in dataset_keys:
    try:
        config= DatasetConfig(dataset)
        url= config.url
        with SQLiteDatabase(url) as db:
            variable_list= db.get_data_variables()
            variable=variable_list[0].key
            timestamp=db.get_latest_timestamp(variable)
            files = db.get_netcdf_files([timestamp], [variable])
            src_path = files[-1]
            filename = os.path.basename(src_path)
            if not os.path.exists(filename):
                shutil.copy(src_path, filename)
            ds= xr.open_dataset(filename)
            var=ds[variable]
        # get surface level data
        if len(ds.dims)==3:
            surface_data = var[0, :, :].data
        elif len(ds.dims)==2:
            surface_data= var.data
        else:
            surface_data = var[0, 0, :, :].data

        # create binary mask from data
        binary_mask = np.where(np.isnan(surface_data), 0, 1)
        binary_mask = np.pad(binary_mask, 1)  # pad the mask so that the edges will be included in the perimeter

        # create a convex hull mask
        ch_mask = morphology.convex_hull_image(binary_mask)

        # get the contours from the mask
        contours = measure.find_contours(ch_mask, level=0)

        # select the first contour for our perimeter (the first element should be the one we're interested in but you'll have to confirm yourself)
        perim_y, perim_x = np.transpose(contours[0]).astype(int)

        # shift coordinates on array edges so that we're not selecting the padded portion
        height, width = ch_mask.shape

        perim_y[perim_y == 0] = 1
        perim_y[perim_y >= height - 1] = height - 2

        perim_x[perim_x == 0] = 1
        perim_x[perim_x >= width - 1] = width - 2

        #second version
        lat_var = ds.get("latitude", ds.get("lat"))
        lon_var=ds.get("longitude", ds.get("lon"))
        dim = lat_var.ndim
        # Select that actual lon lat values
        if dim==2:
            pad_lat = np.pad(lat_var.data, 1)
            pad_lon = np.pad(lon_var.data, 1)
            lon_mesh = lon_var.data
            lat_mesh = lat_var.data

        elif dim==1:
            lon_mesh, lat_mesh = np.meshgrid(ds.longitude.data, ds.latitude.data)
            pad_lat=np.pad(lat_mesh,1)
            pad_lon=np.pad(lon_mesh,1)


        pts = np.stack([lon_mesh, lat_mesh], axis=2)
        pts = np.apply_along_axis(lambda pt: Point(pt), 2, pts)
        perim_lat = pad_lat[perim_y, perim_x]
        perim_lon = pad_lon[perim_y, perim_x]
        perim_poly = Polygon(np.stack([perim_lon, perim_lat], axis=1))
        prepare(perim_poly)
        poly_mask = perim_poly.contains_properly(pts).astype(bool)

        #third version

        idx = np.argmax(np.abs(np.diff(perim_lon)))
        if shapely.is_simple(perim_poly)==False:
            new_lons = [360, 360, 0, 0]
            new_lats = [perim_lat[idx], 90, 90, perim_lat[idx + 1]]
            perim_lon = np.insert(perim_lon, idx + 1, new_lons)
            perim_lat = np.insert(perim_lat, idx + 1, new_lats)

        name = config.name.strip().replace(" ", "_") + ".pkl"
        with open(name, "wb") as f:
            pickle.dump(perim_poly, f)
    except Exception as e:
        print(f"Error: {e} for dataset: {dataset}")


In [None]:
with open(config.name, "wb") as f:
    pickle.dump(perim_poly, f)

In [None]:
dataset_keys= DatasetConfig.get_datasets()
config= DatasetConfig(dataset_keys[2])
with open(config.name, "rb") as f:
    poly = pickle.load(f)

point = Point([-65, 79])

print(poly.contains(point))
print(poly.boundary)

In [None]:
dataset_keys= DatasetConfig.get_datasets()

In [None]:
config=dataset_keys[2]
config

In [None]:
import time
from oceannavigator import DatasetConfig
from data.sqlite_database import SQLiteDatabase
from dateutil.parser import parse as dateparse
from data.utils import get_data_vars_from_equation, time_index_to_datetime
import numpy as  np


for key in DatasetConfig.get_datasets():
    try:
        t0 = time.time()
        config = DatasetConfig(key)
        target_date='2025-07-23T02:30:00.000Z'
        parsed_date = dateparse(target_date)
        with SQLiteDatabase(config.url) as db:
            sample_var = config.variables[0]
            vals = np.asarray(db.get_timestamps(sample_var))
            if vals.size==0:
                continue

            time_dim_units = config.time_dim_units
            converted_times = time_index_to_datetime(vals[[0, -1]], time_dim_units)

            if (
                converted_times[0] <= parsed_date
                and converted_times[1] >= parsed_date
            ):
                print(f"Data Matched for {key}")
        t1 = time.time()
        print(f"time taken for {key} is {t1 - t0}")
    except Exception as e:
         continue




In [None]:
import time
from oceannavigator import DatasetConfig
from data.sqlite_database import SQLiteDatabase
from dateutil.parser import parse as dateparse
from data.utils import get_data_vars_from_equation, time_index_to_datetime
import numpy as  np


for key in DatasetConfig.get_datasets():
    try:
  
        config = DatasetConfig(key)
        target_date='2025-07-23T02:30:00.000Z'
        parsed_date = dateparse(target_date)
        with SQLiteDatabase(config.url) as db:
            sample_var = config.variables[0]
            t0 = time.time()
            vals = np.asarray(db.get_timestamps(sample_var))
            t1 = time.time()
            if vals.size==0:
                continue

            time_dim_units = config.time_dim_units
            converted_times = time_index_to_datetime(vals[[0, -1]], time_dim_units)
            print("converted:",converted_times)
            print("parsed",parsed_date)
      
            

            if (
                converted_times[0] <= parsed_date
                and converted_times[1] >= parsed_date
            ):
                print(f"Data Matched for {key}")
     
    except Exception as e:
         continue




In [None]:
key=DatasetConfig.get_datasets()
config = DatasetConfig(key[2])
print(config.variable["votemper"].scale)

In [None]:
config.variables[0]

In [None]:
with SQLiteDatabase(config.url) as db:
    t0 = time.time()
    db.get_earliest_timestamp(config.variables[0])
    t1 = time.time()
print(t1-t0)

In [None]:
import xarray as xr
for key in DatasetConfig.get_datasets():
    config = DatasetConfig(key)
    try:
        if not isinstance(config.url,list) and config.url.endswith(".sqlite3"):
            pass
        else:
            if not isinstance(config.url, list):
                data=xr.open_mfdataset([config.url])
            else:
                data=xr.open_mfdataset(config.url)
            time_vals=data.time.values
            first_time = pd.to_datetime(time_vals[0]).tz_localize('UTC')
            last_time = pd.to_datetime(time_vals[-1]).tz_localize('UTC')
            if first_time <= parsed_date and last_time >= parsed_date:
                matching_dataset_ids.append(dataset_id)

            print(data)
            
    except Exception as e:
        print(f"{e} dataset {key} config url {config.url}")
    

In [None]:
import pandas as pd
from datetime import datetime, timezone
ds=xr.open_mfdataset(["https://salishsea.eos.ubc.ca/erddap/griddap/ubcSSg3DBiologyFields1hV21-11"])
time_vals=ds.time.values
first_timestamp = time_vals[0].astype('datetime64[s]').astype(int)
last_timestamp = time_vals[-1].astype('datetime64[s]').astype(int)

first_time = datetime.fromtimestamp(first_timestamp, tz=timezone.utc)
last_time = datetime.fromtimestamp(last_timestamp, tz=timezone.utc)
target_date='2025-07-23T02:30:00.000Z'
parsed_date = dateparse(target_date)
if first_time <= parsed_date and last_time >= parsed_date:
        print(first_time)
        print(parsed_date)

In [None]:
variable_list = []
dataset_keys = DatasetConfig.get_datasets()

for dataset_key in dataset_keys:
    config = DatasetConfig(dataset_key)
    for variable in config.variables:
        variable_list.append(config.variable[variable].name)

variable_list = list(dict.fromkeys(variable_list))
print(variable_list)

NameError: name 'DatasetConfig' is not defined

In [None]:
dataset_keys = DatasetConfig.get_datasets()
config = DatasetConfig(dataset_keys[2])
for var_key in config.vector_variables.keys():
    var_data = config.vector_variables[var_key]
    var_name = var_data.get("name", var_key)
var_name


In [None]:
vector_variable_list=[]
for var_key in config.vector_variables.keys():
    var_data = config.vector_variables[var_key]
    var_name = var_data.get("name", var_key)
    vector_variable_list.append(var_name)
vector_variable_list=list(dict.fromkeys(vector_variable_list))
print(vector_variable_list)

In [3]:
dataset_id_list=DatasetConfig.get_datasets()
target_date='2025-07-23T02:30:00.000Z'
parsed_date = dateparse(target_date)

for dataset_id in dataset_id_list:
    try:
        config = DatasetConfig(dataset_id)
        if not isinstance(config.url,list) and config.url.endswith(".sqlite3"):
            with SQLiteDatabase(config.url) as db:
                sample_var = config.variables[0]
                vals = np.array(db.get_timestamps(sample_var))

                time_dim_units = config.time_dim_units
                converted_times = time_index_to_datetime(
                    vals[[0, -1]], time_dim_units
                )

                if (
                    converted_times[0] <= parsed_date
                    and converted_times[1] >= parsed_date
                ):
                    print(f"matched{dataset_id}")
        else:
            if not isinstance(config.url, list):
                data=xr.open_mfdataset([config.url])
            else:
                data=xr.open_mfdataset(config.url)
            time_vals=data.time.values
            first_timestamp = time_vals[0].astype('datetime64[s]').astype(int)
            last_timestamp = time_vals[-1].astype('datetime64[s]').astype(int)
            first_time = datetime.fromtimestamp(first_timestamp, tz=timezone.utc)
            last_time = datetime.fromtimestamp(last_timestamp, tz=timezone.utc)
            if first_time <= parsed_date and last_time >= parsed_date:
                print(f"matched{dataset_id}")
    except Exception as e:
        print(f'{e} for {dataset_id}')

index 0 is out of bounds for axis 0 with size 0 for giops_fc_10d_2dll
index 0 is out of bounds for axis 0 with size 0 for riops_fc_3dps
matchedciops-east_fc_3dll
matchedciops-east_fc_2dll
matchedciops-west_fc_3dll
matchedciops-west_fc_2dll
matchedciops-salish-sea_fc_3dll
matchedciops-salish-sea_fc_2dll
unable to open database file for cmems_climatology
unable to open database file for cmems_monthly_climatology
unable to open database file for cmems_seasonal_climatology
matchedsalishseacast_3d_biology
matchedsalishseacast_3d_currents


In [2]:
from dateutil.parser import parse as dateparse
from datetime import datetime, timezone
from data.utils import get_data_vars_from_equation, time_index_to_datetime


In [9]:
vals


array([], dtype=float64)

In [53]:
from dateutil.parser import parse as dateparse
import numpy as np

dataset_id_list = DatasetConfig.get_datasets()
target_date = '2025-07-23T02:30:00.000Z'
parsed_date = dateparse(target_date)



# pick a variable robustly
sample_var = getattr(config, "sample_variable", None) or (config.variables[0] if config.variables else None)
print("DB path:", config.url)
print("Sample variable:", sample_var)

if not sample_var:
    raise RuntimeError("No variables found in this dataset config")

with SQLiteDatabase(config.url) as db:
    # Sanity: list tables
    tables = db.c.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
    print("Tables:", tables)

    # Do we even have any timestamps at all?
    ts_count = db.c.execute("SELECT COUNT(*) FROM Timestamps").fetchone()[0]
    print("Total rows in Timestamps:", ts_count)

    # How many rows for THIS variable?
    cnt_for_var = db.c.execute("""
        SELECT COUNT(*)
        FROM TimestampVariableFilepath tvf
        JOIN Variables v ON tvf.variable_id = v.id
        JOIN Timestamps t ON tvf.timestamp_id = t.id
        WHERE v.variable = ?
    """, (sample_var,)).fetchone()[0]
    print(f"Rows for variable '{sample_var}':", cnt_for_var)

    # Peek a few timestamps for this var (ordered)
    first_last = db.c.execute("""
        SELECT t.timestamp
        FROM TimestampVariableFilepath tvf
        JOIN Variables v ON tvf.variable_id = v.id
        JOIN Timestamps t ON tvf.timestamp_id = t.id
        WHERE v.variable = ?
        ORDER BY t.timestamp ASC
        LIMIT 5
    """, (sample_var,)).fetchall()
    print("First few timestamps:", first_last)

    # What your code was doing:
    vals = np.array(db.get_timestamps(sample_var))
    print("get_timestamps size:", vals.size)
    if vals.size == 0:
        print("No timestamps returned by get_timestamps(sample_var) -> this caused your index error.")
    else:
        print("First/last from get_timestamps:", vals[0], vals[-1])


DB path: /data/db/canso500.sqlite3
Sample variable: sos
Tables: [('Dimensions',), ('Variables',), ('VarsDims',), ('Filepaths',), ('Timestamps',), ('TimestampVariableFilepath',)]
Total rows in Timestamps: 48
Rows for variable 'sos': 48
First few timestamps: [(2380928400,), (2380932000,), (2380935600,), (2380939200,), (2380942800,)]
get_timestamps size: 48
First/last from get_timestamps: 2380928400 2381097600


In [52]:
config = DatasetConfig(dataset_id_list[14])

In [72]:
dataset_id_list=DatasetConfig.get_datasets()
target_date='2025-07-23T02:30:00.000Z'
parsed_date = dateparse(target_date)

for dataset_id in dataset_id_list:
    try:
        config = DatasetConfig(dataset_id)
        if not isinstance(config.url,list) and config.url.endswith(".sqlite3"):
            with SQLiteDatabase(config.url) as db:
                for var in config.variables:
                    vals = np.array(db.get_timestamps(var))
                    if vals.size > 0:
                        break
                time_dim_units = config.time_dim_units
                converted_times = time_index_to_datetime(
                    vals[[0, -1]], time_dim_units
                )

                if (
                    converted_times[0] <= parsed_date
                    and converted_times[1] >= parsed_date
                ):
                    print(f"matched{dataset_id}")
                    print(converted_times, var)
        else:
            if not isinstance(config.url, list):
                data=xr.open_mfdataset([config.url])
            else:
                data=xr.open_mfdataset(config.url)
            time_vals=data.time.values
            first_timestamp = time_vals[0].astype('datetime64[s]').astype(int)
            last_timestamp = time_vals[-1].astype('datetime64[s]').astype(int)
            first_time = datetime.fromtimestamp(first_timestamp, tz=timezone.utc)
            last_time = datetime.fromtimestamp(last_timestamp, tz=timezone.utc)
            if first_time <= parsed_date and last_time >= parsed_date:
                print(f"matched{dataset_id}")
    except Exception as e:
        print(f'{e} for {dataset_id}')

matchedgiops_fc_10d_2dll
[real_datetime(2024, 7, 22, 3, 0, tzinfo=<UTC>), real_datetime(2025, 8, 2, 0, 0, tzinfo=<UTC>)] vozocrtx
matchedciops-east_fc_3dll
[real_datetime(2025, 7, 23, 0, 0, tzinfo=<UTC>), real_datetime(2025, 7, 23, 3, 0, tzinfo=<UTC>)] vozocrtx
matchedciops-east_fc_2dll
[real_datetime(2025, 7, 23, 0, 0, tzinfo=<UTC>), real_datetime(2025, 7, 23, 3, 0, tzinfo=<UTC>)] vozocrtx
matchedciops-west_fc_3dll
[real_datetime(2025, 7, 23, 1, 0, tzinfo=<UTC>), real_datetime(2025, 7, 23, 3, 0, tzinfo=<UTC>)] vozocrtx
matchedciops-west_fc_2dll
[real_datetime(2025, 7, 23, 1, 0, tzinfo=<UTC>), real_datetime(2025, 7, 23, 3, 0, tzinfo=<UTC>)] vozocrtx
matchedciops-salish-sea_fc_3dll
[real_datetime(2025, 7, 23, 1, 0, tzinfo=<UTC>), real_datetime(2025, 7, 24, 20, 0, tzinfo=<UTC>)] vozocrtx
matchedciops-salish-sea_fc_2dll
[real_datetime(2025, 7, 23, 1, 0, tzinfo=<UTC>), real_datetime(2025, 7, 24, 20, 0, tzinfo=<UTC>)] vozocrtx
matchedsalishseacast_3d_biology
matchedsalishseacast_3d_currents

In [68]:
dataset_id_list=DatasetConfig.get_datasets()
target_date='2025-07-23T02:30:00.000Z'
parsed_date = dateparse(target_date)
config = DatasetConfig(dataset_id_list[3])
sample_var=config.variables[3]
with SQLiteDatabase(config.url) as db:
    vals = np.array(db.get_timestamps(sample_var))
time_dim_units = config.time_dim_units
converted_times = time_index_to_datetime(
    vals[[0, -1]], time_dim_units
                )
print(converted_times)
print(dataset_id_list[3])


[real_datetime(2024, 6, 1, 0, 0, tzinfo=<UTC>), real_datetime(2025, 6, 12, 17, 0, tzinfo=<UTC>)]
riops_fc_3dps


In [69]:
config.variables

['vozocrte',
 'vomecrtn',
 'magwatervel',
 'votemper',
 'vosaline',
 'sspeed',
 'density',
 'deepsoundchannel',
 'soniclayerdepth',
 'deepsoundchannelbottom',
 'psubsurfacechannel']