## Data visualization

In [1]:
import os
import time

import fsspec
import geopandas as gpd
import holoviews as hv
import hvplot.pandas
import pandas as pd
import panel as pn
import xarray as xr
from dotenv import load_dotenv
from pyproj import CRS

load_dotenv(override=True)

# NOTE: access tokens to the data are available upon request from Floris Calkoen
sas_token = os.getenv("AZURE_STORAGE_SAS_TOKEN")
account_name = "coclico"
storage_options = {"account_name": account_name, "credential": sas_token}

In [2]:
TEST_PREDICTIONS_PREFIX = "az://typology/test/*.parquet"

fs = fsspec.filesystem("az", **storage_options)
files = fs.glob(TEST_PREDICTIONS_PREFIX)
test_layer_options = {f.split("/")[-1].replace(".parquet", ""): f for f in files}

In [3]:
file_browser = pn.widgets.Select(options=list(test_layer_options.keys()))
file_browser

In [4]:
def fetch_data(fs, urlpath, storage_options):
    with fs.open(urlpath, mode="rb", **storage_options) as f:
        df = gpd.read_parquet(f)
    return df


df = fetch_data(fs, test_layer_options[file_browser.value], storage_options)

In [5]:
import json

fs = fsspec.filesystem("az", **storage_options)
files1 = fs.glob("az://typology/labels/*.json")
files2 = fs.glob("az://typology/labels2/*.json")
files3 = fs.glob("az://typology/labels3/*.json")


def get_signed_url(container, record_name: str) -> str:
    """Constructs the signed HTTPS URL with the SAS token."""
    return f"{container}/{record_name}?{storage_options['credential']}"


def read_record(container, record_name: str) -> dict:
    """Reads a record from the Azure storage backend using HTTPS."""
    # Open the file using https to avoid issues in Panel apps
    signed_url = get_signed_url(container, record_name)
    with fsspec.open(signed_url, mode="r") as f:
        record = json.load(f)
    return record


records1 = []
for urlpath in files1:
    record_name = urlpath.split("/")[-1]
    r = read_record(
        "https://coclico.blob.core.windows.net/typology/labels", record_name
    )
    records1.append(r)

records2 = []
for urlpath in files2:
    record_name = urlpath.split("/")[-1]
    r = read_record(
        "https://coclico.blob.core.windows.net/typology/labels2", record_name
    )
    records2.append(r)

records3 = []
for urlpath in files3:
    record_name = urlpath.split("/")[-1]
    r = read_record(
        "https://coclico.blob.core.windows.net/typology/labels3", record_name
    )
    records3.append(r)

In [6]:
import geopandas as gpd
import shapely.wkt

gdf1 = gpd.GeoDataFrame.from_records(records1)
gdf1["geometry"] = gdf1["geometry"].apply(shapely.wkt.loads)
gdf1 = gdf1.set_geometry("geometry")
gdf1 = gdf1.set_crs(epsg=4326)

gdf2 = gpd.GeoDataFrame.from_records(records2)
gdf2["geometry"] = gdf2["geometry"].apply(shapely.wkt.loads)
gdf2 = gdf2.set_geometry("geometry")
gdf2 = gdf2.set_crs(epsg=4326)

gdf3 = gpd.GeoDataFrame.from_records(records3)
gdf3["geometry"] = gdf3["geometry"].apply(shapely.wkt.loads)
gdf3 = gdf3.set_geometry("geometry")
gdf3 = gdf3.set_crs(epsg=4326)

In [80]:
GeometryCollection

shapely.geometry.collection.GeometryCollection

In [48]:
import datetime
from typing import Any, Dict, Optional, Set, Tuple, Type, Union, get_args

import geopandas as gpd
import msgspec
import numpy as np
import pandas as pd
import shapely.wkt
from shapely.geometry import GeometryCollection, LineString, Point
import shapely


# Custom encoding and decoding functions
def encode_custom(obj):
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    elif isinstance(obj, (GeometryCollection)):
        return obj.wkt
    elif isinstance(obj, BoundingBox):
        return obj.to_dict()
    elif isinstance(obj, BaseModel):
        return obj.to_dict()
    raise TypeError(f"Type {type(obj)} not supported")


def decode_custom(type, obj):
    if type is datetime.datetime:
        return datetime.datetime.fromisoformat(obj)
    elif type is GeometryCollection:
        return shapely.wkt.loads(obj)
    elif type is Point:
        return shapely.wkt.loads(obj)
    return obj


class BoundingBox(msgspec.Struct):
    xmax: float = np.nan
    xmin: float = np.nan
    ymax: float = np.nan
    ymin: float = np.nan

    def to_dict(self):
        msgspec.structs.asdict(self)


class BaseModel(
    msgspec.Struct,
    tag=True,
    tag_field="type",
    dict=True,
    omit_defaults=True,
    repr_omit_defaults=True,
):

    @property
    def __field_types__(self) -> Dict[str, Type]:
        """Retrieve a dictionary of field names and their types for fields that are defined."""
        field_types = {}

        for field in msgspec.structs.fields(self):
            # Retrieve the base type, ignoring Optional if present
            base_type = get_args(field.type)[0] if get_args(field.type) else field.type
            # Include the field type only if it has been defined
            if field.name in self.__defined_struct_fields__:
                field_types[field.name] = base_type

        return field_types

    @classmethod
    def null(cls) -> "BaseModel":
        """Create an instance of the class with null values for each field based on its type."""
        null_values = {}

        # Iterate through each field and set appropriate null values
        for field in msgspec.structs.fields(cls):
            field_type = field.type

            # Define null values based on type
            if field_type == str or field_type == Optional[str]:
                null_values[field.name] = ""
            elif (
                field_type == int
                or field_type == float
                or field_type == Optional[int]
                or field_type == Optional[float]
            ):
                null_values[field.name] = np.nan
            elif field_type == bool or field_type == Optional[bool]:
                null_values[field.name] = False
            elif field_type == pd.Timestamp or field_type == Optional[pd.Timestamp]:
                null_values[field.name] = pd.NaT
            elif (
                field_type == GeometryCollection
                or field_type == Optional[GeometryCollection]
            ):
                null_values[field.name] = GeometryCollection()
            elif field_type == BoundingBox or field_type == Optional[BoundingBox]:
                null_values[field.name] = BoundingBox()
            else:

                null_values[field.name] = None

        return cls(**null_values)

    def encode(self) -> bytes:
        encoder = msgspec.json.Encoder(enc_hook=encode_custom)
        return encoder.encode(self)

    def decode(self, data: bytes):
        decoder = msgspec.json.Decoder(ModelUnion, dec_hook=decode_custom)
        return decoder.decode(data)

    def to_dict(self) -> dict:
        """Convert instance to a dictionary without type conversions."""
        return msgspec.structs.asdict(self)

    def to_json(self) -> str:
        return self.encode().decode()

    def to_meta(self):
        return self.__field_types__

    def empty_frame(self) -> "gpd.GeoDataFrame":
        """Create an empty GeoDataFrame with the appropriate column types as defined by to_meta."""
        column_types = {}

        # Iterate through field types and assign appropriate dtypes or placeholders
        for col, dtype in self.to_meta().items():
            if dtype in {float, int}:
                column_types[col] = float  # Use float as a generic numeric type
            elif dtype == str:
                column_types[col] = object  # Use object for strings
            elif dtype == bool:
                column_types[col] = bool
            elif dtype == pd.Timestamp:
                column_types[col] = "datetime64[ns]"
            elif issubclass(dtype, GeometryCollection):
                column_types[col] = GeometryCollection()  # Empty geometry placeholder
            else:
                column_types[col] = object  # Fallback for any unsupported dtype

        # Initialize empty data with specified types or placeholders
        empty_data = {
            col: pd.Series(dtype=col_type) if col_type != GeometryCollection() else []
            for col, col_type in column_types.items()
        }

        return gpd.GeoDataFrame(empty_data, geometry="geometry", crs="EPSG:4326")

    def to_frame(self) -> "gpd.GeoDataFrame":
        """Convert to GeoDataFrame based on the currently defined fields."""
        return gpd.GeoDataFrame(
            [{field: getattr(self, field) for field in self.__defined_struct_fields__}],
            geometry="geometry",
            crs="EPSG:4326",
        )

    @property
    def __defined_struct_fields__(self) -> Tuple[str, ...]:
        """Returns a tuple of fields that have been explicitly defined with non-default values."""
        defined_fields = []

        for field in msgspec.structs.fields(self):
            field_name = field.name
            default_value = field.default
            current_value = getattr(self, field_name, None)

            if current_value != default_value:
                defined_fields.append(field_name)

        return tuple(defined_fields)


class Transect(BaseModel):
    transect_id: str
    geometry: GeometryCollection
    lon: Optional[float] = None
    lat: Optional[float] = None
    bearing: Optional[float] = None
    osm_coastline_is_closed: Optional[bool] = None
    osm_coastline_length: Optional[int] = None
    utm_epsg: Optional[int] = None
    bbox: Optional[BoundingBox] = None
    quadkey: Optional[str] = None
    continent: Optional[str] = None
    country: Optional[str] = None
    common_country_name: Optional[str] = None
    common_region_name: Optional[str] = None


# Union of all possible classes for decoding
ModelUnion = Union[Transect]

transect = Transect(transect_id="a", geometry=LineString([(0, 0), (1, 1)]), bearing=40)
transect

Transect(transect_id='a', geometry=<LINESTRING (0 0, 1 1)>, bearing=40)

In [49]:
from typing import Any, Dict, Optional, Tuple, Type, get_args

import geopandas as gpd
import msgspec
import numpy as np
import pandas as pd
from shapely.geometry import GeometryCollection, LineString
import shapely


# Custom encoding and decoding functions
def encode_custom(obj):
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    elif isinstance(obj, (GeometryCollection)):
        return obj.wkt
    elif isinstance(obj, BoundingBox):
        return obj.to_dict()
    elif isinstance(obj, BaseModel):
        return obj.to_dict()
    raise TypeError(f"Type {type(obj)} not supported")


def decode_custom(type, obj):
    if type is datetime.datetime:
        return datetime.datetime.fromisoformat(obj)
    elif type is GeometryCollection:
        return shapely.wkt.loads(obj)
    elif type is Point:
        return shapely.wkt.loads(obj)
    return obj


class BaseModel(
    msgspec.Struct,
    tag=True,
    tag_field="type",
    dict=True,
    omit_defaults=True,
    repr_omit_defaults=True,
):
    @property
    def __field_types__(self) -> Dict[str, Type]:
        """Retrieve a dictionary of field names and their types for fields that are defined."""
        field_types = {}

        for field in msgspec.structs.fields(self):
            # Retrieve the base type, ignoring Optional if present
            base_type = get_args(field.type)[0] if get_args(field.type) else field.type
            # Include the field type only if it has been defined
            if field.name in self.__defined_struct_fields__:
                field_types[field.name] = base_type

        return field_types

    @classmethod
    def null(cls) -> "BaseModel":
        """Create an instance of the class with null values for each field based on its type."""
        null_values = {}

        for field in msgspec.structs.fields(cls):
            field_type = field.type
            # Extract base type if Optional or generic
            base_type = get_args(field_type)[0] if get_args(field_type) else field_type

            if base_type == str:
                null_values[field.name] = ""
            elif base_type in {int, float}:
                null_values[field.name] = np.nan
            elif base_type == bool:
                null_values[field.name] = False
            elif base_type == pd.Timestamp:
                null_values[field.name] = pd.NaT
            elif issubclass(base_type, GeometryCollection):
                null_values[field.name] = GeometryCollection()
            elif issubclass(base_type, BaseModel):
                null_values[field.name] = base_type.null()
            else:
                null_values[field.name] = None

        return cls(**null_values)

    def to_dict(self) -> dict:
        """Convert instance to a dictionary, ensuring custom fields like BoundingBox are serialized correctly."""
        data = msgspec.structs.asdict(self)
        for key, value in data.items():
            if isinstance(value, BaseModel):
                data[key] = value.to_dict()
        return data

    def encode(self) -> bytes:
        encoder = msgspec.json.Encoder(enc_hook=encode_custom)
        return encoder.encode(self)

    def decode(self, data: bytes):
        decoder = msgspec.json.Decoder(ModelUnion, dec_hook=decode_custom)
        return decoder.decode(data)

    def to_json(self) -> str:
        return self.encode().decode()

    def to_meta(self):
        return self.__field_types__

    def to_meta(self) -> Dict[str, Type]:
        """Generate a dictionary with field types, casting custom and unsupported types to 'object'."""
        field_types = {}

        for field_name, field_type in self.__field_types__.items():
            if field_type in {float, int, str, bool, pd.Timestamp}:
                # Standard types are added as-is
                field_types[field_name] = field_type
            elif issubclass(field_type, (GeometryCollection, LineString)):
                # Geometry types are assigned a placeholder for pandas compatibility
                field_types[field_name] = object
            elif issubclass(field_type, BaseModel):
                # Nested BaseModel instances like BoundingBox are cast to object
                field_types[field_name] = object
            else:
                # Any other unsupported types are also cast to object
                field_types[field_name] = object

        return field_types

    def empty_frame(self) -> "gpd.GeoDataFrame":
        """Create an empty GeoDataFrame with the appropriate column types as defined by to_meta."""
        column_types = {}

        for col, dtype in self.to_meta().items():
            if dtype in {float, int}:
                column_types[col] = float
            elif dtype == str:
                column_types[col] = object
            elif dtype == bool:
                column_types[col] = bool
            elif dtype == pd.Timestamp:
                column_types[col] = "datetime64[ns]"
            else:
                column_types[col] = object  # Default to object for unsupported types

        empty_data = {
            col: pd.Series(dtype=col_type) if col_type != GeometryCollection() else []
            for col, col_type in column_types.items()
        }

        return gpd.GeoDataFrame(empty_data, geometry="geometry", crs="EPSG:4326")

    def to_frame(self) -> "gpd.GeoDataFrame":
        """Convert to GeoDataFrame based on the currently defined fields."""
        data = {}
        for field in self.__defined_struct_fields__:
            value = getattr(self, field)
            # Convert BoundingBox or any BaseModel field to dictionary format
            if isinstance(value, BaseModel):
                data[field] = value.to_dict()
            else:
                data[field] = value

        return gpd.GeoDataFrame([data], geometry="geometry", crs="EPSG:4326")

    @property
    def __defined_struct_fields__(self) -> Tuple[str, ...]:
        """Returns a tuple of fields that have been explicitly defined with non-default values."""
        defined_fields = []

        for field in msgspec.structs.fields(self):
            field_name = field.name
            default_value = field.default
            current_value = getattr(self, field_name, None)

            if current_value != default_value:
                defined_fields.append(field_name)

        return tuple(defined_fields)


class BoundingBox(BaseModel):
    xmax: float
    xmin: float
    ymax: float
    ymin: float


class Transect(BaseModel):
    transect_id: str
    geometry: LineString
    lon: Optional[float] = None
    lat: Optional[float] = None
    bearing: Optional[float] = None
    osm_coastline_is_closed: Optional[bool] = None
    osm_coastline_length: Optional[int] = None
    utm_epsg: Optional[int] = None
    bbox: Optional[BoundingBox] = None
    quadkey: Optional[str] = None
    continent: Optional[str] = None
    country: Optional[str] = None
    common_country_name: Optional[str] = None
    common_region_name: Optional[str] = None


# Union of all possible classes for decoding
ModelUnion = Union[BoundingBox, Transect]

linestring = LineString([[45, 55], [34, 57]])
transect = Transect(
    transect_id="a", geometry=linestring, bearing=40, bbox=linestring.bounds
)
transect

Transect(transect_id='a', geometry=<LINESTRING (45 55, 34 57)>, bearing=40, bbox=(34.0, 55.0, 45.0, 57.0))

In [64]:
import datetime
from typing import Any, Dict, Optional, Tuple, Type, Union, get_args

import geopandas as gpd
import msgspec
import numpy as np
import pandas as pd
from shapely.geometry import GeometryCollection, LineString, Point
import shapely


# Custom encoding and decoding functions
def encode_custom(obj):
    """Encode custom data types for serialization."""
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    elif isinstance(obj, (GeometryCollection, LineString, Point)):
        return obj.wkt
    elif isinstance(obj, (BoundingBox, BaseModel)):
        return obj.to_dict()
    raise TypeError(f"Type {type(obj)} not supported")


def decode_custom(type, obj):
    """Decode custom data types for deserialization."""
    if type is datetime.datetime:
        return datetime.datetime.fromisoformat(obj)
    elif type in {GeometryCollection, LineString, Point}:
        return shapely.wkt.loads(obj)
    elif type is BoundingBox:
        # Handle dict input format for BoundingBox
        if isinstance(obj, dict):
            return BoundingBox(**obj)
    elif isinstance(obj, dict) and "bbox" in obj:
        bbox_val = obj["bbox"]
        if isinstance(bbox_val, list) and len(bbox_val) == 4:
            obj["bbox"] = BoundingBox(xmin=bbox_val[0], ymin=bbox_val[1], xmax=bbox_val[2], ymax=bbox_val[3])
    return obj

class BaseModel(
    msgspec.Struct,
    tag=True,
    tag_field="type",
    dict=True,
    omit_defaults=True,
    repr_omit_defaults=True,
):
    @property
    def __defined_struct_fields__(self) -> Tuple[str, ...]:
        """Return tuple of fields explicitly defined with non-default values."""
        defined_fields = [
            field.name
            for field in msgspec.structs.fields(self)
            if getattr(self, field.name, None) != field.default
        ]
        return tuple(defined_fields)

    @property
    def __field_types__(self) -> Dict[str, Type]:
        """Return a dictionary of field names and their types."""
        field_types = {}
        for field in msgspec.structs.fields(self):
            base_type = get_args(field.type)[0] if get_args(field.type) else field.type
            if field.name in self.__defined_struct_fields__:
                field_types[field.name] = base_type
        return field_types

    @classmethod
    def null(cls) -> "BaseModel":
        """Create an instance with null values for each field."""
        null_values = {}
        for field in msgspec.structs.fields(cls):
            base_type = get_args(field.type)[0] if get_args(field.type) else field.type
            if base_type == str:
                null_values[field.name] = ""
            elif base_type in {int, float}:
                null_values[field.name] = np.nan
            elif base_type == bool:
                null_values[field.name] = False
            elif base_type == pd.Timestamp:
                null_values[field.name] = pd.NaT
            elif issubclass(base_type, (GeometryCollection, LineString, Point)):
                null_values[field.name] = GeometryCollection()
            elif issubclass(base_type, BaseModel):
                null_values[field.name] = base_type.null()
            else:
                null_values[field.name] = None
        return cls(**null_values)

    def to_dict(self) -> dict:
        """Convert instance to dictionary format."""
        data = msgspec.structs.asdict(self)
        for key, value in data.items():
            if isinstance(value, BaseModel):
                data[key] = value.to_dict()
        return data

    def encode(self) -> bytes:
        """Encode instance as JSON bytes."""
        encoder = msgspec.json.Encoder(enc_hook=encode_custom)
        return encoder.encode(self)

    def decode(self, data: bytes):
        """Decode JSON bytes to an instance."""
        decoder = msgspec.json.Decoder(ModelUnion, dec_hook=decode_custom)
        return decoder.decode(data)

    def to_json(self) -> str:
        """Encode instance as JSON string."""
        return self.encode().decode()

    def to_meta(self) -> Dict[str, Type]:
        """Generate a dictionary with field types for metadata."""
        field_types = {}
        for field_name, field_type in self.__field_types__.items():
            if field_type in {float, int, str, bool, pd.Timestamp}:
                field_types[field_name] = field_type
            elif issubclass(field_type, (GeometryCollection, LineString)):
                field_types[field_name] = object
            elif issubclass(field_type, BaseModel):
                field_types[field_name] = object
            else:
                field_types[field_name] = object
        return field_types

    def empty_frame(self) -> "gpd.GeoDataFrame":
        """Create an empty GeoDataFrame with appropriate column types."""
        column_types = {
            col: (
                float
                if dtype in {float, int}
                else "datetime64[ns]" if dtype == pd.Timestamp else object
            )
            for col, dtype in self.to_meta().items()
        }
        empty_data = {
            col: pd.Series(dtype=col_type) if col_type != GeometryCollection() else []
            for col, col_type in column_types.items()
        }
        return gpd.GeoDataFrame(empty_data, geometry="geometry", crs="EPSG:4326")

    def to_frame(self) -> "gpd.GeoDataFrame":
        """Convert instance to GeoDataFrame format."""
        data = {
            field: (
                getattr(self, field).to_dict()
                if isinstance(getattr(self, field), BaseModel)
                else getattr(self, field)
            )
            for field in self.__defined_struct_fields__
        }
        return gpd.GeoDataFrame([data], geometry="geometry", crs="EPSG:4326")

    @classmethod
    def from_json(cls, json_str: str) -> "BaseModel":
        """Create an instance from JSON string."""
        decoder = msgspec.json.Decoder(cls, dec_hook=decode_custom)
        return decoder.decode(json_str.encode())

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "BaseModel":
        """Create an instance from a dictionary."""
        return cls(**data)

    @classmethod
    def from_frame(cls, frame: gpd.GeoDataFrame) -> "BaseModel":
        """Create an instance from a GeoDataFrame row."""
        data = frame.iloc[0].to_dict()
        return cls.from_dict(data)


class BoundingBox(BaseModel):
    xmax: float
    xmin: float
    ymax: float
    ymin: float


class Transect(BaseModel):
    transect_id: str
    geometry: LineString
    lon: Optional[float] = None
    lat: Optional[float] = None
    bearing: Optional[float] = None
    osm_coastline_is_closed: Optional[bool] = None
    osm_coastline_length: Optional[int] = None
    utm_epsg: Optional[int] = None
    bbox: Optional[BoundingBox] = None
    quadkey: Optional[str] = None
    continent: Optional[str] = None
    country: Optional[str] = None
    common_country_name: Optional[str] = None
    common_region_name: Optional[str] = None


# Union of all possible classes for decoding
ModelUnion = Union[BoundingBox, Transect]

# Testing instantiation
linestring = LineString([[45, 55], [34, 57]])
transect = Transect(
    transect_id="a",
    geometry=linestring,
    bearing=40,
    bbox=[34, 55, 45, 57],
)

In [71]:

class Base(msgspec.Struct, kw_only=True):
    a: str = ""
    b: int

class Subclass(Base):
    c: float
    d: bytes = b""

class SubSubClass(Subclass):
    e: str
    f: int = 3



TypeError: Required field 'e' cannot follow optional fields. Either reorder the struct fields, or set `kw_only=True` in the struct definition.

In [75]:
from sklearn.preprocessing import OneHotEncoder
from typing import Literal

SHORE_TYPE_OPTIONS = [
    "sandy_gravel_or_small_boulder_sediments",
    "muddy_sediments",
    "rocky_shore_platform_or_large_boulders",
    "no_sediment_or_shore_platform",
    "ice_or_tundra",
]

# Dynamically create Literal and encoding from the options
SHORE_TYPE = Literal[tuple(SHORE_TYPE_OPTIONS)]
SHORE_TYPE_ENCODING = {value: index for index, value in enumerate(SHORE_TYPE_OPTIONS)}

# Create a one-hot encoder instance
one_hot_encoder = OneHotEncoder(categories=[SHORE_TYPE_OPTIONS])
one_hot_encoder.fit([[type] for type in SHORE_TYPE_OPTIONS])


In [78]:
# Create a one-hot encoder instance with sparse=False
one_hot_encoder = OneHotEncoder(categories=[SHORE_TYPE_OPTIONS], sparse_output=False)
one_hot_encoder.fit([[type] for type in SHORE_TYPE_OPTIONS])

# Transform the value
one_hot_encoded = one_hot_encoder.transform([[shore_type_value]])[0]  # Directly get the row
print("One-hot encoded array:", one_hot_encoded.tolist())


One-hot encoded array: [0.0, 0.0, 1.0, 0.0, 0.0]


In [65]:
transect_json = Transect.from_frame(transect.to_frame()).to_json()
# transect_round_trip = Transect.from_json(transect_json)

# print("Original Transect:", transect)
# print("Deserialized Transect:", transect_round_trip)

In [66]:
transect_json

'{"type":"Transect","transect_id":"a","geometry":"LINESTRING (45 55, 34 57)","bearing":40,"bbox":[34,55,45,57]}'

In [67]:
Transect.from_json(transect_json)

ValidationError: Expected `object | null`, got `array` - at `$.bbox`

In [43]:
Transect.from_json(Transect.from_frame(transect.to_frame()).to_json())

TypeError: Type 'list[float, float, float, float]' is not supported

In [9]:
linestring = LineString([[45, 55], [34, 57]])
transect = Transect(
    transect_id="a", geometry=linestring, bearing=40, bbox=linestring.bounds
)
transect.to_dict()

{'transect_id': 'a',
 'geometry': <LINESTRING (45 55, 34 57)>,
 'lon': None,
 'lat': None,
 'bearing': 40,
 'osm_coastline_is_closed': None,
 'osm_coastline_length': None,
 'utm_epsg': None,
 'bbox': (34.0, 55.0, 45.0, 57.0),
 'quadkey': None,
 'continent': None,
 'country': None,
 'common_country_name': None,
 'common_region_name': None}

In [194]:
transect.to_meta()

{'transect_id': str,
 'geometry': object,
 'lon': float,
 'lat': float,
 'bearing': float,
 'osm_coastline_is_closed': bool,
 'osm_coastline_length': int,
 'utm_epsg': int,
 'bbox': object,
 'quadkey': str,
 'continent': str,
 'country': str,
 'common_country_name': str,
 'common_region_name': str}

In [176]:
BoundingBox(*linestring.bounds).to_dict()

{'xmax': 34.0, 'xmin': 45.0, 'ymax': 44.0, 'ymin': 60.0}

In [187]:
Transect.null().empty_frame().dtypes

transect_id       object
geometry        geometry
bounding_box      object
dtype: object

In [159]:
import dask_geopandas

ddf = dask_geopandas.read_parquet(
    "az://gcts/release/2024-08-02/*.parquet", storage_options=storage_options
).partitions[0]
record = ddf.sample(frac=0.00001).compute().sample(1)

In [171]:
record[["transect_id", "geometry", "bbox"]]

Unnamed: 0,transect_id,geometry,bbox
470707,cl48876s00tr00021541,"LINESTRING (80.3399 73.51191, 80.38929 73.50075)","{'xmax': 80.38929455335335, 'xmin': 80.3398952..."


In [162]:
record.bbox.item()

{'xmax': 80.38929455335335,
 'xmin': 80.33989523811559,
 'ymax': 73.5119124693236,
 'ymin': 73.50075488694772}

In [146]:
linestring = LineString([[34, 45], [44, 60]])
Transect(transect_id="a", geometry=linestring, bounding_box=linestring.bounds).to_meta()

{'transect_id': str,
 'geometry': shapely.geometry.collection.GeometryCollection,
 'bounding_box': __main__.BoundingBox}

In [147]:
from typing import Any, Dict, Optional, Tuple, Type, get_args

import geopandas as gpd
import msgspec
import numpy as np
import pandas as pd
from shapely.geometry import GeometryCollection, LineString


class BaseModel(
    msgspec.Struct,
    tag=True,
    tag_field="type",
    dict=True,
    omit_defaults=True,
    repr_omit_defaults=True,
):
    @property
    def __field_types__(self) -> Dict[str, Type]:
        """Retrieve a dictionary of field names and their types for fields that are defined."""
        field_types = {}

        for field in msgspec.structs.fields(self):
            # Retrieve the base type, ignoring Optional if present
            base_type = get_args(field.type)[0] if get_args(field.type) else field.type
            field_types[field.name] = base_type

        return field_types

    @classmethod
    def null(cls) -> "BaseModel":
        """Create an instance of the class with null values for each field based on its type."""
        null_values = {}

        for field in msgspec.structs.fields(cls):
            field_type = field.type

            if field_type == str or field_type == Optional[str]:
                null_values[field.name] = ""
            elif field_type in {int, float, Optional[int], Optional[float]}:
                null_values[field.name] = np.nan
            elif field_type == bool or field_type == Optional[bool]:
                null_values[field.name] = False
            elif field_type == pd.Timestamp or field_type == Optional[pd.Timestamp]:
                null_values[field.name] = pd.NaT
            elif issubclass(field_type, GeometryCollection):
                null_values[field.name] = GeometryCollection()
            elif issubclass(field_type, BaseModel):
                null_values[field.name] = (
                    field_type.null()
                )  # Recursively create null instance for nested BaseModel subclasses
            else:
                null_values[field.name] = None

        return cls(**null_values)

    def to_dict(self) -> dict:
        """Convert instance to a dictionary without type conversions."""
        return msgspec.structs.asdict(self)

    def to_meta(self) -> Dict[str, Type]:
        """Generate a dictionary with field types, casting custom and unsupported types to 'object'."""
        field_types = {}

        for field_name, field_type in self.__field_types__.items():
            if field_type in {float, int, str, bool, pd.Timestamp}:
                # Standard types are added as-is
                field_types[field_name] = field_type
            elif issubclass(field_type, (GeometryCollection, LineString)):
                # Geometry types are assigned a placeholder for pandas compatibility
                field_types[field_name] = object
            elif issubclass(field_type, BaseModel):
                # Nested BaseModel instances like BoundingBox are cast to object
                field_types[field_name] = object
            else:
                # Any other unsupported types are also cast to object
                field_types[field_name] = object

        return field_types

    def empty_frame(self) -> "gpd.GeoDataFrame":
        """Create an empty GeoDataFrame with the appropriate column types as defined by to_meta."""
        column_types = {}

        for col, dtype in self.to_meta().items():
            if dtype in {float, int}:
                column_types[col] = float
            elif dtype == str:
                column_types[col] = object
            elif dtype == bool:
                column_types[col] = bool
            elif dtype == pd.Timestamp:
                column_types[col] = "datetime64[ns]"
            else:
                column_types[col] = object  # Default to object for unsupported types

        empty_data = {
            col: pd.Series(dtype=col_type) if col_type != GeometryCollection() else []
            for col, col_type in column_types.items()
        }

        return gpd.GeoDataFrame(empty_data, geometry="geometry", crs="EPSG:4326")

    def to_frame(self) -> "gpd.GeoDataFrame":
        """Convert to GeoDataFrame based on the currently defined fields."""
        return gpd.GeoDataFrame(
            [{field: getattr(self, field) for field in self.__defined_struct_fields__}],
            geometry="geometry",
            crs="EPSG:4326",
        )

    @property
    def __defined_struct_fields__(self) -> Tuple[str, ...]:
        """Returns a tuple of fields that have been explicitly defined with non-default values."""
        defined_fields = []

        for field in msgspec.structs.fields(self):
            field_name = field.name
            default_value = field.default
            current_value = getattr(self, field_name, None)

            if current_value != default_value:
                defined_fields.append(field_name)

        return tuple(defined_fields)


class BoundingBox(BaseModel):
    xmax: float
    xmin: float
    ymax: float
    ymin: float


# Example subclass using BoundingBox with LineString for geometry
class Transect(BaseModel):
    transect_id: str
    geometry: LineString
    bounding_box: BoundingBox


# Testing `to_meta` and `empty_frame` for Transect with LineString geometry
transect = Transect.null()
print(transect.to_meta())
empty_gdf = transect.empty_frame()
print(empty_gdf)

{'transect_id': <class 'str'>, 'geometry': <class 'object'>, 'bounding_box': <class 'object'>}
Empty GeoDataFrame
Columns: [transect_id, geometry, bounding_box]
Index: []


In [149]:
transect.to_frame()

Unnamed: 0,transect_id,geometry,bounding_box
0,,,"BoundingBox(xmax=nan, xmin=nan, ymax=nan, ymin..."


In [49]:
msgspec.structs.FieldInfo??

[0;31mInit signature:[0m
[0mmsgspec[0m[0;34m.[0m[0mstructs[0m[0;34m.[0m[0mFieldInfo[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencode_name[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtype[0m[0;34m:[0m [0mAny[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdefault[0m[0;34m:[0m [0mAny[0m [0;34m=[0m [0;34m<[0m[0mfactory[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdefault_factory[0m[0;34m:[0m [0mAny[0m [0;34m=[0m [0;34m<[0m[0mfactory[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mFieldInfo[0m[0;34m([0m[0mStruct[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""A record describing a field in a struct type.[0m
[0;34m[0m
[0;34m    Parameters[0m
[0;34m    ----------[0m
[0;34m    name: str[0m
[0;34m        The field name as seen b

In [47]:
# I am not familiar yet with the library msgspec, however, I want to use msgspec to inpect my 
# transect object that are defined as a msgspec struct (baseclass as provided below). Provide m
# with a list of commonly used functions and tools to inspect the msgspec structs. Carefully think
# about what is contained in this library. I want to know for example which fields are defined. 
# And what their structure and datatypes are. 
msgspec.structs.

SyntaxError: invalid syntax (92606505.py, line 6)

In [101]:
import typing

typing.get_args(msgspec.structs.fields(transect)[3].type)[0]

float

In [None]:
msgspec.structs.

In [42]:
transect.get_defined_fields()

{'bbox',
 'bearing',
 'common_country_name',
 'common_region_name',
 'continent',
 'country',
 'geometry',
 'lat',
 'lon',
 'osm_coastline_is_closed',
 'osm_coastline_length',
 'quadkey',
 'transect_id',
 'utm_epsg'}

In [None]:
import datetime
from typing import Any, Dict, Optional, Union

import geopandas as gpd
import msgspec
import numpy as np
import pandas as pd
from shapely.geometry import GeometryCollection, LineString, Point


# Define a BoundingBox class to represent the bbox field
class BoundingBox(msgspec.Struct):
    xmax: float
    xmin: float
    ymax: float
    ymin: float

    def to_dict(self):
        return {
            "xmax": self.xmax,
            "xmin": self.xmin,
            "ymax": self.ymax,
            "ymin": self.ymin,
        }

    @classmethod
    def from_defaults(cls):
        return cls(xmax=np.nan, xmin=np.nan, ymax=np.nan, ymin=np.nan)


# Base class with tagging enabled
class BaseModel(msgspec.Struct, tag=True, tag_field="type", dict=True):
    def encode(self) -> bytes:
        encoder = msgspec.json.Encoder(enc_hook=encode_custom)
        return encoder.encode(self)

    def decode(self, data: bytes):
        decoder = msgspec.json.Decoder(ModelUnion, dec_hook=decode_custom)
        return decoder.decode(data)

    def to_dict(self) -> dict:
        return {
            field: getattr(self, field)
            for field in self.__struct_fields__
            if hasattr(self, field)
        }

    def to_json(self) -> str:
        return self.encode().decode()

    def to_frame(self) -> gpd.GeoDataFrame:
        defined_fields = self.get_defined_fields()
        return gpd.GeoDataFrame(
            [{field: getattr(self, field) for field in defined_fields}],
            geometry="geometry",
            crs="EPSG:4326",
        )

    @classmethod
    def to_meta(cls) -> Dict[str, Any]:
        """Return a dictionary with field names and their data types using __annotations__."""
        meta = {}
        for field, field_type in cls.__annotations__.items():
            if field_type == str:
                meta[field] = pd.StringDtype()
            elif field_type == int:
                meta[field] = "int64"
            elif field_type == bool:
                meta[field] = "bool"
            elif field_type == float:
                meta[field] = "float64"
            elif field_type in (LineString, Point):
                meta[field] = "geometry"
            elif field_type == datetime.datetime:
                meta[field] = "datetime64[ns]"
            elif field_type == Optional[BoundingBox]:
                meta[field] = BoundingBox
            else:
                meta[field] = "object"
        return meta

    @classmethod
    def null_dict(cls) -> Dict[str, Any]:
        """Create a dictionary with null values based on field types."""
        null_values = {}
        meta = cls.to_meta()
        for field_name, dtype in meta.items():
            if dtype == pd.StringDtype():
                null_values[field_name] = ""
            elif dtype in ("int64", "float64"):
                null_values[field_name] = np.nan
            elif dtype == "bool":
                null_values[field_name] = False
            elif dtype == "geometry":
                null_values[field_name] = GeometryCollection()
            elif dtype == "datetime64[ns]":
                null_values[field_name] = pd.NaT
            else:
                null_values[field_name] = None
        return null_values

    @classmethod
    def empty_frame(cls) -> gpd.GeoDataFrame:
        meta = cls.to_meta()
        empty_data = {col: pd.Series(dtype=dtype) for col, dtype in meta.items()}
        return gpd.GeoDataFrame(empty_data, geometry="geometry", crs="EPSG:4326")

    def get_defined_fields(self):
        """Get fields with non-None values."""
        return {
            field
            for field in self.__struct_fields__
            if getattr(self, field) is not None
        }


class Transect(BaseModel):
    transect_id: str
    geometry: LineString
    lon: Optional[float] = 0
    lat: Optional[float] = 0
    bearing: Optional[float] = None
    osm_coastline_is_closed: Optional[bool] = None
    osm_coastline_length: Optional[int] = None
    utm_epsg: Optional[int] = None
    bbox: Optional[BoundingBox] = None
    quadkey: Optional[str] = None
    continent: Optional[str] = None
    country: Optional[str] = None
    common_country_name: Optional[str] = None
    common_region_name: Optional[str] = None


# Union of all possible classes for decoding
ModelUnion = Union[Transect]

# Testing the solution
transect = Transect(transect_id="a", geometry=LineString([(0, 0), (1, 1)]))
print("Metadata:", Transect.to_meta())
print("Null Dictionary:", Transect.null_dict())

In [None]:
class Transect:
    def __init__(self, data=None):
        self.data = data or {}

    # Instance method
    def to_method(self):
        return f"Called on instance with data: {self.data}"

    # Class method
    @classmethod
    def to_meta(cls):
        return "Called from class method without an instance"

In [None]:
import datetime
import typing
from typing import Any, Dict, Optional, Union, get_args, get_origin

import geopandas as gpd
import msgspec
import numpy as np
import pandas as pd
from shapely.geometry import GeometryCollection, LineString, Point


# Define a BoundingBox class to represent the bbox field
class BoundingBox(msgspec.Struct):
    xmax: float
    xmin: float
    ymax: float
    ymin: float

    def to_dict(self):
        return {
            "xmax": self.xmax,
            "xmin": self.xmin,
            "ymax": self.ymax,
            "ymin": self.ymin,
        }

    @classmethod
    def from_defaults(cls):
        return cls(xmax=np.nan, xmin=np.nan, ymax=np.nan, ymin=np.nan)


# Custom encoding and decoding functions
def encode_custom(obj):
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    elif isinstance(obj, (Point, LineString)):
        return obj.wkt
    elif isinstance(obj, BoundingBox):
        return obj.to_dict()
    elif isinstance(obj, BaseModel):
        return obj.to_dict()
    raise TypeError(f"Type {type(obj)} not supported")


def decode_custom(type, obj):
    if type is datetime.datetime:
        return datetime.datetime.fromisoformat(obj)
    elif type is LineString:
        return shapely.wkt.loads(obj)
    elif type is Point:
        return shapely.wkt.loads(obj)
    return obj


# Base class with tagging enabled
class BaseModel(msgspec.Struct, tag=True, tag_field="type", dict=True):
    def encode(self) -> bytes:
        encoder = msgspec.json.Encoder(enc_hook=encode_custom)
        return encoder.encode(self)

    def decode(self, data: bytes):
        decoder = msgspec.json.Decoder(ModelUnion, dec_hook=decode_custom)
        return decoder.decode(data)

    def to_dict(self) -> dict:
        return {
            field: getattr(self, field)
            for field in self.__struct_fields__
            if hasattr(self, field)
        }

    def to_json(self) -> str:
        return self.encode().decode()

    def to_frame(self) -> gpd.GeoDataFrame:
        defined_fields = self.get_defined_fields()
        return gpd.GeoDataFrame(
            [{field: getattr(self, field) for field in defined_fields}],
            geometry="geometry",
            crs="EPSG:4326",
        )

    def to_meta(self) -> Dict[str, Any]:
        """Return a dictionary with field names and their resolved data types using `__annotations__`."""
        meta = {}
        defined_fields = self.get_defined_fields()
        print(defined_fields)
        # Use typing.get_type_hints to extract type hints, handling Optional types by extracting the base type
        type_hints = typing.get_type_hints(self)

        for field, field_type in type_hints.items():
            if field in type_hints:
                # Check if the field type is Optional and extract the non-None type
                if get_origin(field_type) is Union:
                    # Extract the non-None type from Union if it exists
                    field_type = [
                        t for t in get_args(field_type) if t is not type(None)
                    ][0]

                # Map field types to pandas-compatible types
                if field_type == str:
                    meta[field] = pd.StringDtype()
                elif field_type == int:
                    meta[field] = "int64"
                elif field_type == bool:
                    meta[field] = "bool"
                elif field_type == float:
                    meta[field] = "float64"
                elif field_type in (LineString, Point):
                    meta[field] = "geometry"
                elif field_type == datetime.datetime:
                    meta[field] = "datetime64[ns]"
                elif field_type == BoundingBox:
                    meta[field] = BoundingBox
                else:
                    meta[field] = "object"

        return meta

    @classmethod
    def to_meta(cls) -> Dict[str, Any]:
        """Return a dictionary with field names and their resolved data types using `__annotations__`."""
        meta = {}

        # Use typing.get_type_hints to extract type hints, handling Optional types by extracting the base type
        type_hints = typing.get_type_hints(cls)

        for field, field_type in type_hints.items():
            print("again")

            # Check if the field type is Optional and extract the non-None type
            if get_origin(field_type) is Union:
                # Extract the non-None type from Union if it exists
                field_type = [t for t in get_args(field_type) if t is not type(None)][0]

            # Map field types to pandas-compatible types
            if field_type == str:
                meta[field] = pd.StringDtype()
            elif field_type == int:
                meta[field] = "int64"
            elif field_type == bool:
                meta[field] = "bool"
            elif field_type == float:
                meta[field] = "float64"
            elif field_type in (LineString, Point):
                meta[field] = "geometry"
            elif field_type == datetime.datetime:
                meta[field] = "datetime64[ns]"
            elif field_type == BoundingBox:
                meta[field] = BoundingBox
            else:
                meta[field] = "object"

        return meta

    @classmethod
    def null_dict(cls) -> Dict[str, Any]:
        """Create a dictionary with null values based on field types."""
        null_values = {}
        meta = cls.to_meta()
        for field_name, dtype in meta.items():
            if dtype == pd.StringDtype():
                null_values[field_name] = ""
            elif dtype in ("int64", "float64"):
                null_values[field_name] = np.nan
            elif dtype == "bool":
                null_values[field_name] = False
            elif dtype == "geometry":
                null_values[field_name] = GeometryCollection()
            elif dtype == "datetime64[ns]":
                null_values[field_name] = pd.NaT
            else:
                null_values[field_name] = None
        return null_values

    @classmethod
    def empty_frame(cls) -> gpd.GeoDataFrame:
        meta = cls.to_meta()
        empty_data = {col: pd.Series(dtype=dtype) for col, dtype in meta.items()}
        return gpd.GeoDataFrame(empty_data, geometry="geometry", crs="EPSG:4326")

    def get_defined_fields(self):
        """Get fields with non-None values."""
        return {
            field
            for field in self.__struct_fields__
            if getattr(self, field) is not None
        }


class Transect(BaseModel):
    transect_id: str
    geometry: LineString
    lon: Optional[float] = 0
    lat: Optional[float] = 0
    bearing: Optional[float] = None
    osm_coastline_is_closed: Optional[bool] = None
    osm_coastline_length: Optional[int] = None
    utm_epsg: Optional[int] = None
    bbox: Optional[BoundingBox] = None
    quadkey: Optional[str] = None
    continent: Optional[str] = None
    country: Optional[str] = None
    common_country_name: Optional[str] = None
    common_region_name: Optional[str] = None


# Union of all possible classes for decoding
ModelUnion = Union[Transect]

# Testing the solution
transect = Transect(transect_id="a", geometry=LineString([(0, 0), (1, 1)]))
print("Metadata:", Transect.to_meta())
print("Null Dictionary:", Transect.null_dict())

In [None]:
Transect.to_meta()

In [None]:
transect.to_meta()

In [144]:
import typing

print(typing.get_type_hints(Transect))
print("")


typing.get_args(typing.get_type_hints(Transect)["country"])

{'transect_id': <class 'str'>, 'geometry': <class 'shapely.geometry.linestring.LineString'>, 'lon': typing.Optional[float], 'lat': typing.Optional[float], 'bearing': typing.Optional[float], 'osm_coastline_is_closed': typing.Optional[bool], 'osm_coastline_length': typing.Optional[int], 'utm_epsg': typing.Optional[int], 'bbox': typing.Optional[__main__.BoundingBox], 'quadkey': typing.Optional[str], 'continent': typing.Optional[str], 'country': typing.Optional[str], 'common_country_name': typing.Optional[str], 'common_region_name': typing.Optional[str]}



(str, NoneType)

In [None]:
# Union of all possible classes for decoding
ModelUnion = Union[TransectOrigin, Transect]
transect = Transect.from_defaults()

In [146]:
type_hints = typing.get_type_hints(Transect)
type_hints

{'transect_id': str,
 'geometry': shapely.geometry.linestring.LineString,
 'lon': typing.Optional[float],
 'lat': typing.Optional[float],
 'bearing': typing.Optional[float],
 'osm_coastline_is_closed': typing.Optional[bool],
 'osm_coastline_length': typing.Optional[int],
 'utm_epsg': typing.Optional[int],
 'bbox': typing.Optional[__main__.BoundingBox],
 'quadkey': typing.Optional[str],
 'continent': typing.Optional[str],
 'country': typing.Optional[str],
 'common_country_name': typing.Optional[str],
 'common_region_name': typing.Optional[str]}

In [77]:
import datetime
from typing import Any, Optional, Union

import geopandas as gpd
import msgspec
import numpy as np
import pandas as pd
import shapely.wkt
from shapely.geometry import GeometryCollection, LineString, Point


# Define a BoundingBox class to represent the bbox field
class BoundingBox(msgspec.Struct):
    xmax: float
    xmin: float
    ymax: float
    ymin: float

    def to_dict(self):
        return {
            "xmax": self.xmax,
            "xmin": self.xmin,
            "ymax": self.ymax,
            "ymin": self.ymin,
        }

    @classmethod
    def from_defaults(cls):
        """Create a default BoundingBox instance with NaN values for each boundary."""
        return cls(xmax=np.nan, xmin=np.nan, ymax=np.nan, ymin=np.nan)


# Custom encoding and decoding functions
def encode_custom(obj):
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    elif isinstance(obj, (Point, LineString)):
        return obj.wkt
    elif isinstance(obj, BoundingBox):
        return obj.to_dict()
    elif isinstance(obj, BaseModel):
        return obj.to_dict()
    raise TypeError(f"Type {type(obj)} not supported")


def decode_custom(type, obj):
    if type is datetime.datetime:
        return datetime.datetime.fromisoformat(obj)
    elif type is LineString:
        return shapely.wkt.loads(obj)
    elif type is Point:
        return shapely.wkt.loads(obj)
    return obj


# Base class with tagging enabled
class BaseModel(msgspec.Struct, tag=True, tag_field="type"):
    def encode(self) -> bytes:
        encoder = msgspec.json.Encoder(enc_hook=encode_custom)
        return encoder.encode(self)

    def decode(self, data: bytes):
        decoder = msgspec.json.Decoder(ModelUnion, dec_hook=decode_custom)
        return decoder.decode(data)

    def to_dict(self) -> dict:
        """Convert instance to a dictionary without type conversions."""
        return {field: getattr(self, field) for field in self.__struct_fields__}

    def to_json(self) -> str:
        return self.encode().decode()

    def to_frame(self) -> gpd.GeoDataFrame:
        """Convert to GeoDataFrame based on the currently defined fields."""
        included_fields = (
            self._defined_fields
            if hasattr(self, "_defined_fields")
            else self.__struct_fields__
        )
        return gpd.GeoDataFrame(
            [{field: getattr(self, field) for field in included_fields}],
            geometry="geometry",
            crs="EPSG:4326",
        )

    @classmethod
    def to_meta(cls) -> dict:
        """Return a dictionary with field names and their data types, respecting defined fields if available."""
        if hasattr(cls, "_defined_fields") and cls._defined_fields:
            fields_to_include = cls._defined_fields
        else:
            fields_to_include = cls.__struct_fields__

        meta = {}
        for field in fields_to_include:
            field_type = cls.__annotations__[field]
            if field_type == str:
                meta[field] = pd.StringDtype()
            elif field_type == int:
                meta[field] = "int64"
            elif field_type == bool:
                meta[field] = "bool"
            elif field_type == float:
                meta[field] = "float64"
            elif field_type == LineString or field_type == Point:
                meta[field] = "geometry"
            elif field_type == datetime.datetime:
                meta[field] = "datetime64[ns]"
            elif field_type == BoundingBox:
                meta[field] = BoundingBox
            else:
                meta[field] = "object"
        return meta

    @classmethod
    def empty_frame(cls) -> gpd.GeoDataFrame:
        """Create an empty GeoDataFrame based on metadata for all fields."""
        meta = cls.to_meta()
        empty_data = {col: pd.Series(dtype=dtype) for col, dtype in meta.items()}
        return gpd.GeoDataFrame(empty_data, geometry="geometry", crs="EPSG:4326")


# Transect class with conditional fields based on initialization
class Transect(BaseModel):
    transect_id: str
    geometry: LineString
    lon: Optional[float] = None
    lat: Optional[float] = None
    bearing: Optional[float] = None
    osm_coastline_is_closed: Optional[bool] = None
    osm_coastline_length: Optional[int] = None
    utm_epsg: Optional[int] = None
    bbox: Optional[BoundingBox] = None
    quadkey: Optional[str] = None
    continent: Optional[str] = None
    country: Optional[str] = None
    common_country_name: Optional[str] = None
    common_region_name: Optional[str] = None

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._defined_fields = set(
            kwargs.keys()
        )  # Track fields set during initialization


# Union of all possible classes for decoding
ModelUnion = Union[TransectOrigin, Transect]

# Example usage
if __name__ == "__main__":
    transect = Transect(transect_id="a", geometry=LineString([(0, 0), (1, 1)]))
    print("Metadata with specified fields:", transect.to_meta())

    transect_all_fields = Transect.from_defaults()
    print("Metadata with all fields:", transect_all_fields.to_meta())

    empty_frame = Transect.empty_frame()
    print("\nEmpty GeoDataFrame with all fields:\n", empty_frame)

TypeError: Struct types cannot define __init__

In [78]:
import datetime
from typing import Any, Optional, Union

import geopandas as gpd
import msgspec
import numpy as np
import pandas as pd
import shapely.wkt
from shapely.geometry import GeometryCollection, LineString, Point


# Define a BoundingBox class to represent the bbox field
class BoundingBox(msgspec.Struct):
    xmax: float
    xmin: float
    ymax: float
    ymin: float

    def to_dict(self):
        return {
            "xmax": self.xmax,
            "xmin": self.xmin,
            "ymax": self.ymax,
            "ymin": self.ymin,
        }

    @classmethod
    def from_defaults(cls):
        """Create a default BoundingBox instance with NaN values for each boundary."""
        return cls(xmax=np.nan, xmin=np.nan, ymax=np.nan, ymin=np.nan)


# Custom encoding and decoding functions
def encode_custom(obj):
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    elif isinstance(obj, (Point, LineString)):
        return obj.wkt
    elif isinstance(obj, BoundingBox):
        return obj.to_dict()
    elif isinstance(obj, BaseModel):
        return obj.to_dict()
    raise TypeError(f"Type {type(obj)} not supported")


def decode_custom(type, obj):
    if type is datetime.datetime:
        return datetime.datetime.fromisoformat(obj)
    elif type is LineString:
        return shapely.wkt.loads(obj)
    elif type is Point:
        return shapely.wkt.loads(obj)
    return obj


# Base class with tagging enabled
class BaseModel(msgspec.Struct, tag=True, tag_field="type"):
    def encode(self) -> bytes:
        encoder = msgspec.json.Encoder(enc_hook=encode_custom)
        return encoder.encode(self)

    def decode(self, data: bytes):
        decoder = msgspec.json.Decoder(ModelUnion, dec_hook=decode_custom)
        return decoder.decode(data)

    def to_dict(self) -> dict:
        """Convert instance to a dictionary without type conversions."""
        return {field: getattr(self, field) for field in self.__struct_fields__}

    def to_json(self) -> str:
        return self.encode().decode()

    def to_frame(self) -> gpd.GeoDataFrame:
        """Convert to GeoDataFrame based on the currently defined fields."""
        included_fields = getattr(self, "_defined_fields", self.__struct_fields__)
        return gpd.GeoDataFrame(
            [{field: getattr(self, field) for field in included_fields}],
            geometry="geometry",
            crs="EPSG:4326",
        )

    @classmethod
    def to_meta(cls) -> dict:
        """Return a dictionary with field names and their data types, respecting defined fields if available."""
        fields_to_include = getattr(cls, "_defined_fields", cls.__struct_fields__)
        meta = {}
        for field in fields_to_include:
            field_type = cls.__annotations__[field]
            if field_type == str:
                meta[field] = pd.StringDtype()
            elif field_type == int:
                meta[field] = "int64"
            elif field_type == bool:
                meta[field] = "bool"
            elif field_type == float:
                meta[field] = "float64"
            elif field_type == LineString or field_type == Point:
                meta[field] = "geometry"
            elif field_type == datetime.datetime:
                meta[field] = "datetime64[ns]"
            elif field_type == BoundingBox:
                meta[field] = BoundingBox
            else:
                meta[field] = "object"
        return meta

    @classmethod
    def empty_frame(cls) -> gpd.GeoDataFrame:
        """Create an empty GeoDataFrame based on metadata for all fields."""
        meta = cls.to_meta()
        empty_data = {col: pd.Series(dtype=dtype) for col, dtype in meta.items()}
        return gpd.GeoDataFrame(empty_data, geometry="geometry", crs="EPSG:4326")


# Transect class with conditional fields based on initialization
class Transect(BaseModel):
    transect_id: str
    geometry: LineString
    lon: Optional[float] = msgspec.field(default=None)
    lat: Optional[float] = msgspec.field(default=None)
    bearing: Optional[float] = msgspec.field(default=None)
    osm_coastline_is_closed: Optional[bool] = msgspec.field(default=None)
    osm_coastline_length: Optional[int] = msgspec.field(default=None)
    utm_epsg: Optional[int] = msgspec.field(default=None)
    bbox: Optional[BoundingBox] = msgspec.field(default=None)
    quadkey: Optional[str] = msgspec.field(default=None)
    continent: Optional[str] = msgspec.field(default=None)
    country: Optional[str] = msgspec.field(default=None)
    common_country_name: Optional[str] = msgspec.field(default=None)
    common_region_name: Optional[str] = msgspec.field(default=None)

    def __post_init__(self):
        """Post-init processing to track which fields were explicitly provided."""
        self._defined_fields = {
            f for f in self.__struct_fields__ if getattr(self, f) is not None
        }


# Union of all possible classes for decoding
ModelUnion = Union[Transect]

# Example usage
if __name__ == "__main__":
    transect = Transect(transect_id="a", geometry=LineString([(0, 0), (1, 1)]))
    print("Metadata with specified fields:", transect.to_meta())

    transect_all_fields = Transect()
    print("Metadata with all fields:", transect_all_fields.to_meta())

    empty_frame = Transect.empty_frame()
    print("\nEmpty GeoDataFrame with all fields:\n", empty_frame)

AttributeError: 'Transect' object has no attribute '_defined_fields'

In [89]:
msgspec.field(default=3, name="t") = None

SyntaxError: cannot assign to function call here. Maybe you meant '==' instead of '='? (3742688804.py, line 1)