In [1]:
import xarray as xr
import re

# Option 1: a localized attribute is the base attribute name plus a "_<suffix>",
# and locale_others lists "<suffix>: <locale>" pairs separated by spaces.
ds_opt1 = xr.Dataset(attrs={
"locale_default":"en-CA",
"locale_others": "fr: fr-CA es: es-MX jp: jp-JP",
"title": "English Title",
"title_fr": "Titre française",
"title_es": "Título en español",
# Deliberately a title from much deeper in the Basic Multilingual Plane.
# NOTE(review): the registered language subtag for Japanese is "ja"; "jp" is a region code — confirm.
"title_jp": "日本語のタイトル",
})

# Option 2: a localized attribute is named "<locale>:<attribute>".
# My strongest argument for option 2 is that everything comes from a controlled list or standard,
# except the colon (though maybe that comes from XML).
# I personally want to avoid the underscore as a separator because it is already used in normal attribute names.
# NOTE(review): the registered language subtag for Japanese is "ja"; "jp" is a region code — confirm.
ds_opt2 = xr.Dataset(attrs={
"locale_default":"en-CA", # standard BCP 47 locale string
"locale_others": "fr-CA, es-MX, jp, tlh", # HTTP-style language priority list (without weights)
"title": "English Title",
"fr-CA:title": "Titre française", # two controlled vocabularies separated by ":"
"es-MX:title": "Título en español", # two controlled vocabularies separated by ":"
"jp:title": "日本語のタイトル", # W3C says to use the shortest tag needed
"tlh:title": "Heghlu’meH QaQ jajvam", # Klingon: Today is a good day to die
})

# Option 3: a localized attribute is named "<attribute>[<locale>]".
# NOTE(review): the registered language subtag for Japanese is "ja"; "jp" is a region code — confirm.
ds_opt3 = xr.Dataset(attrs={
"locale_default":"en-CA", # standard BCP 47 locale string
"locale_others": "fr-CA es-MX jp tlh", # no spaces other than the delimiting ones
"title": "English Title",
"title[fr-CA]": "Titre française", # two controlled vocabularies, the locale inside []
"title[es-MX]": "Título en español",
"title[jp]": "日本語のタイトル", # W3C says to use the shortest tag needed
"title[tlh]": "Heghlu’meH QaQ jajvam", # Klingon: Today is a good day to die
})

In [2]:
def rfc4647_lookup(locale:str, options, default="en"):
    """Return the entry of ``options`` that best matches ``locale``.

    Loosely follows the RFC 4647 "lookup" scheme
    (https://datatracker.ietf.org/doc/html/rfc4647#section-3.4): the requested
    tag is progressively truncated from the right until it matches something.
    As an extension beyond the RFC, the available options are truncated too,
    so a request for "fr-FR" can still resolve to an available "fr-CA".

    Matching is case-insensitive, as the standard requires.

    Parameters
    ----------
    locale : str
        The requested language tag, e.g. "fr-CA".
    options : iterable of str
        The language tags that are actually available. Empty strings are
        ignored.
    default : str
        Returned when nothing matches.

    Returns
    -------
    str
        The matching element of ``options`` in its original spelling, or
        ``default``.
    """
    # Keep exact tags and truncated forms apart so that a truncated form of
    # one option (e.g. "fr" derived from "fr-CA") can never shadow another
    # option that is exactly that tag ("fr").
    exact = {}
    truncated = {}
    for option in options:
        if not option:
            # e.g. the "" left over from splitting an empty attribute value
            continue
        option_tokens = option.lower().split("-")
        exact["-".join(option_tokens)] = option
        option_tokens = option_tokens[:-1]
        while option_tokens:
            truncated["-".join(option_tokens)] = option
            option_tokens = option_tokens[:-1]

    # Exact entries are merged last so they take precedence on collisions.
    lowered_options = {**truncated, **exact}

    tokens = locale.lower().split("-")
    while tokens:
        if (match := "-".join(tokens)) in lowered_options:
            return lowered_options[match]
        tokens = tokens[:-1]
    return default

In [3]:
def get_localized_attrs_opt1(attrs, locale):
    """Return ``attrs`` localized for ``locale`` using the option 1 layout.

    Option 1 stores translations as ``<name>_<suffix>`` and declares the
    suffixes in ``locale_others`` as space-separated ``<suffix>: <locale>``
    pairs. Suffixed attributes are dropped from the result; every remaining
    attribute takes its translated value when one exists for the matched
    locale and keeps its own value otherwise.
    """
    default = attrs.get("locale_default", "en")
    _others = attrs.get("locale_others", "")

    # Lookup table: locale -> "_<suffix>".
    others = dict()

    # There is no single clean delimiter between pairs, so use a regex.
    # A raw string keeps "\S" from being treated as a string escape.
    for match in re.finditer(r"([a-zA-Z]+: \S+)", _others):
        # The pair is written "suffix: locale"; we want it inverted.
        # Split only on the first colon so a colon inside the locale part
        # cannot break the two-way unpacking.
        value, key = (m.strip() for m in match.group(0).split(":", 1))
        # Storing the "_" with the suffix makes the lookups below simpler.
        others[key] = f"_{value}"

    matched_locale = rfc4647_lookup(locale, {default, *others.keys()})

    # None when the match is the default locale (or nothing matched), in
    # which case the untranslated values are returned.
    suffix = others.get(matched_locale)
    localized_suffixes = tuple(others.values())

    filtered = {}
    for key, value in attrs.items():
        if key.endswith(localized_suffixes):
            # ignore localized strings
            continue
        if suffix is None:
            filtered[key] = value
        else:
            filtered[key] = attrs.get(f"{key}{suffix}", value)
    return filtered

In [4]:
def get_localized_attrs_opt2(attrs, locale):
    """Return ``attrs`` localized for ``locale`` using the option 2 layout.

    Option 2 stores translations as ``<locale>:<name>`` and lists the
    available locales in ``locale_others`` as a comma-separated list.
    Attributes whose name contains ":" are dropped from the result; every
    remaining attribute takes its translated value when one exists for the
    matched locale and keeps its own value otherwise.
    """
    default = attrs.get("locale_default", "en")
    # Drop empty entries so a missing attribute or a trailing comma does not
    # add "" as a candidate locale.
    others = [
        other.strip()
        for other in attrs.get("locale_others", "").split(",")
        if other.strip()
    ]

    # Find the available locale that best matches the one the user requested.
    matched_locale = rfc4647_lookup(locale, {default, *others})

    filtered = {}
    for key, value in attrs.items():
        if ":" in key:
            # ignore localized strings
            # ":" is a reserved char in localized files, only localized attrs can have it
            continue
        # Both parts of the attribute name come from controlled lists, so the
        # localized key can be constructed directly.
        filtered[key] = attrs.get(f"{matched_locale}:{key}", value)
    return filtered

In [21]:
def get_localized_attrs_opt3(attrs, locale):
    """Return ``attrs`` localized for ``locale`` using the option 3 layout.

    Option 3 stores translations as ``<name>[<locale>]`` and lists the
    available locales in ``locale_others`` separated by whitespace.
    Attributes whose name contains both "[" and "]" are dropped from the
    result; every remaining attribute takes its translated value when one
    exists for the matched locale and keeps its own value otherwise.
    """
    default = attrs.get("locale_default", "en")
    # split() with no argument is simpler than option 2 and, unlike
    # split(" "), yields no empty entries for an empty value or repeated spaces.
    others = attrs.get("locale_others", "").split()

    matched_locale = rfc4647_lookup(locale, {default, *others})

    filtered = {}
    for key, value in attrs.items():
        if "[" in key and "]" in key:
            # ignore localized strings
            continue
        # Both parts of the attribute name come from controlled lists, so the
        # localized key can be constructed directly, like in option 2.
        filtered[key] = attrs.get(f"{key}[{matched_locale}]", value)
    return filtered

In [6]:
# Exact match: "fr-CA" is listed verbatim in locale_others
get_localized_attrs_opt1(ds_opt1.attrs, "fr-CA")

{'locale_default': 'en-CA',
 'locale_others': 'fr: fr-CA es: es-MX jp: jp-JP',
 'title': 'Titre française'}

In [7]:
# Language only: "fr" matches the truncated form of the available "fr-CA"
get_localized_attrs_opt1(ds_opt1.attrs, "fr")

{'locale_default': 'en-CA',
 'locale_others': 'fr: fr-CA es: es-MX jp: jp-JP',
 'title': 'Titre française'}

In [8]:
# Closest French match: "fr-FR" is not available, so the request is truncated to "fr",
# which matches the truncated form of the available "fr-CA".
# I don't think truncating the available tags is quite RFC 4647, but it is probably useful.
get_localized_attrs_opt1(ds_opt1.attrs, "fr-FR")

{'locale_default': 'en-CA',
 'locale_others': 'fr: fr-CA es: es-MX jp: jp-JP',
 'title': 'Titre française'}

In [9]:
# Option 2 should work the same way for an exact match
get_localized_attrs_opt2(ds_opt2.attrs, "fr-CA")

{'locale_default': 'en-CA',
 'locale_others': 'fr-CA, es-MX, jp, tlh',
 'title': 'Titre française'}

In [10]:
# Language only: "es" matches the truncated form of the available "es-MX"
get_localized_attrs_opt2(ds_opt2.attrs, "es")

{'locale_default': 'en-CA',
 'locale_others': 'fr-CA, es-MX, jp, tlh',
 'title': 'Título en español'}

In [11]:
# Unrecognized locale string: nothing matches, so the untranslated values are returned
get_localized_attrs_opt2(ds_opt2.attrs, "something")

{'locale_default': 'en-CA',
 'locale_others': 'fr-CA, es-MX, jp, tlh',
 'title': 'English Title'}

In [12]:
# More specific request than what is available: "jp-JP" is truncated to the listed "jp"
get_localized_attrs_opt2(ds_opt2.attrs, "jp-JP")

{'locale_default': 'en-CA',
 'locale_others': 'fr-CA, es-MX, jp, tlh',
 'title': '日本語のタイトル'}

In [13]:
# Option 3, exact match on a language-only tag
get_localized_attrs_opt3(ds_opt3.attrs, "tlh")

{'locale_default': 'en-CA',
 'locale_others': 'fr-CA es-MX jp tlh',
 'title': 'Heghlu’meH QaQ jajvam'}

In [14]:
# Option 3, exact match on a language-region tag
get_localized_attrs_opt3(ds_opt3.attrs, "fr-CA")

{'locale_default': 'en-CA',
 'locale_others': 'fr-CA es-MX jp tlh',
 'title': 'Titre française'}

In [15]:
# Write option 1 to disk so the stored attribute names can be inspected
ds_opt1.to_netcdf("option1.nc")

In [16]:
# Show the file as text; the suffixed attribute names appear unescaped
!ncdump option1.nc

netcdf option1 {

// global attributes:
		:locale_default = "en-CA" ;
		:locale_others = "fr: fr-CA es: es-MX jp: jp-JP" ;
		:title = "English Title" ;
		string :title_fr = "Titre française" ;
		string :title_es = "Título en español" ;
		string :title_jp = "日本語のタイトル" ;
}


In [17]:
# Write option 2 to disk so the stored attribute names can be inspected
ds_opt2.to_netcdf('option2.nc')

In [18]:
# Show the file as text; note that ":" in attribute names is printed backslash-escaped
!ncdump option2.nc

netcdf option2 {

// global attributes:
		:locale_default = "en-CA" ;
		:locale_others = "fr-CA, es-MX, jp, tlh" ;
		:title = "English Title" ;
		string :fr-CA\:title = "Titre française" ;
		string :es-MX\:title = "Título en español" ;
		string :jp\:title = "日本語のタイトル" ;
		string :tlh\:title = "Heghlu’meH QaQ jajvam" ;
}


In [19]:
# Write option 3 to disk so the stored attribute names can be inspected
ds_opt3.to_netcdf('option3.nc')

In [20]:
# Show the file as text; note that "[" and "]" in attribute names are printed backslash-escaped
!ncdump option3.nc

netcdf option3 {

// global attributes:
		:locale_default = "en-CA" ;
		:locale_others = "fr-CA es-MX jp tlh" ;
		:title = "English Title" ;
		string :title\[fr-CA\] = "Titre française" ;
		string :title\[es-MX\] = "Título en español" ;
		string :title\[jp\] = "日本語のタイトル" ;
		string :title\[tlh\] = "Heghlu’meH QaQ jajvam" ;
}
