# Experiment in using Variables to Capture Localization information

In [1]:
import xarray as xr

In [2]:
# imported from my other example as is
def rfc4647_lookup(locale:str, options, default="en"):
    """Pick the entry of ``options`` that best matches the requested ``locale``.

    Loosely implements the "Lookup" scheme of RFC 4647 section 3.4
    (https://datatracker.ietf.org/doc/html/rfc4647#section-3.4): the requested
    tag is truncated one trailing subtag at a time until it matches a candidate.
    Going beyond the RFC, every leading prefix of each option is also treated
    as a candidate, so a request for "fr" can resolve to an available "fr-CA".

    Matching is case insensitive, as per the standard; the value returned is
    the option exactly as it was spelled in ``options``.

    An option that matches a (truncated) request exactly always wins over an
    option that only matches through one of its prefixes, regardless of the
    order of ``options``.  When several options share the same prefix, the
    last one in iteration order wins.

    Not implemented: the "*" wildcard and the RFC rule that drops a trailing
    single-letter subtag together with the subtag that follows it.

    Parameters
    ----------
    locale : str
        The requested language tag, e.g. "fr-CA".
    options : iterable of str
        The available language tags (no full type hints on purpose).
    default : str
        Returned when nothing matches.

    Returns
    -------
    str
        The matching element of ``options``, or ``default``.
    """
    exact = {}
    prefixes = {}
    for option in options:
        option_tokens = option.lower().split("-")
        exact["-".join(option_tokens)] = option
        # every proper leading prefix ("fr" for "fr-CA") is a weaker candidate
        for end in range(len(option_tokens) - 1, 0, -1):
            prefixes["-".join(option_tokens[:end])] = option

    # Exact tags are merged last so that they override any prefix of another
    # option that happens to spell the same tag.
    candidates = {**prefixes, **exact}

    tokens = locale.lower().split("-")
    while tokens:
        if (match := "-".join(tokens)) in candidates:
            return candidates[match]
        tokens = tokens[:-1]
    return default

In [3]:
# Global localizations: one data-less variable per locale whose attributes
# carry the translated global attributes.
# NOTE: `vars` shadows the Python builtin of the same name from here on.
# NOTE(review): "jp" does not look like the ISO 639 language code for
# Japanese ("ja") — confirm whether this tag is intended.
vars = {
    var_name: xr.DataArray(attrs={"locale": tag, "title": title})
    for var_name, tag, title in (
        ("locale_fr_ca", "fr-CA", "Titre française"),
        ("locale_es_mx", "es-MX", "Título en español"),
        ("locale_jp", "jp", "日本語のタイトル"),
        ("locale_tlh", "tlh", "Heghlu’meH QaQ jajvam"),
    )
}
per_var = xr.Dataset(
    data_vars=vars,
    attrs={
        "locale": "en-CA",  # standard BCP 47 locale string
        "localizations": " ".join(vars),  # no other spaces than delimiting ones
        "title": "English Title",
    },
)

# make a dummy data var to try variable level localization
# have the locale be different than global
data_var = xr.DataArray(
    attrs={
        "locale": "jp",
        "localizations": "data_var_localization",
        "title": "塩分濃度",
    }
)
data_var_localization = xr.DataArray(
    attrs={
        "locale": "en",
        "title": "salinity",
    }
)
per_var["data_var"] = data_var
per_var["data_var_localization"] = data_var_localization

# Localized data?
char_var = xr.DataArray(
    ["サンディエゴ", "ハワイ"],
    attrs={
        "locale": "jp",
        "localizations": "char_var_localization",
        "localized_data": 1,  # some boolean
        "title": "地名",
    },
)
char_var_localization = xr.DataArray(
    ["San Diego", "Hawaii"],
    attrs={
        "locale": "en",
        "title": "Locations",
    },
)
per_var["char_var"] = char_var
per_var["char_var_localization"] = char_var_localization

In [4]:
# write the dataset to a netCDF file so its on-disk form can be inspected
per_var.to_netcdf("per_var.nc")

In [5]:
!ncdump per_var.nc

netcdf per_var {
dimensions:
	dim_0 = 2 ;
variables:
	double locale_fr_ca ;
		locale_fr_ca:_FillValue = NaN ;
		locale_fr_ca:locale = "fr-CA" ;
		string locale_fr_ca:title = "Titre française" ;
	double locale_es_mx ;
		locale_es_mx:_FillValue = NaN ;
		locale_es_mx:locale = "es-MX" ;
		string locale_es_mx:title = "Título en español" ;
	double locale_jp ;
		locale_jp:_FillValue = NaN ;
		locale_jp:locale = "jp" ;
		string locale_jp:title = "日本語のタイトル" ;
	double locale_tlh ;
		locale_tlh:_FillValue = NaN ;
		locale_tlh:locale = "tlh" ;
		string locale_tlh:title = "Heghlu’meH QaQ jajvam" ;
	double data_var ;
		data_var:_FillValue = NaN ;
		data_var:locale = "jp" ;
		data_var:localizations = "data_var_localization" ;
		string data_var:title = "塩分濃度" ;
	double data_var_localization ;
		data_var_localization:_FillValue = NaN ;
		data_var_localization:locale = "en" ;
		data_var_localization:title = "salinity" ;
	string char_var(dim_0) ;
		char_var:locale = "jp" ;
		char_var:localizations = "ch

In [6]:
def localize_(ds: xr.Dataset, locale, var_name = None):
    """Localize one set of attributes of a dataset to ``locale``.

    With ``var_name=None`` the global (dataset) attributes are localized,
    otherwise the attributes of the variable ``var_name``.  The target's
    "localizations" attribute is read as a whitespace separated list of
    variable names; each of those variables must carry a "locale" attribute
    and holds the translated attributes.  The best matching one is chosen with
    ``rfc4647_lookup``, falling back to the target's own "locale" attribute
    ("en" when it has none), in which case nothing is translated.

    When a variable is localized and it has a "localized_data" attribute, its
    values are replaced by the values of the matched localization variable as
    well, and "localized_data" is removed.

    The referenced localization variables and the "localizations" attribute
    are removed from the result in every case.  A target without a
    "localizations" attribute is returned unlocalized instead of raising.

    Parameters
    ----------
    ds : xr.Dataset
        Dataset to localize; the work is done on ``ds.copy()``.
    locale : str
        Requested language tag.
    var_name : hashable, optional
        Name of the variable to localize; ``None`` selects the global attributes.

    Returns
    -------
    xr.Dataset
        The localized dataset.

    Raises
    ------
    KeyError
        If a name listed in "localizations" is not a variable of ``ds`` or that
        variable has no "locale" attribute.
    """
    ds = ds.copy()
    if var_name is None: # global case
        source_attrs = ds.attrs
    else:
        source_attrs = ds[var_name].attrs
    default = source_attrs.get("locale", "en")
    # split() without an argument drops empty entries, so a missing attribute
    # or doubled/trailing spaces do not produce a lookup of the name "".
    localization_vars = source_attrs.get("localizations", "").split()

    # make a locale to var mapping:
    localizations = {}
    for name in localization_vars:
        candidate = ds[name]
        localizations[candidate.attrs["locale"]] = candidate

    # There is probably a locale matching library (or maybe giant regex)
    # basically find the best matching locale from the user requested one to one that exists in the actual file
    matched_locale = rfc4647_lookup(locale, localizations.keys(), default)
    if matched_locale != default:
        localized = localizations[matched_locale]
        if var_name is None:
            ds.attrs.update(localized.attrs)
        else:
            if ds[var_name].attrs.get("localized_data") is not None:
                ds[var_name].values = localized.values
                del ds[var_name].attrs["localized_data"]
            ds[var_name].attrs.update(localized.attrs)

    # assume the user doesn't want localization info retained
    ds = ds.drop_vars(localization_vars)
    # pop() rather than del: the attribute may legitimately be absent
    if var_name is None:
        ds.attrs.pop("localizations", None)
    else:
        ds[var_name].attrs.pop("localizations", None)
    return ds
    
def get_localized_attrs(ds, locale):
    """Localize the global attributes, then every variable declaring localizations.

    The global attributes are handled first through ``localize_`` without a
    variable name.  Each variable of that result whose "localizations"
    attribute is not None is then localized in turn, every call working on
    the dataset returned by the previous one.

    Returns the dataset produced by the last ``localize_`` call.
    """
    localized = localize_(ds, locale)  # global attributes
    # Collect the variable names up front; `localized` is rebound inside the loop.
    pending = list(localized.filter_by_attrs(localizations=lambda v: v is not None))
    for name in pending:
        localized = localize_(localized, locale, name)
    return localized

In [7]:
# the entire dataset is busy in this example
# but note that none of the attribute names or variable names break any existing
# character limitations in ERDDAP/current CF
per_var

In [8]:
# show the attrs of the variable before localization: its own locale is "jp"
# and "localizations" names the variable holding the "en" translation
per_var.data_var.attrs

{'locale': 'jp', 'localizations': 'data_var_localization', 'title': '塩分濃度'}

In [9]:
# Request English. None of the global localizations matches "en", so the global
# attributes keep their "en-CA" values, while data_var and char_var pick up
# their "en" localizations.
# Note the variable attributes and values in the case of char var
en = get_localized_attrs(per_var, "en")
en

In [10]:
# attrs now come from data_var_localization (locale "en", title "salinity")
en.data_var

In [11]:
# char_var carries "localized_data", so its values are replaced by the English
# ones in addition to its attrs
en.char_var

In [12]:
# fallback to defaults if a locale is not found:
# the global attributes resolve "fr" to the "fr-CA" localization, but data_var
# and char_var only offer "en" and therefore keep their default "jp" metadata
fr = get_localized_attrs(per_var, "fr")

In [13]:
# no "fr" localization exists for data_var, so its "jp" attrs are kept
fr.data_var

In [14]:
# likewise char_var keeps its original values and "jp" attrs
fr.char_var