Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 57 additions & 92 deletions tide/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from bigtree import dict_to_tree, levelordergroup_iter
from bigtree.node import node
from typing import TypeVar
import re

T = TypeVar("T", bound=node.Node)

Expand Down Expand Up @@ -145,136 +144,102 @@ def get_data_col_names_from_root(data_root):
][-1]


def find_cols_with_tide_tags(
data_columns: pd.Index | list[str], request: str
) -> list[str]:
request_parts = request.split("__")

if not (1 <= len(request_parts) <= 4):
raise ValueError(
f"Request '{request}' is malformed. "
f"Use 'name__unit__bloc__sub_bloc' format or a "
f"combination of these tags."
)

full_tag_col_map = {
col_name_tag_enrichment(col, get_tags_max_level(data_columns)): col
for col in data_columns
}

def find_exact_match(search_str, target):
pattern = rf"(?:^|__)(?:{re.escape(search_str)})(?:$|__)"
match = re.search(pattern, target)
return match is not None

return [
full_tag_col_map[augmented_col]
for augmented_col in full_tag_col_map.keys()
if all(find_exact_match(part, augmented_col) for part in request_parts)
]


def find_cols_multiple_tag_groups(
data_columns: pd.Index | list[str], request: str
) -> list[str]:
request_parts = request.split("|")
list_to_return = []
for req in request_parts:
list_to_return.extend(find_cols_with_tide_tags(data_columns, req))
return list_to_return


def tide_request(
data_columns: pd.Index | list[str], request: str | pd.Index | list[str] = None
data_columns: pd.Index | list[str],
request: str | list[str] | pd.Index | None = None,
) -> list[str]:
"""
Select columns by matching structured TIDE-style tags.

Filters column names based on a TIDE-style structured tag syntax. Columns are
expected to use a naming convention with double underscores (`__`) separating
tags.
Column names follow the format:

name__unit__bloc__sub_bloc

Tags are separated by double underscores ("__"). Not all levels are required.

Before matching, column names are automatically enriched to the maximum
tag depth present in `data_columns`. Missing tag levels are filled using
DEFAULT_TAGS, ensuring consistent hierarchical comparison.

A column name can include up to four hierarchical parts:
'name__unit__bloc__sub_bloc' where each part is optional, but must be separated
with double underscores.
The `request` argument defines tag queries:

The `request` argument allows searching for columns matching one or more
of these parts using full or partial tag patterns. Multiple tag patterns
can be combined using the pipe `|` character to form OR conditions.
- Tags are separated by "__"
- OR conditions are separated by "|"
- Multiple request entries are OR-combined
- Matching is exact per tag part (no substring matching)

Parameters
----------
data_columns : pandas.Index or list of str
A collection of column names to filter. Each column name should follow
the TIDE format (e.g., "sensor__°C__bloc1").
Collection of column names using TIDE-style tagging.

request : str or list of str or pandas.Index, optional
Tag(s) to match against the column names. Each tag string may be:
request : str or list[str] or pandas.Index, optional
Tag query expression(s). Each expression may contain:

- A full structured tag (e.g., "name__°C__bloc2")
- A full tag path (e.g., "name__°C__bloc2")
- A partial tag (e.g., "°C", "bloc1")
- A group of tags separated by "|" (e.g., "kWh|°C")
- OR groups separated by "|" (e.g., "kWh|°C")

If None, all columns from `data_columns` are returned.
If None, all columns are returned.

Returns
-------
list of str
The list of column names that match any of the provided tag queries.
list[str]
Column names matching at least one request expression.
Order is preserved and duplicates are removed.

Notes
-----
- Matching is done per tag part, not substrings. For instance, the query
"bloc1" will match "name__°C__bloc1" but not "bloc11".
- If multiple requests are given, columns are returned if they match
at least one of them (logical OR).
- Tags can include between 1 and 4 parts, split by `__`.
- Matching is performed on enriched tag representations.
- Default tag values (e.g., "OTHER") may be injected during enrichment.
- Matching is exact at tag level, not substring-based.
- Requests may contain between 1 and 4 tag levels.

Examples
--------
>>> DF_COLUMNS = [
... "name_1__°C__bloc1",
... "name_1__°C__bloc2",
... "name_2",
... "name_2__DIMENSIONLESS__bloc2",
... "name_3__kWh/m²",
... "name_5__kWh",
... "name4__DIMENSIONLESS__bloc4",
... ]

>>> tide_request(DF_COLUMNS)
['name_1__°C__bloc1', 'name_1__°C__bloc2', 'name_2',
'name_2__DIMENSIONLESS__bloc2', 'name_3__kWh/m²',
'name_5__kWh', 'name4__DIMENSIONLESS__bloc4']

>>> tide_request(DF_COLUMNS, "°C")
['name_1__°C__bloc1', 'name_1__°C__bloc2']

>>> tide_request(DF_COLUMNS, "kWh|°C")
['name_5__kWh', 'name_1__°C__bloc1', 'name_1__°C__bloc2']

>>> # Columns are not selected twice
>>> tide_request(DF_COLUMNS, ["kWh|°C", "name_5__kWh"])
['name_5__kWh', 'name_1__°C__bloc1', 'name_1__°C__bloc2']
"""

if request is None:
return list(data_columns)

elif isinstance(request, str):
if isinstance(request, str):
request = [request]

if not (isinstance(request, pd.Index) or isinstance(request, list)):
if not isinstance(request, (list, pd.Index)):
raise ValueError(
"Invalid request. Was expected an instance of str, pd.Index or List[str]"
f"got {type(request)} instead"
f"request must be str, list[str], pd.Index or None, got {type(request)}"
)

list_to_return = []
max_level = get_tags_max_level(data_columns)

# Enrich columns once
enriched_map = {
col_name_tag_enrichment(col, max_level): col for col in data_columns
}

selected = []

for req in request:
list_to_return.extend(find_cols_multiple_tag_groups(data_columns, req))
for group in req.split("|"):
group_tags = group.split("__")

if not (1 <= len(group_tags) <= 4):
raise ValueError(
f"Request '{group}' is malformed. "
"Use up to 4 tags separated by '__'."
)

for enriched_name, original in enriched_map.items():
tags = enriched_name.split("__")

# Exact per-tag match
if all(tag in tags for tag in group_tags):
selected.append(original)

return list(dict.fromkeys(list_to_return))
return list(dict.fromkeys(selected))


def data_columns_to_tree(columns: pd.Index | list[str]) -> T:
Expand Down