In [1]:
import polars as pl

In [2]:
# Toy example of the long-format layout: one row per (region, year) pair.
print(
    pl.DataFrame(
        {
            "SA1_2021": ["A", "A", "B", "B"],
            "year": [2020, 2021, 2020, 2021],
            "population": [10.0, 20.0, 30.0, 40.0],
        }
    )
)

shape: (4, 3)
┌──────────┬──────┬────────────┐
│ SA1_2021 ┆ year ┆ population │
│ ---      ┆ ---  ┆ ---        │
│ str      ┆ i64  ┆ f64        │
╞══════════╪══════╪════════════╡
│ A        ┆ 2020 ┆ 10.0       │
│ A        ┆ 2021 ┆ 20.0       │
│ B        ┆ 2020 ┆ 30.0       │
│ B        ┆ 2021 ┆ 40.0       │
└──────────┴──────┴────────────┘


In [49]:
import os
from functools import cached_property
from typing import Callable, Literal, Optional, Union

from cachetools import LRUCache, TTLCache, cached
from electoralyze import region
from electoralyze.common.constants import ROOT_DIR
from electoralyze.region.region_abc import RegionABC
from pydantic import BaseModel, ConfigDict

##### ABSTRACT ###########

# Allowed values for the `data_type` field of a `Metric`.
METRIC_DATA_TYPES = Literal["categorical", "ordinal", "numeric", "single"]


class MetricRegion(BaseModel):
    """Describes how a metric is made available for one region.

    Fields:
        region: Region class the metric can be requested for.
        redistribute_from: Region class whose data should be redistributed
            onto `region`; ``None`` means the data is stored for `region`
            directly.
        redistribute_kwargs: Optional keyword arguments; presumably intended
            for the redistribution step -- TODO confirm once implemented.
        process_raw: Optional callable that builds the data for `region`
            from raw inputs.
        process_raw_kwargs: Optional keyword arguments for `process_raw`.
    """

    # `type[RegionABC]` and bare `Callable` are not natively validated types.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # NOTE: both region fields could be narrowed to
    # `Union[tuple(RegionABC.__subclasses__())]` to accept concrete regions only.
    region: type[RegionABC]
    redistribute_from: Optional[type[RegionABC]] = None
    redistribute_kwargs: Optional[dict] = None
    process_raw: Optional[Callable] = None
    process_raw_kwargs: Optional[dict] = None


class Metric(BaseModel):
    """A named dataset that can be retrieved per region.

    Fields:
        allowed_regions: Per-region configuration; only regions listed here
            can be passed to `by`.
        name: Base name of the metric.
        data_type: One of `METRIC_DATA_TYPES`.
        name_suffix: Optional suffix appended to `name` in `full_name`.
        file: Path template containing a ``{region}`` placeholder.
    """

    allowed_regions: list[MetricRegion]
    name: str
    data_type: METRIC_DATA_TYPES
    name_suffix: str | None = None
    file: str

    def by(self, region: type[RegionABC]) -> pl.DataFrame:
        """Return this metric's data for `region`.

        Args:
            region: Region class (not an instance) whose ``id`` identifies it.

        Raises:
            KeyError: If `region` is not one of `allowed_regions`.
        """
        region_config = self.allowed_regions_map.get(region.id)
        if region_config is None:
            raise KeyError(f"Region {region.id!r} not found for metric: {self.full_name!r}")

        # Regions configured with a source region are derived by redistribution;
        # every other region is read straight from its stored file.
        if region_config.redistribute_from:
            return self.get_redistributed_data(region)
        return self.get_stored_data(region)

    def get_stored_data(self, region: type[RegionABC]) -> pl.DataFrame:
        """Read the stored parquet file for `region`."""
        path = self.get_file().format(region=region.id)
        return pl.read_parquet(path)

    def get_redistributed_data(self, region: type[RegionABC]) -> pl.DataFrame:
        """Return data redistributed onto `region` (not implemented yet).

        Raises:
            NotImplementedError: Always.
        """
        # NOTE(review): message presumably meant "Not ready yet"; left unchanged.
        raise NotImplementedError("Not read yet")

    def process_raw(self) -> None:
        """Run the `process_raw` callback of every region that defines one.

        Each callback receives this metric as ``parent_metric`` plus that
        region's `process_raw_kwargs`.
        """
        for metric_region in self.allowed_regions:
            if metric_region.process_raw is None:
                continue
            extra_kwargs = metric_region.process_raw_kwargs or {}
            metric_region.process_raw(parent_metric=self, **extra_kwargs)

    @cached_property
    def full_name(self) -> str:
        """`name`, followed by ``_<name_suffix>`` when a suffix is set."""
        # Computed once per instance; later field changes are not reflected.
        if self.name_suffix:
            return f"{self.name}_{self.name_suffix}"
        return f"{self.name}"

    @cached_property
    def allowed_regions_map(self) -> dict[str, MetricRegion]:
        """`allowed_regions` keyed by region id.

        If two entries share a region id, the later one wins.
        """
        return {metric_region.region.id: metric_region for metric_region in self.allowed_regions}

    def get_file(self) -> str:
        """Return the path template for this metric's data files."""
        return self.file


###### Creating group of metrics ########

# Path template for processed census data; the `{census}`, `{metric}` and
# `{region}` placeholders are meant to be filled in with `str.format`.
PROCESSED_FILE = os.path.join(ROOT_DIR, "data/census/{census}/{metric}/{region}.parquet")


class National2021Metric(Metric):
    """Metric whose files live under the ``national_2021`` census folder.

    The file location is derived from `PROCESSED_FILE`, so `file` is fixed
    to ``None`` instead of being supplied by the caller.
    """

    name_suffix: str = "national_2021"
    file: None = None

    def get_file(self) -> str:
        """Return the path template with only ``{region}`` left unfilled."""
        # Re-insert the literal "{region}" so callers can format it later.
        return PROCESSED_FILE.format(census=self.name_suffix, metric=self.name, region="{region}")


####### Creating specific metric ###########


# Location of the raw population data. NOTE(review): the "..." suffix looks
# like a placeholder path -- TODO replace with the real file.
RAW_POPULATION_FILE = os.path.join(ROOT_DIR, "data/raw/...")


def process_raw_population(**_kwargs):
    """Placeholder for turning the raw population data into processed files.

    Accepts and ignores arbitrary keyword arguments (for example the
    ``parent_metric`` passed by `Metric.process_raw`). Currently it only
    prints a message.
    """
    # TODO: read RAW_POPULATION_FILE and write the processed output.
    print("doing something...")


# Population metric for the 2021 national census.
population = National2021Metric(
    name="population",
    # Population is a count, so it is a numeric quantity rather than a category.
    data_type="numeric",
    allowed_regions=[
        # SA1_2021 data is produced from raw inputs by `process_raw_population`.
        MetricRegion(region=region.SA1_2021, process_raw=process_raw_population),
        # SA2_2021 data is derived by redistributing the SA1_2021 data.
        MetricRegion(region=region.SA2_2021, redistribute_from=region.SA1_2021),
    ],
)

In [52]:
# Path template for this metric; the "{region}" placeholder is still unfilled.
population.get_file()

'/home/andre/git/private/electoralyze/packages/electoralyze/electoralyze/common/../../../../data/census/national_2021/population/{region}.parquet'

In [53]:
# Per-region configuration, in the order it was declared.
population.allowed_regions

[MetricRegion(region=<class 'electoralyze.region.regions.SA1_2021.SA1_2021'>, redistribute_from=None, redistribute_kwargs=None, process_raw=<function process_raw_population at 0x7f34b4d9cd60>, process_raw_kwargs=None),
 MetricRegion(region=<class 'electoralyze.region.regions.SA2_2021.SA2_2021'>, redistribute_from=<class 'electoralyze.region.regions.SA1_2021.SA1_2021'>, redistribute_kwargs=None, process_raw=None, process_raw_kwargs=None)]

In [54]:
# The same configuration, keyed by region id.
population.allowed_regions_map

{'SA1_2021': MetricRegion(region=<class 'electoralyze.region.regions.SA1_2021.SA1_2021'>, redistribute_from=None, redistribute_kwargs=None, process_raw=<function process_raw_population at 0x7f34b4d9cd60>, process_raw_kwargs=None),
 'SA2_2021': MetricRegion(region=<class 'electoralyze.region.regions.SA2_2021.SA2_2021'>, redistribute_from=<class 'electoralyze.region.regions.SA1_2021.SA1_2021'>, redistribute_kwargs=None, process_raw=None, process_raw_kwargs=None)}

In [55]:
# Runs the raw-processing callback of each region that defines one (SA1_2021 only).
population.process_raw()

doing something...


In [56]:
# Raises FileNotFoundError while the processed SA1_2021 parquet file does not exist.
population.by(region.SA1_2021)

FileNotFoundError: No such file or directory (os error 2): ...ze/electoralyze/common/../../../../data/census/national_2021/population/SA1_2021.parquet

In [58]:
# SA2_2021 is configured with `redistribute_from`, and redistribution is not
# implemented yet, so this raises NotImplementedError.
population.by(region.SA2_2021)

NotImplementedError: Not read yet

In [57]:
# RegionABC itself is not one of the allowed regions, so this raises KeyError.
population.by(RegionABC)

KeyError: "Region None not found for metric: 'population_national_2021'"