In [1]:
%load_ext autoreload
%autoreload 2
import json
from openenergyid.pandera_poc.models import InputModel, OutputModel

# Polars with Pandera for type checking


Interesting overview:
* https://pola.rs/posts/polars_birds_eye_view/

Useful links:
* https://www.rhosignal.com/posts/polars-pandas-cheatsheet/
* https://www.rhosignal.com/posts/pandas-to-polars-time-series/
* https://pandera.readthedocs.io/en/latest/polars.html#how-it-works
* https://github.com/unionai-oss/pandera


In [2]:
# Input Model Specification
#
# Note: when the notebook does not pick up edits to the models module, the
# module can be re-run by hand with:
#   %run "openenergyid\pandera_poc\models.py"

# Round-trip the schema's JSON through a dict so it can be pretty-printed.
schema_dict = json.loads(InputModel.to_schema().to_json())  # type: ignore
print(json.dumps(schema_dict, indent=2))

{
  "schema_type": "dataframe",
  "version": "0.20.3",
  "columns": {
    "column1": {
      "title": null,
      "description": null,
      "dtype": "Int64",
      "nullable": false,
      "checks": {
        "less_than_or_equal_to": 10
      },
      "unique": false,
      "coerce": false,
      "required": true,
      "regex": false
    },
    "column2": {
      "title": null,
      "description": null,
      "dtype": "Float64",
      "nullable": false,
      "checks": {
        "less_than": -1.2
      },
      "unique": false,
      "coerce": false,
      "required": true,
      "regex": false
    },
    "column3": {
      "title": null,
      "description": null,
      "dtype": "String",
      "nullable": false,
      "checks": {
        "str_startswith": "value_"
      },
      "unique": false,
      "coerce": false,
      "required": true,
      "regex": false
    }
  },
  "checks": null,
  "index": null,
  "dtype": null,
  "coerce": false,
  "strict": false,
  "name": "InputMod



In [3]:
# Input data is pure python
import polars as pl

# Columns for the frame that is expected to pass validation.
valid_columns = {
    "column1": [1, 4, 0, 10, 9],
    "column2": [-1.3, -1.4, -2.9, -10.1, -20.4],
    "column3": ["value_1", "value_2", "value_3", "value_2", "value_1"],
}

# Columns for the "wrong" frame: column1 holds strings instead of integers.
invalid_columns = {
    "column1": ["value_1", "value_2", "value_3", "value_4", "value_5"],
    "column2": [0.1, 4, 0, 10, 9],
    "column3": ["value_a", "value_b", "value_c", "value_d", "value_e"],
}

# we use polars lazy frames for optimal performance
# https://pandera.readthedocs.io/en/latest/polars.html#how-it-works
lf = pl.LazyFrame(valid_columns)
wrong_lazy_frame = pl.LazyFrame(invalid_columns)

# Adding a row index is left disabled: it may have a negative performance impact.
# lf = lf.with_row_index().cast(pl.Int64)

# Show the resolved schema of both frames, then the lazy frame itself.
display(lf.collect_schema(), wrong_lazy_frame.collect_schema())
lf

Schema([('column1', Int64), ('column2', Float64), ('column3', String)])

Schema([('column1', String), ('column2', Float64), ('column3', String)])

In [4]:
# InputModel and OutputModel are already imported in the first cell.
from openenergyid.pandera_poc.analysis import analyse

# Validate the *collected* input frame. pandera only applies data-level checks
# (value bounds, string prefixes, ...) to an eager DataFrame; validating the
# LazyFrame and collecting afterwards would only check column names and dtypes.
InputModel.validate(lf.collect())  # type: ignore  # issue with pandera-polars integration

# The analysis itself still runs on the lazy frame.
result = analyse(lf)
# NOTE(review): `result` is presumably a LazyFrame (it is collected below), in
# which case this validates schema-level properties only -- confirm.
OutputModel.validate(result)

# Demonstrate a failing validation; the caught error is printed, not re-raised.
try:
    # this only validates schema-level properties so we can continue doing lazy operations
    InputModel.validate(wrong_lazy_frame)
except Exception as e:
    print("Validation error:", str(e))

result.collect()

Validation error: expected column 'column1' to have type Int64, got String


column1,column2,column3
i64,f64,str
1,-1.3,"""value_1"""
4,-1.4,"""value_2"""
0,-2.9,"""value_3"""
10,-10.1,"""value_2"""
9,-20.4,"""value_1"""
