# 資料驗證

In [1]:
%matplotlib inline
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import patito as pt
from typing import Literal, Optional

plt.style.use('seaborn-v0_8-darkgrid')

In [2]:
def val(validator_class, data) -> bool:
    """Validate ``data`` with ``validator_class`` and report the outcome.

    Calls ``validator_class.validate(data)``. On success prints ``Pass``;
    if the call raises, the exception is printed instead of propagated so
    the notebook keeps running and the validation report stays visible.

    Args:
        validator_class: Any object exposing a ``validate(data)`` callable
            (in this notebook, a ``pt.Model`` subclass).
        data: The object handed to ``validate`` (in this notebook, ``df``).

    Returns:
        ``True`` when ``validate`` completed without raising, ``False`` when
        an exception was caught and printed.
    """
    try:
        validator_class.validate(data)
    except Exception as e:
        # Intentionally broad: the tutorial wants every failure shown as text
        # rather than as a traceback.
        print(e)
        return False
    print('Pass')
    return True

In [3]:
df = pl.read_csv('Tutorial_Data/penguins_v.csv')

In [4]:
df.describe()

statistic,ID,Species,Island,Clutch_Completion,Culmen_Length,Culmen_Depth,Flipper_Length,Body_Mass,Sex,Comments,Island_Code
str,f64,str,str,str,f64,f64,f64,f64,str,str,f64
"""count""",344.0,"""344""","""344""","""344""",342.0,342.0,342.0,342.0,"""334""","""26""",344.0
"""null_count""",0.0,"""0""","""0""","""0""",2.0,2.0,2.0,2.0,"""10""","""318""",0.0
"""mean""",63.151163,,,,68.83655,17.15117,200.915205,4201.754386,,,1.206395
"""std""",40.430199,,,,328.085458,1.974793,14.061714,801.954536,,,0.687977
"""min""",1.0,"""Adelie Penguin (Pygoscelis ade…","""Biscoe""","""No""",4.1,13.1,172.0,2700.0,""".""","""Adult not sampled.""",0.0
"""25%""",29.0,,,,39.3,15.6,190.0,3550.0,,,1.0
"""50%""",58.0,,,,44.5,17.3,197.0,4050.0,,,1.0
"""75%""",95.0,,,,48.6,18.7,213.0,4750.0,,,2.0
"""max""",152.0,"""Gentoo penguin (Pygoscelis pap…","""Torgersen""","""Yes""",4730.0,21.5,231.0,6300.0,"""MALE""","""Sexing primers did not amplify…",2.0


In [5]:
df.columns

['ID',
 'Species',
 'Island',
 'Clutch_Completion',
 'Culmen_Length',
 'Culmen_Depth',
 'Flipper_Length',
 'Body_Mass',
 'Sex',
 'Comments',
 'Island_Code']

In [6]:
df

ID,Species,Island,Clutch_Completion,Culmen_Length,Culmen_Depth,Flipper_Length,Body_Mass,Sex,Comments,Island_Code
i64,str,str,str,f64,f64,i64,i64,str,str,i64
1,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",39.1,18.7,181,3750,"""MALE""","""Not enough blood for isotopes.""",0
2,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",39.5,17.4,186,3800,"""FEMALE""",,0
3,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",40.3,18.0,195,3250,"""FEMALE""",,0
4,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",,,,,,"""Adult not sampled.""",0
5,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",36.7,19.3,193,3450,"""FEMALE""",,0
…,…,…,…,…,…,…,…,…,…,…
120,"""Gentoo penguin (Pygoscelis pap…","""Biscoe""","""No""",,,,,,,1
121,"""Gentoo penguin (Pygoscelis pap…","""Biscoe""","""Yes""",46.8,14.3,215,4850,"""FEMALE""",,1
122,"""Gentoo penguin (Pygoscelis pap…","""Biscoe""","""Yes""",50.4,15.7,222,5750,"""MALE""",,1
123,"""Gentoo penguin (Pygoscelis pap…","""Biscoe""","""Yes""",45.2,14.8,212,5200,"""FEMALE""",,1


# 0. 欄位檢查

In [9]:
class P00(pt.Model):
    """Column-presence check: a schema that does not declare ``Island_Code``.

    Every listed field is required (none is ``Optional``). The frame loaded
    into ``df`` also has an ``Island_Code`` column, so validating ``df``
    against this model reports that column as superfluous.
    """
    ID: int
    Species: str
    Island: str
    Clutch_Completion: str
    Culmen_Length: float
    Culmen_Depth: float
    # Declared as float although ``df`` stores these two columns as Int64;
    # validation reports the dtype mismatch.
    Flipper_Length: float
    Body_Mass: float
    Sex: str
    Comments: str

In [10]:
val(P00, df)

9 validation errors for P00
Island_Code
  Superfluous column (type=type_error.superfluouscolumns)
Comments
  318 missing values (type=value_error.missingvalues)
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  Polars dtype Int64 does not match model field type. (type=type_error.columndtype)
Body_Mass
  Polars dtype Int64 does not match model field type. (type=type_error.columndtype)


# 1. 資料型態

In [11]:
class P01(pt.Model):
    """Dtype check: all eleven columns declared, two with a non-matching type.

    Same fields as ``P00`` plus ``Island_Code``, so no column is superfluous.
    ``Flipper_Length`` and ``Body_Mass`` are still annotated ``float`` while
    ``df`` holds them as Int64, which validation reports as dtype mismatches.
    """
    ID: int
    Species: str
    Island: str
    Clutch_Completion: str
    Culmen_Length: float
    Culmen_Depth: float
    # float annotation vs. Int64 column -> dtype mismatch on validation.
    Flipper_Length: float
    Body_Mass: float
    Sex: str
    Comments: str
    Island_Code: int

In [12]:
val(P01, df)

8 validation errors for P01
Comments
  318 missing values (type=value_error.missingvalues)
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  Polars dtype Int64 does not match model field type. (type=type_error.columndtype)
Body_Mass
  Polars dtype Int64 does not match model field type. (type=type_error.columndtype)


In [13]:
class P01_1(pt.Model):
    """Dtype check with matching annotations.

    ``Flipper_Length`` and ``Body_Mass`` are annotated ``int`` here, in line
    with the Int64 columns in ``df``; the recorded validation run shows only
    missing-value errors remaining.
    """
    ID: int
    Species: str
    Island: str
    Clutch_Completion: str
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: str
    Comments: str
    Island_Code: int

In [14]:
val(P01_1, df)

6 validation errors for P01_1
Comments
  318 missing values (type=value_error.missingvalues)
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)


In [15]:
class P01_2(pt.Model):
    """Dtype check with an explicit polars dtype on one field.

    Identical to ``P01_1`` except that ``Body_Mass`` pins its polars dtype to
    ``pl.UInt64`` through ``pt.Field(dtype=...)``.
    """
    ID: int
    Species: str
    Island: str
    Clutch_Completion: str
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    # ``df`` stores Body_Mass as Int64, so requiring UInt64 is reported as a
    # dtype mismatch.
    Body_Mass: int = pt.Field(dtype=pl.UInt64)
    Sex: str
    Comments: str
    Island_Code: int

In [16]:
val(P01_2, df)

7 validation errors for P01_2
Comments
  318 missing values (type=value_error.missingvalues)
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
Body_Mass
  Polars dtype Int64 does not match model field type. (type=type_error.columndtype)


# 2. 是否為一個 list 內的值（例如：其值必為 [1, 2, 3] 其中之一）

In [17]:
class P02(pt.Model):
    """Allowed-values check: categorical columns restricted with ``Literal``.

    ``Species``, ``Island``, ``Clutch_Completion`` and ``Sex`` each accept
    only the listed strings. All fields remain required.
    """
    ID: int
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    # ``df`` also contains '.' and nulls in Sex; validation lists both as
    # invalid values.
    Sex: Literal['MALE', 'FEMALE']
    Comments: str
    Island_Code: int

In [18]:
val(P02, df)

7 validation errors for P02
Comments
  318 missing values (type=value_error.missingvalues)
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


In [19]:
# Deliberately use species names that do not match the values in the data

class P02_1(pt.Model):
    """Allowed-values check with species names that do not match the data.

    Same as ``P02`` except that the ``Species`` literals omit the
    parenthesised scientific name, so none of the species strings present in
    ``df`` is accepted and validation lists all three as invalid values.
    """
    ID: int
    # Shortened names; the frame holds the full "<name> (<scientific name>)" form.
    Species: Literal['Gentoo penguin', 
                     'Adelie Penguin', 
                     'Chinstrap penguin']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: str
    Island_Code: int

In [20]:
val(P02_1, df)

8 validation errors for P02_1
Comments
  318 missing values (type=value_error.missingvalues)
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
Species
  Rows with invalid values: {'Adelie Penguin (Pygoscelis adeliae)', 'Chinstrap penguin (Pygoscelis antarctica)', 'Gentoo penguin (Pygoscelis papua)'}. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


# 3. 是否允許空值

In [21]:
class P03(pt.Model):
    """Nullability check: ``Comments`` is allowed to be missing.

    Same as ``P02`` except that ``Comments`` is ``Optional[str]``; the
    recorded validation run no longer reports missing values for that column.
    Every other field is still required.
    """
    ID: int
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    # Optional -> null comments are accepted.
    Comments: Optional[str]
    Island_Code: int

In [22]:
val(P03, df)

6 validation errors for P03
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


# 4. 是否不允許重複

In [23]:
class P04(pt.Model):
    """Uniqueness check: ``ID`` must not repeat.

    Same as ``P03`` plus ``unique=True`` on ``ID``. The recorded validation
    run reports rows with duplicated ``ID`` values.
    """
    # unique=True makes repeated IDs a validation error.
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]
    Island_Code: int

In [24]:
val(P04, df)

7 validation errors for P04
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


# 5. 字串長度

In [25]:
# Character-length range (shortest / longest) of the Comments column.
df.select(
    pl.col('Comments').str.len_chars().min().alias('Comments Min Length'),
    pl.col('Comments').str.len_chars().max().alias('Comments Max Length'),
)

Comments Min Length,Comments Max Length
u32,u32
18,68


In [26]:
# How many rows have each comment length (one output row per distinct length).
(
    df
    .select(pl.col('Comments').str.len_chars().value_counts())
    .unnest('Comments')
)

Comments,count
u32,u32
30.0,6
37.0,13
25.0,2
,318
62.0,1
68.0,1
36.0,2
18.0,1


In [27]:
class P05(pt.Model):
    """String-length check: upper bound on ``Comments``.

    Same as ``P04`` plus ``max_length=50`` on ``Comments``. The recorded
    validation run flags 2 rows as out of bound.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    # Comments longer than 50 characters are rejected.
    Comments: Optional[str] = pt.Field(max_length=50)
    Island_Code: int

In [28]:
val(P05, df)

8 validation errors for P05
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)
Comments
  2 rows with out of bound values. (type=value_error.rowvalue)


In [29]:
class P05_1(pt.Model):
    """String-length check: lower bound on ``Comments``.

    Same as ``P04`` plus ``min_length=20`` on ``Comments``. The recorded
    validation run flags 1 row as out of bound.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    # Comments shorter than 20 characters are rejected.
    Comments: Optional[str] = pt.Field(min_length=20)
    Island_Code: int

In [30]:
val(P05_1, df)

8 validation errors for P05_1
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)
Comments
  1 row with out of bound values. (type=value_error.rowvalue)


In [31]:
class P05_2(pt.Model):
    """String-length check: both bounds on ``Comments``.

    Combines the limits of ``P05`` and ``P05_1`` (``min_length=20`` and
    ``max_length=50``). The recorded validation run flags 3 rows as out of
    bound.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    # Accepted comment length: 20 to 50 characters.
    Comments: Optional[str] = pt.Field(min_length=20, max_length=50)
    Island_Code: int

In [32]:
val(P05_2, df)

8 validation errors for P05_2
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)
Comments
  3 rows with out of bound values. (type=value_error.rowvalue)


# 6. 字串 pattern (re)

In [33]:
# Every comment that is actually present (nulls removed).
(
    df
    .filter(pl.col('Comments').is_not_null())
    .select(pl.col('Comments'))
)

Comments
str
"""Not enough blood for isotopes."""
"""Adult not sampled."""
"""Nest never observed with full …"
"""Nest never observed with full …"
"""No blood sample obtained."""
…
"""Nest never observed with full …"
"""Nest never observed with full …"
"""Nest never observed with full …"
"""Nest never observed with full …"


In [34]:
# Present comments that do NOT begin with 'Nest'.
(
    df
    .filter(pl.col('Comments').is_not_null())
    .filter(~pl.col('Comments').str.starts_with('Nest'))
    .select(pl.col('Comments'))
)

Comments
str
"""Not enough blood for isotopes."""
"""Adult not sampled."""
"""No blood sample obtained."""
"""No blood sample obtained for s…"
"""No blood sample obtained for s…"
…
"""Not enough blood for isotopes."""
"""Not enough blood for isotopes."""
"""Not enough blood for isotopes."""
"""Not enough blood for isotopes."""


In [35]:
# Use the `pattern` keyword here; the `regex` keyword is not supported

class P06(pt.Model):
    """String-pattern check: ``Comments`` must match a regular expression.

    Same as ``P04`` plus a ``pattern`` on ``Comments``. The recorded
    validation run flags 12 rows as out of bound.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    # Regex: text starting with 'Nest' followed by at least one letter,
    # digit or whitespace character.
    Comments: Optional[str] = pt.Field(pattern=r'^Nest[A-Za-z0-9\s]+')
    Island_Code: int

In [36]:
val(P06, df)

8 validation errors for P06
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)
Comments
  12 rows with out of bound values. (type=value_error.rowvalue)


# 7. 值的範圍（大於或小於特定值）

In [37]:
df.describe()

statistic,ID,Species,Island,Clutch_Completion,Culmen_Length,Culmen_Depth,Flipper_Length,Body_Mass,Sex,Comments,Island_Code
str,f64,str,str,str,f64,f64,f64,f64,str,str,f64
"""count""",344.0,"""344""","""344""","""344""",342.0,342.0,342.0,342.0,"""334""","""26""",344.0
"""null_count""",0.0,"""0""","""0""","""0""",2.0,2.0,2.0,2.0,"""10""","""318""",0.0
"""mean""",63.151163,,,,68.83655,17.15117,200.915205,4201.754386,,,1.206395
"""std""",40.430199,,,,328.085458,1.974793,14.061714,801.954536,,,0.687977
"""min""",1.0,"""Adelie Penguin (Pygoscelis ade…","""Biscoe""","""No""",4.1,13.1,172.0,2700.0,""".""","""Adult not sampled.""",0.0
"""25%""",29.0,,,,39.3,15.6,190.0,3550.0,,,1.0
"""50%""",58.0,,,,44.5,17.3,197.0,4050.0,,,1.0
"""75%""",95.0,,,,48.6,18.7,213.0,4750.0,,,2.0
"""max""",152.0,"""Gentoo penguin (Pygoscelis pap…","""Torgersen""","""Yes""",4730.0,21.5,231.0,6300.0,"""MALE""","""Sexing primers did not amplify…",2.0


In [38]:
class P07(pt.Model):
    """Range check with inclusive bounds on ``Body_Mass``.

    Same as ``P04`` plus ``ge=2700, le=6300``. Those limits equal the min and
    max that ``df.describe()`` shows for ``Body_Mass``, and the recorded
    validation run reports no out-of-bound rows for it.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    # Inclusive range: 2700 <= Body_Mass <= 6300.
    Body_Mass: int = pt.Field(ge=2700, le=6300)
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]
    Island_Code: int

In [39]:
val(P07, df)

7 validation errors for P07
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


In [40]:
class P07_1(pt.Model):
    """Range check with exclusive bounds on ``Body_Mass``.

    Same limits as ``P07`` but strict (``gt=2700, lt=6300``). The recorded
    validation run flags 2 rows as out of bound.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    # Exclusive range: 2700 < Body_Mass < 6300.
    Body_Mass: int = pt.Field(gt=2700, lt=6300)
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]
    Island_Code: int

In [41]:
val(P07_1, df)

8 validation errors for P07_1
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Body_Mass
  2 rows with out of bound values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


# 8. 值為某個倍數

In [42]:
class P08(pt.Model):
    """Multiple-of check: ``Body_Mass`` must be a multiple of 100.

    Same as ``P04`` plus ``multiple_of=100``. The recorded validation run
    flags 179 rows as out of bound.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int = pt.Field(multiple_of=100)
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]
    Island_Code: int

In [43]:
val(P08, df)

8 validation errors for P08
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Body_Mass
  179 rows with out of bound values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


In [44]:
class P08_1(pt.Model):
    """Multiple-of check: ``Body_Mass`` must be a multiple of 5.

    Same as ``P08`` with the divisor changed to 5. The recorded validation
    run reports no out-of-bound rows for ``Body_Mass``.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int = pt.Field(multiple_of=5)
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]
    Island_Code: int

In [45]:
val(P08_1, df)

7 validation errors for P08_1
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


# 9. 值為常數

In [46]:
# (rows, columns) of the subset whose Body_Mass differs from 5000.
(
    df
    .filter(pl.col('Body_Mass') != 5000)
    .shape
)

(336, 11)

In [47]:
class P09(pt.Model):
    """Constant-value check: ``Body_Mass`` must equal 5000.

    Same as ``P04`` except that ``Body_Mass`` is a single-value ``Literal``.
    The recorded validation run flags 336 rows as out of bound.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    # Only the value 5000 is accepted.
    Body_Mass: Literal[5000]
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]
    Island_Code: int

In [48]:
val(P09, df)

8 validation errors for P09
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Body_Mass
  336 rows with out of bound values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


# 10. 某個欄位加總為特定值

In [49]:
# Column total of Body_Mass; used as the expected value in the sum constraint.
df.select(
    pl.col('Body_Mass').sum(),
)

Body_Mass
i64
1437000


In [50]:
class P10(pt.Model):
    """Aggregate check: the ``Body_Mass`` column must sum to 1437000.

    Same as ``P04`` plus a custom constraint on the column total. 1437000 is
    the sum computed from ``df`` in the preceding cell, and the recorded
    validation run reports no constraint violation.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    # ``pt.field`` stands for this field's own column inside the constraint.
    Body_Mass: int = pt.Field(constraints=pt.field.sum() == 1437000)
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]
    Island_Code: int

In [51]:
val(P10, df)

7 validation errors for P10
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


In [52]:
# pt.field is an alias for the field column and is automatically replaced with polars.col("Body_Mass") before validation

class P10_1(pt.Model):
    """Aggregate check with a total that does not match the data.

    Same as ``P10`` but expects a sum of 1400000, whereas ``df`` sums to
    1437000. Because the constraint is a single column-level boolean, the
    recorded validation run reports all 344 rows as not matching.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int = pt.Field(constraints=pt.field.sum() == 1400000)
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]
    Island_Code: int

In [53]:
val(P10_1, df)

8 validation errors for P10_1
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Body_Mass
  344 rows does not match custom constraints. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


# 11. 欄位之間的關係

In [55]:
class P11(pt.Model):
    """Cross-column check: ``Culmen_Length`` must exceed ``Culmen_Depth``.

    Same as ``P04`` plus a row-wise constraint comparing two columns. The
    recorded validation run flags 1 row as not matching.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    # The constraint references both columns explicitly via pl.col.
    Culmen_Length: float = pt.Field(constraints=pl.col('Culmen_Length') > pl.col('Culmen_Depth'))
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]
    Island_Code: int

In [56]:
val(P11, df)

8 validation errors for P11
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Culmen_Length
  1 row does not match custom constraints. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


In [57]:
# Rows that break the Culmen_Length > Culmen_Depth rule.
(
    df
    .filter(~(pl.col('Culmen_Length') > pl.col('Culmen_Depth')))
    .select(pl.all())
)

ID,Species,Island,Clutch_Completion,Culmen_Length,Culmen_Depth,Flipper_Length,Body_Mass,Sex,Comments,Island_Code
i64,str,str,str,f64,f64,i64,i64,str,str,i64
11,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",4.1,17.1,186,3300,,"""No blood sample obtained for s…",0


# 12. 條件關係

In [58]:
class P12(pt.Model):
    """Conditional check: ``Island_Code`` must agree with ``Island``.

    Same as ``P11`` plus a when/then/otherwise constraint on ``Island_Code``:
    'Torgersen' requires 0, 'Biscoe' requires 1, and any other island
    requires 2. The recorded validation run flags 1 row as not matching.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float = pt.Field(constraints=pl.col('Culmen_Length') > pl.col('Culmen_Depth'))
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]
    # ``pt.field`` stands for the Island_Code column in each branch.
    Island_Code: int = pt.Field(constraints=pl.when(pl.col('Island')=='Torgersen')
                                             .then(pt.field == 0)
                                             .when(pl.col('Island')=='Biscoe')
                                             .then(pt.field == 1).otherwise(pt.field == 2))

In [59]:
val(P12, df)

9 validation errors for P12
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Culmen_Length
  1 row does not match custom constraints. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)
Island_Code
  1 row does not match custom constraints. (type=value_error.rowvalue)


In [61]:
# Rows whose Island_Code disagrees with the expected mapping
# (Torgersen -> 0, Biscoe -> 1, any other island -> 2).
(
    df
    .filter(
        ~(
            pl.when(pl.col('Island') == 'Torgersen')
            .then(pl.col('Island_Code') == 0)
            .when(pl.col('Island') == 'Biscoe')
            .then(pl.col('Island_Code') == 1)
            .otherwise(pl.col('Island_Code') == 2)
        )
    )
    .select(pl.all())
)

ID,Species,Island,Clutch_Completion,Culmen_Length,Culmen_Depth,Flipper_Length,Body_Mass,Sex,Comments,Island_Code
i64,str,str,str,f64,f64,i64,i64,str,str,i64
29,"""Adelie Penguin (Pygoscelis ade…","""Biscoe""","""No""",37.9,18.6,172,3150,"""FEMALE""","""Nest never observed with full …",0


# 13. 多重關係

In [62]:
class P13(pt.Model):
    """Multiple cross-column checks attached to one field.

    Same as ``P04`` plus a list of two constraints on ``Culmen_Length``:
    it must exceed ``Culmen_Depth``, and ``Body_Mass`` must exceed it. The
    recorded validation run flags 3 rows as not matching.
    """
    ID: int = pt.Field(unique=True)
    Species: Literal['Gentoo penguin (Pygoscelis papua)', 
                     'Adelie Penguin (Pygoscelis adeliae)', 
                     'Chinstrap penguin (Pygoscelis antarctica)']
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    # ``constraints`` is given a list here, one expression per rule.
    Culmen_Length: float = pt.Field(constraints=[pl.col('Culmen_Length') > pl.col('Culmen_Depth'),
                                                 pl.col('Body_Mass') > pl.col('Culmen_Length')])
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]
    Island_Code: int

In [63]:
val(P13, df)

8 validation errors for P13
Body_Mass
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Culmen_Length
  3 rows does not match custom constraints. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


In [65]:
# Rows that fail at least one of the two rules:
# Culmen_Length > Culmen_Depth and Body_Mass > Culmen_Length.
(
    df
    .filter(
        ~(
            (pl.col('Culmen_Length') > pl.col('Culmen_Depth'))
            & (pl.col('Body_Mass') > pl.col('Culmen_Length'))
        )
    )
    .select(pl.all())
)

ID,Species,Island,Clutch_Completion,Culmen_Length,Culmen_Depth,Flipper_Length,Body_Mass,Sex,Comments,Island_Code
i64,str,str,str,f64,f64,i64,i64,str,str,i64
11,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",4.1,17.1,186,3300,,"""No blood sample obtained for s…",0
108,"""Adelie Penguin (Pygoscelis ade…","""Biscoe""","""Yes""",3910.0,20.0,190,3900,"""MALE""",,1
105,"""Gentoo penguin (Pygoscelis pap…","""Biscoe""","""Yes""",4730.0,13.8,216,4725,,,1
