# 資料驗證

In [51]:
%matplotlib inline
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import patito as pt
from typing import List, Tuple, Dict, Any, Union, Literal, Optional

# Select the 'seaborn-v0_8-darkgrid' matplotlib style sheet for any figures drawn later.
plt.style.use('seaborn-v0_8-darkgrid')

In [52]:
def val(validator_class: Any, data: Any) -> bool:
    """Validate ``data`` against ``validator_class`` and report the outcome.

    Calls ``validator_class.validate(data)``. On success prints ``Pass``; on
    failure prints the raised exception instead of letting it propagate, so a
    failed validation shows up as cell output rather than a traceback.

    Args:
        validator_class: Any object exposing a ``validate(data)`` method.
        data: The object handed to ``validate``.

    Returns:
        ``True`` if ``validate`` finished without raising, ``False`` otherwise.
    """
    try:
        validator_class.validate(data)
    except Exception as e:
        # Deliberately broad: the error report is the result being demonstrated.
        print(e)
        return False
    print('Pass')
    return True

In [53]:
# Read the penguins CSV with Polars; the path is relative to the working directory.
df = pl.read_csv('Data_Prep/penguins.csv')

In [54]:
# Summary statistics per column, including the count and null_count rows.
df.describe()

statistic,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
str,str,f64,str,str,str,str,str,str,str,f64,f64,f64,f64,str,f64,f64,str
"""count""","""344""",344.0,"""344""","""344""","""344""","""344""","""344""","""344""","""344""",342.0,342.0,342.0,342.0,"""334""",330.0,331.0,"""26"""
"""null_count""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""",2.0,2.0,2.0,2.0,"""10""",14.0,13.0,"""318"""
"""mean""",,63.151163,,,,,,,,43.92193,17.15117,200.915205,4201.754386,,8.733382,-25.686292,
"""std""",,40.430199,,,,,,,,5.459584,1.974793,14.061714,801.954536,,0.55177,0.793961,
"""min""","""PAL0708""",1.0,"""Adelie Penguin (Pygoscelis ade…","""Anvers""","""Biscoe""","""Adult, 1 Egg Stage""","""N100A1""","""No""","""11/10/07""",32.1,13.1,172.0,2700.0,""".""",7.6322,-27.01854,"""Adult not sampled."""
"""25%""",,29.0,,,,,,,,39.2,15.6,190.0,3550.0,,8.2993,-26.3146,
"""50%""",,58.0,,,,,,,,44.5,17.3,197.0,4050.0,,8.65466,-25.83352,
"""75%""",,95.0,,,,,,,,48.5,18.7,213.0,4750.0,,9.17847,-25.0602,
"""max""","""PAL0910""",152.0,"""Gentoo penguin (Pygoscelis pap…","""Anvers""","""Torgersen""","""Adult, 1 Egg Stage""","""N9A2""","""Yes""","""12/3/07""",59.6,21.5,231.0,6300.0,"""MALE""",10.02544,-23.78767,"""Sexing primers did not amplify…"


In [55]:
# Discard the columns that are not validated later in this notebook.
df = df.drop([
    'studyName',
    'Region',
    'Stage',
    'Individual ID',
    'Date Egg',
    'Delta 15 N (o/oo)',
    'Delta 13 C (o/oo)',
])

In [56]:
# List the column names that remain after the drop.
df.columns

['Sample Number',
 'Species',
 'Island',
 'Clutch Completion',
 'Culmen Length (mm)',
 'Culmen Depth (mm)',
 'Flipper Length (mm)',
 'Body Mass (g)',
 'Sex',
 'Comments']

In [57]:
# Replace the spaced / unit-suffixed headers with identifier-style names.
df = df.rename(
    {
        'Sample Number': 'ID',
        'Clutch Completion': 'Clutch_Completion',
        'Culmen Length (mm)': 'Culmen_Length',
        'Culmen Depth (mm)': 'Culmen_Depth',
        'Flipper Length (mm)': 'Flipper_Length',
        'Body Mass (g)': 'Body_Mass',
    }
)

In [58]:
# Display the renamed frame to confirm the new column names and their dtypes.
df

ID,Species,Island,Clutch_Completion,Culmen_Length,Culmen_Depth,Flipper_Length,Body_Mass,Sex,Comments
i64,str,str,str,f64,f64,i64,i64,str,str
1,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",39.1,18.7,181,3750,"""MALE""","""Not enough blood for isotopes."""
2,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",39.5,17.4,186,3800,"""FEMALE""",
3,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",40.3,18.0,195,3250,"""FEMALE""",
4,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",,,,,,"""Adult not sampled."""
5,"""Adelie Penguin (Pygoscelis ade…","""Torgersen""","""Yes""",36.7,19.3,193,3450,"""FEMALE""",
…,…,…,…,…,…,…,…,…,…
120,"""Gentoo penguin (Pygoscelis pap…","""Biscoe""","""No""",,,,,,
121,"""Gentoo penguin (Pygoscelis pap…","""Biscoe""","""Yes""",46.8,14.3,215,4850,"""FEMALE""",
122,"""Gentoo penguin (Pygoscelis pap…","""Biscoe""","""Yes""",50.4,15.7,222,5750,"""MALE""",
123,"""Gentoo penguin (Pygoscelis pap…","""Biscoe""","""Yes""",45.2,14.8,212,5200,"""FEMALE""",


# 1. 資料型態

In [60]:
class P01(pt.Model):
    """Dtype-only schema for the renamed penguin columns.

    Flipper_Length and Body_Mass are declared as float here, whereas the loaded
    frame stores them as Int64, so validating the frame reports a dtype
    mismatch for both columns.
    """
    ID: int
    Species: str
    Island: str
    Clutch_Completion: str
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: float
    Body_Mass: float
    Sex: str
    Comments: str

In [61]:
# Validate the frame against P01; val() prints 'Pass' or the validation errors.
val(P01, df)

8 validation errors for P01
Body_Mass
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Comments
  318 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
Flipper_Length
  Polars dtype Int64 does not match model field type. (type=type_error.columndtype)
Body_Mass
  Polars dtype Int64 does not match model field type. (type=type_error.columndtype)


In [62]:
class P01_1(pt.Model):
    """Dtype-only schema with Flipper_Length and Body_Mass declared as int.

    No field is declared optional, so columns that contain nulls are still
    reported as having missing values when the frame is validated.
    """
    ID: int
    Species: str
    Island: str
    Clutch_Completion: str
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: str
    Comments: str

In [63]:
# Validate against the schema with corrected integer dtypes.
val(P01_1, df)

6 validation errors for P01_1
Body_Mass
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Comments
  318 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)


# 2. 是否為一個 list 內的值（例如：其值必為 [1, 2, 3] 其中之一）

In [64]:
class P02(pt.Model):
    """Schema that restricts the categorical columns to fixed sets of values.

    Species, Island, Clutch_Completion and Sex are each limited to the listed
    literals; the numeric columns and Comments are checked for dtype only.
    """

    ID: int
    Species: Literal[
        'Gentoo penguin (Pygoscelis papua)',
        'Adelie Penguin (Pygoscelis adeliae)',
        'Chinstrap penguin (Pygoscelis antarctica)',
    ]
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: str

In [65]:
# Validate against the schema that restricts categorical columns to allowed values.
val(P02, df)

7 validation errors for P02
Body_Mass
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Comments
  318 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


In [66]:
# use wrong species name

class P02_1(pt.Model):
    """Variant of the allowed-values schema with intentionally wrong Species names.

    The Species literals leave out the scientific-name suffix found in the
    data, so every Species value in the frame is reported as invalid.
    """

    ID: int
    Species: Literal[
        'Gentoo penguin',
        'Adelie Penguin',
        'Chinstrap penguin',
    ]
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: str

In [67]:
# Validate against the schema whose Species literals are deliberately wrong.
val(P02_1, df)

8 validation errors for P02_1
Body_Mass
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Comments
  318 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
Species
  Rows with invalid values: {'Chinstrap penguin (Pygoscelis antarctica)', 'Gentoo penguin (Pygoscelis papua)', 'Adelie Penguin (Pygoscelis adeliae)'}. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


# 3. 是否允許空值

In [68]:
class P03(pt.Model):
    """Allowed-values schema in which Comments may be null.

    Comments is the only field declared Optional; every other column is still
    required to be free of nulls.
    """

    ID: int
    Species: Literal[
        'Gentoo penguin (Pygoscelis papua)',
        'Adelie Penguin (Pygoscelis adeliae)',
        'Chinstrap penguin (Pygoscelis antarctica)',
    ]
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]

In [69]:
# Validate against the schema that allows nulls in Comments.
val(P03, df)

6 validation errors for P03
Body_Mass
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


# 4. 是否不允許重複

In [70]:
class P04(pt.Model):
    """Schema that additionally requires ID values to be unique.

    Apart from the uniqueness constraint on ID, the fields match the schema
    with allowed values and a nullable Comments column.
    """

    ID: int = pt.Field(unique=True)
    Species: Literal[
        'Gentoo penguin (Pygoscelis papua)',
        'Adelie Penguin (Pygoscelis adeliae)',
        'Chinstrap penguin (Pygoscelis antarctica)',
    ]
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str]

In [71]:
# Validate against the schema that requires unique ID values.
val(P04, df)

7 validation errors for P04
Body_Mass
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)


# 5. 字串長度

In [82]:
# Character-length bounds of the Comments column.
comment_chars = pl.col('Comments').str.len_chars()
df.select(
    comment_chars.min().alias('Comments Min Length'),
    comment_chars.max().alias('Comments Max Length'),
)

Comments Min Length,Comments Max Length
u32,u32
18,68


In [90]:
# Frequency of each comment length; unnest splits the value_counts struct into columns.
(
    df
    .select(pl.col('Comments').str.len_chars().value_counts())
    .unnest('Comments')
)

Comments,count
u32,u32
,318
25.0,2
36.0,2
30.0,6
68.0,1
18.0,1
37.0,13
62.0,1


In [80]:
class P05(pt.Model):
    """Schema that caps the length of Comments at 50 characters.

    Comments stays nullable; ID must be unique and the categorical columns are
    limited to the listed literals.
    """

    ID: int = pt.Field(unique=True)
    Species: Literal[
        'Gentoo penguin (Pygoscelis papua)',
        'Adelie Penguin (Pygoscelis adeliae)',
        'Chinstrap penguin (Pygoscelis antarctica)',
    ]
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str] = pt.Field(max_length=50)

In [83]:
# Validate against the schema with a maximum Comments length of 50.
val(P05, df)

8 validation errors for P05
Body_Mass
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)
Comments
  2 rows with out of bound values. (type=value_error.rowvalue)


In [84]:
class P05_1(pt.Model):
    """Schema that requires Comments to be at least 20 characters long.

    Comments stays nullable; ID must be unique and the categorical columns are
    limited to the listed literals.
    """

    ID: int = pt.Field(unique=True)
    Species: Literal[
        'Gentoo penguin (Pygoscelis papua)',
        'Adelie Penguin (Pygoscelis adeliae)',
        'Chinstrap penguin (Pygoscelis antarctica)',
    ]
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str] = pt.Field(min_length=20)

In [85]:
# Validate against the schema with a minimum Comments length of 20.
val(P05_1, df)

8 validation errors for P05_1
Body_Mass
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)
Comments
  1 row with out of bound values. (type=value_error.rowvalue)


In [86]:
class P05_2(pt.Model):
    """Schema that bounds the length of Comments to between 20 and 50 characters.

    Comments stays nullable; ID must be unique and the categorical columns are
    limited to the listed literals.
    """

    ID: int = pt.Field(unique=True)
    Species: Literal[
        'Gentoo penguin (Pygoscelis papua)',
        'Adelie Penguin (Pygoscelis adeliae)',
        'Chinstrap penguin (Pygoscelis antarctica)',
    ]
    Island: Literal['Torgersen', 'Biscoe', 'Dream']
    Clutch_Completion: Literal['Yes', 'No']
    Culmen_Length: float
    Culmen_Depth: float
    Flipper_Length: int
    Body_Mass: int
    Sex: Literal['MALE', 'FEMALE']
    Comments: Optional[str] = pt.Field(min_length=20, max_length=50)

In [87]:
# Validate against the schema with both minimum and maximum Comments length.
val(P05_2, df)

8 validation errors for P05_2
Body_Mass
  2 missing values (type=value_error.missingvalues)
Culmen_Depth
  2 missing values (type=value_error.missingvalues)
Sex
  10 missing values (type=value_error.missingvalues)
Flipper_Length
  2 missing values (type=value_error.missingvalues)
Culmen_Length
  2 missing values (type=value_error.missingvalues)
ID
  316 rows with duplicated values. (type=value_error.rowvalue)
Sex
  Rows with invalid values: {None, '.'}. (type=value_error.rowvalue)
Comments
  3 rows with out of bound values. (type=value_error.rowvalue)


# 6. 字串 pattern (re)

# 7. 值的範圍（大於或小於特定值）

# 8. 如果某個欄位的值為特定值時，另一個欄位的值必須為某些值