<a href="https://colab.research.google.com/github/98ketaki/Blog/blob/main/pydantic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Quick & Easy Pydantic Tutorial

Author: Ketaki Ghatole

In [None]:
# Install Pydantic
!pip install pydantic



In [None]:
# Imports
from datetime import date
from typing import List, Literal, Optional

from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator

In [None]:
# Simple sample model. This class declares three required fields; pydantic
# validates each one against its type annotation when an instance is created.
class Sample(BaseModel):
    """A single sample record with three required fields."""

    sample_id: str   # identifier, e.g. "S001"
    replicate: int   # replicate number, e.g. 1
    condition: str   # free-text condition label, e.g. "treated"


In [None]:
# Valid sample: every field is supplied with a value of the declared type.
Sample(
    sample_id="S001",
    replicate=1,
    condition="treated",
)

Sample(sample_id='S001', replicate=1, condition='treated')

In [None]:
# Restrict `tissue` to a fixed list of accepted values and bound `age_years`.
class BiologicalSample(BaseModel):
    """Sample record whose tissue must be one of a fixed set of labels."""

    sample_id: str
    # Only these exact strings are accepted.
    tissue: Literal["tumor", "normal", "blood"]
    # Required integer with inclusive bounds 0..120 (a Field without a default
    # is required).
    age_years: int = Field(ge=0, le=120)

In [None]:
# Valid record: tissue is one of the accepted labels and age is within 0-120.
BiologicalSample(
    sample_id="sample_a",
    tissue="normal",
    age_years=30,
)

BiologicalSample(sample_id='sample_a', tissue='normal', age_years=30)

In [None]:
# Sample validation class: declarative field constraints plus one custom validator.
class SampleMetadata(BaseModel):
    """Per-sample metadata validated at construction time.

    Simple rules are declared on the fields themselves; the minimum
    sequencing depth is enforced by ``check_minimum_depth``.
    """

    # Two uppercase letters followed by six digits, e.g. "AB123456".
    sample_id: str = Field(pattern=r"^[A-Z]{2}\d{6}$")
    # Must be exactly one of these strings.
    tissue_type: Literal["tumor", "normal", "blood", "saliva"]
    # Inclusive range 0..120.
    age_years: int = Field(ge=0, le=120)
    # Strictly positive; the validator below additionally requires >= 30.
    sequencing_depth_x: float = Field(gt=0)
    # May be omitted; defaults to None.
    batch_id: Optional[str] = None

    @field_validator("sequencing_depth_x")
    @classmethod
    def check_minimum_depth(cls, depth):
        """Return ``depth`` unchanged, rejecting values below 30.

        Raises:
            ValueError: if ``depth`` is less than 30.
        """
        too_shallow = depth < 30
        if too_shallow:
            raise ValueError("Sequencing depth < 30X is insufficient for analysis")
        return depth


In [None]:
# Valid sample: satisfies the ID pattern, tissue list, age range and depth rule.
sample = SampleMetadata(
    sample_id="AB123456",      # two uppercase letters + six digits
    tissue_type="tumor",       # one of the accepted tissue labels
    age_years=52,              # within 0..120
    sequencing_depth_x=38.4,   # not below the 30X minimum
    batch_id="batch_01",       # optional field, supplied here
)

# Last expression in the cell, so the notebook displays the model's repr.
sample


SampleMetadata(sample_id='AB123456', tissue_type='tumor', age_years=52, sequencing_depth_x=38.4, batch_id='batch_01')

In [None]:
# Invalid sample and errors: each supplied field breaks one rule, and pydantic
# reports all of the failures together in a single ValidationError.
try:
    SampleMetadata(
        sample_id="123",          # does not match the two-letters + six-digits pattern
        tissue_type="Tumor",      # wrong case: only lowercase "tumor" is accepted
        age_years=-5,             # below the minimum of 0
        sequencing_depth_x=12.0,  # rejected by check_minimum_depth (< 30)
    )
except ValidationError as e:
    # Catch only the expected validation failure so that unrelated problems
    # (for example a NameError from a cell that was not run) are not hidden.
    print(e)

4 validation errors for SampleMetadata
sample_id
  String should match pattern '^[A-Z]{2}\d{6}$' [type=string_pattern_mismatch, input_value='123', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/string_pattern_mismatch
tissue_type
  Input should be 'tumor', 'normal', 'blood' or 'saliva' [type=literal_error, input_value='Tumor', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/literal_error
age_years
  Input should be greater than or equal to 0 [type=greater_than_equal, input_value=-5, input_type=int]
    For further information visit https://errors.pydantic.dev/2.12/v/greater_than_equal
sequencing_depth_x
  Value error, Sequencing depth < 30X is insufficient for analysis [type=value_error, input_value=12.0, input_type=float]
    For further information visit https://errors.pydantic.dev/2.12/v/value_error


In [None]:

class SequencingRun(BaseModel):
    """A sequencing run: run-level QC values plus a list of nested samples.

    Demonstrates default values, a default factory, model nesting, an optional
    field and a cross-field (model-level) validator.
    """

    # Fields with defaults (may be omitted by the caller)
    platform: Literal["Illumina", "ONT", "PacBio"] = "Illumina"
    # default_factory is called when run_date is not supplied
    run_date: date = Field(default_factory=date.today)
    expected_q30: float = 75.0

    # Required fields; `samples` nests the Sample model
    run_id: str
    samples: List[Sample]

    # Required, inclusive range 0..100
    mean_q30: float = Field(ge=0, le=100)
    # Optional field: defaults to None when omitted
    notes: Optional[str] = None

    # Cross-field validation: runs after the individual fields have been
    # validated, so it can compare them on the constructed instance.
    @model_validator(mode="after")
    def run_qc_checks(self):
        """Check run-level invariants and return the instance.

        Raises:
            ValueError: if ``mean_q30`` is below ``expected_q30``, or if
                ``samples`` is empty. The Q30 comparison is checked first.
        """
        observed, threshold = self.mean_q30, self.expected_q30
        if observed < threshold:
            raise ValueError(
                f"mean_q30 {observed} is below expected_q30 {threshold}"
            )
        if not self.samples:
            raise ValueError("A sequencing run must contain at least one sample")
        return self


In [None]:
# Valid run: platform, run_date and expected_q30 are omitted, so their
# defaults are used.
run1 = SequencingRun(
    run_id="RUN_0001",
    mean_q30=86.5,
    # Plain dicts are accepted here and validated into nested Sample objects.
    samples=[
        {"sample_id": "S001", "replicate": 1, "condition": "treated"},
        {"sample_id": "S002", "replicate": 2, "condition": "control"},
    ],
)

print(run1.platform)              # default value
print(run1.run_date)              # today's date, from the default factory
print(run1.samples[0].condition)  # attribute access on the nested Sample

Illumina
2026-01-31
treated


In [None]:
# Invalid q30: mean_q30 (2) is below the default expected_q30 (75.0), so the
# model-level validator rejects the run.
# The expected ValidationError is caught and printed so this cell demonstrates
# the failure without stopping a "Restart & Run All".
try:
    SequencingRun(
        run_id="RUN_0001",
        mean_q30=2,
        samples=[
            {"sample_id": "S001", "replicate": 1, "condition": "treated"},
            {"sample_id": "S002", "replicate": 2, "condition": "control"},
        ],
    )
except ValidationError as e:
    print(e)


ValidationError: 1 validation error for SequencingRun
  Value error, mean_q30 2.0 is below expected_q30 75.0 [type=value_error, input_value={'run_id': 'RUN_0001', 'm...condition': 'control'}]}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/value_error