In [1]:
import pandas as pd
import pandera
from pandera.errors import SchemaErrors
from pandera import DataFrameSchema, Column, Check
from datetime import datetime
import logging
import math

In [2]:
# Set up logging
logging.basicConfig(filename='validation_errors.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [3]:
df = pd.DataFrame({
    "column1": [1, 4, 0, 10, 9],
    "column2": ["14-APR-24", "30-FEB-24", "14-APR-24", "14-FEB-24", "14-APR-24"],
    "column3": ["19770419", "19770419", "19770426", "19771239", "19770419"],
})

In [4]:
df

Unnamed: 0,column1,column2,column3
0,1,14-APR-24,19770419
1,4,30-FEB-24,19770419
2,0,14-APR-24,19770426
3,10,14-FEB-24,19771239
4,9,14-APR-24,19770419


In [5]:
def is_valid_date(date_string, date_format):
    try:
        if not date_string:
            return True
        
        if isinstance(date_string, float) and math.isnan(date_string):
            return True
        
        datetime.strptime(date_string, date_format)
        return True
    except ValueError:
        return False


In [6]:
OutputSchema = DataFrameSchema({
    "column1": Column(int, nullable=False),
    "column2": Column(str, nullable=True, checks=[
        Check(lambda x: is_valid_date(x, "%d-%b-%y"), element_wise=True, name="date_exists_and_is_valid"),
        ]),
    "column3": Column(str, nullable=False, checks=[
        Check(lambda x: is_valid_date(x, "%Y%m%d"), element_wise=True, name="date_exists_and_is_valid"),
        ]),
})

In [7]:
def main():
    try:
        listis = OutputSchema.validate(df, lazy=True)
        print(listis.head())
    except SchemaErrors as err:
        for x in err.schema_errors:
            print(f"Validation failed: {x} {err.failure_cases['index']}")
            logging.error(f"Validation failed: {x} {err.failure_cases['index']}")
        
        if err.failure_cases is not None:
            failed_indices = err.failure_cases['index'].tolist()
            print("Failed rows:")
            print(failed_indices)
            print(df.iloc[failed_indices])
            logging.error("Failed rows:")
            logging.error(failed_indices)
            logging.error(df.iloc[failed_indices])


In [8]:
if __name__ == "__main__":
    main()

Validation failed: Column 'column2' failed element-wise validator number 0: <Check date_exists_and_is_valid> failure cases: 30-FEB-24 0    1
1    3
Name: index, dtype: int64
Validation failed: Column 'column3' failed element-wise validator number 0: <Check date_exists_and_is_valid> failure cases: 19771239 0    1
1    3
Name: index, dtype: int64
Failed rows:
[1, 3]
   column1    column2   column3
1        4  30-FEB-24  19770419
3       10  14-FEB-24  19771239
