In [1]:
import sys
sys.path.insert(0, "/home/anadirov/Documents/Projects/datatest/")

import pandas as pd

from core.assertions.df import DFHasColumnsAssertion, PanderaSchemaAssertion
from core.validators import validate_url
from core.testpipe import TestPipe

In [2]:
df = pd.read_csv("test_files/sites.csv")

## Run some manually tests

In [3]:
t1 = DFHasColumnsAssertion(df, ['site_name', 'site_url'])
assert t1.run(), "Assertion failed"
assert t1.status == 'passed', "Has no success status"

In [4]:
t2 = DFHasColumnsAssertion(df, ['site_name', 'site_url', "domain"], strict=False)
assert t2.run() is False, "Assertion success but should fail"
assert t2.status == 'failed', "Has no failure status"

df.loc[:,"domain"] = None
t3 = DFHasColumnsAssertion(df, ['site_name', 'site_url'], strict=True)
assert t3.run() is False, f"Should have failed but True"
assert t3.status == "failed", "Has no failure status"
assert t3.error_message == "Columns: '['domain']' are in df, but not in your list: ['site_name', 'site_url']"

## Explore pandera

In [5]:
import pandera as pa

In [6]:
df_schema = pa.DataFrameSchema({
                                "site_name": pa.Column(str, checks=pa.Check(lambda site_name: site_name.str.isalpha())),
                                "site_url": pa.Column(str, checks=pa.Check(validate_url))})

pa1 = PanderaSchemaAssertion(df_schema, df)

In [8]:
assertions = [t1, t2, t3, pa1]
pipe = TestPipe("test_pipe", assertions=assertions)

In [11]:
pipe.run()

[test_pipe]: Running assertions
	1|DFHasColumns|: [passed]
	2|DFHasColumns|: [passed]
	3|DFHasColumns|: [failed] -> Columns: '['domain']' are in df, but not in your list: ['site_name', 'site_url']
	4|PanderaDataFrameSchema|: [passed]
Successfully ran [3/4] assertions
Done


  is_url = urls.str.contains(regex, regex=True, case=False)
