# Evaluating `Pointblank`

https://posit-dev.github.io/pointblank/

The following types of tables are supported:

- Polars DataFrame
- Pandas DataFrame
- DuckDB table
- MySQL table
- PostgreSQL table
- SQLite table
- Parquet

It uses `Narwhals` to work with `Polars` and `Pandas` DataFrames and also integrates with `Ibis` to enable the use of `DuckDB`, `MySQL`, `PostgreSQL`, `SQLite`, `Parquet`, and more!

In [1]:
import math
from pathlib import Path

import duckdb
import pandas as pd
import pointblank as pb

In [2]:
TITANIC_URL = 'https://hbiostat.org/data/repo/titanic3.csv'

In [3]:
def get_or_create_db(db_path='titanic.duckdb', url=TITANIC_URL, force_create=False):
    if not Path(db_path).exists() or force_create:
        # Connect to DuckDB and create a persistent database
        con = duckdb.connect(database=db_path, read_only=False)

        # Read the CSV files directly into DuckDB
        con.execute(f"CREATE TABLE IF NOT EXISTS titanic AS SELECT * FROM read_csv('{url}')")
        # Create a new table with a unique identifier
        con.execute("""
            CREATE TABLE titanic_with_id AS 
            SELECT *, ROW_NUMBER() OVER () AS id 
            FROM titanic
        """)
        # Drop the original table and rename the new table
        con.execute("DROP TABLE titanic")
        con.execute("ALTER TABLE titanic_with_id RENAME TO titanic")
    else:
        # Connect to the existing database
        con = duckdb.connect(database=db_path, read_only=False)
    
    return con

In [4]:
con = get_or_create_db()

In [5]:
con.sql("SELECT COUNT(*) FROM titanic")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         1309 │
└──────────────┘

In [6]:
con.sql("SELECT * FROM titanic LIMIT 10")

┌────────┬──────────┬─────────────────────────────────────────────────┬─────────┬────────┬───────┬───────┬──────────┬──────────┬─────────┬──────────┬─────────┬───────┬─────────────────────────────────┬───────┐
│ pclass │ survived │                      name                       │   sex   │  age   │ sibsp │ parch │  ticket  │   fare   │  cabin  │ embarked │  boat   │ body  │            home.dest            │  id   │
│ int64  │  int64   │                     varchar                     │ varchar │ double │ int64 │ int64 │ varchar  │  double  │ varchar │ varchar  │ varchar │ int64 │             varchar             │ int64 │
├────────┼──────────┼─────────────────────────────────────────────────┼─────────┼────────┼───────┼───────┼──────────┼──────────┼─────────┼──────────┼─────────┼───────┼─────────────────────────────────┼───────┤
│      1 │        1 │ Allen, Miss. Elisabeth Walton                   │ female  │   29.0 │     0 │     0 │ 24160    │ 211.3375 │ B5      │ S        │ 2       │ 

In [7]:
def create_random_samples(con, n_sample):
    # Get the total number of rows in the titanic table
    total_rows = con.sql("SELECT COUNT(*) FROM titanic").fetchone()[0]
    
    # Calculate the number of rows per sample
    rows_per_sample = math.ceil(total_rows / n_sample)
    
    # Get all rows and shuffle them
    all_rows = con.sql("SELECT * FROM titanic").fetchdf()
    shuffled_rows = all_rows.sample(frac=1).reset_index(drop=True)
    
    samples = []
    for i in range(n_sample):
        offset = i * rows_per_sample
        sample = shuffled_rows.iloc[offset:offset + rows_per_sample]
        samples.append(sample)
    
    return samples

In [8]:
n_sample = 5  # Define the number of samples you want
sample = create_random_samples(con, n_sample)

# Display the first sample to verify
sample[0]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,id
0,3,0,"Bostandyeff, Mr. Guentcho",male,26.0,0,0,349224,7.8958,,S,,,"Bulgaria Chicago, IL",678
1,3,0,"Nancarrow, Mr. William Henry",male,33.0,0,0,A./5. 3338,8.0500,,S,,,,1052
2,3,1,"Nilsson, Miss. Berta Olivia",female,18.0,0,0,347066,7.7750,,S,D,,,1061
3,2,1,"Laroche, Miss. Louise",female,1.0,1,2,SC/Paris 2123,41.5792,,C,14,,Paris / Haiti,479
4,2,1,"Hamalainen, Mrs. William (Anna)",female,24.0,0,2,250649,14.5000,,S,4,,"Detroit, MI",429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,2,0,"Hodges, Mr. Henry Price",male,50.0,0,0,250643,13.0000,,S,,149,Southampton,451
258,2,0,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0000,,S,,,"St Ives, Cornwall / Calumet, MI",346
259,3,0,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S,,,,849
260,3,0,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41.0,0,2,370129,20.2125,,S,,,,1159


In [9]:
# Fetch the original dataframe from the database
original_df = con.sql("SELECT * FROM titanic").fetchdf()

# Concatenate all samples into a single dataframe
concatenated_samples = pd.concat(sample).sort_values('id').reset_index(drop=True)

# Check if the dataframes are identical
are_identical = original_df.equals(concatenated_samples)

print(f"Are the original dataframe and concatenated samples identical? {'Yes' if are_identical else 'No'}")


Are the original dataframe and concatenated samples identical? Yes


In [10]:
pb.preview(original_df)

"PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15","PandasRows1,309Columns15"
Unnamed: 0_level_1,pclassint64,survivedint64,nameobject,sexobject,agefloat64,sibspint64,parchint64,ticketobject,farefloat64,cabinobject,embarkedobject,boatobject,bodyInt64,home.destobject,idint64
1,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO",1
2,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",2
3,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",3
4,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",4
5,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",5
1305,3,0,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C,,328.0,,1305
1306,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,,1306
1307,3,0,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C,,304.0,,1307
1308,3,0,"Zakarian, Mr. Ortin",male,27.0,0,0,2670,7.225,,C,,,,1308
1309,3,0,"Zimmerman, Mr. Leo",male,29.0,0,0,315082,7.875,,S,,,,1309


In [25]:
def validate_titanic_data_sample(sample_df):
    
    validation = (
        pb.Validate(data=sample_df, label="Example Validation")
        .col_vals_not_null("survived")
        .col_vals_not_null("pclass")
        .col_vals_not_null("sex")
        .col_vals_not_null("age")
        .col_vals_not_null("ticket")
        .col_vals_not_null("fare")
        .col_vals_not_null("embarked")
        .col_vals_between("age", 0, 70, na_pass=True)
        .col_vals_between("fare", 0, 500)
        .col_vals_in_set("pclass", {1, 2, 3})
        .col_vals_in_set("embarked", {"C", "Q", "S"})
        .col_vals_in_set("survived", {0, 1})
        .col_vals_in_set("sex", {"male", "female"})
        .interrogate()
    )

    return validation

In [26]:
validate_titanic_data_sample(original_df)

Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation
Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas
Unnamed: 0_level_2,Unnamed: 1_level_2,STEP,COLUMNS,VALUES,TBL,EVAL,UNITS,PASS,FAIL,W,E,C,EXT
#4CA64C,1,col_vals_not_null  col_vals_not_null(),survived,—,,✓,1309,1309 1.00,0 0.00,—,—,—,—
#4CA64C,2,col_vals_not_null  col_vals_not_null(),pclass,—,,✓,1309,1309 1.00,0 0.00,—,—,—,—
#4CA64C,3,col_vals_not_null  col_vals_not_null(),sex,—,,✓,1309,1309 1.00,0 0.00,—,—,—,—
#4CA64C66,4,col_vals_not_null  col_vals_not_null(),age,—,,✓,1309,1046 0.80,263 0.20,—,—,—,CSV
#4CA64C,5,col_vals_not_null  col_vals_not_null(),ticket,—,,✓,1309,1309 1.00,0 0.00,—,—,—,—
#4CA64C66,6,col_vals_not_null  col_vals_not_null(),fare,—,,✓,1309,1308 1.00,1 0.00,—,—,—,CSV
#4CA64C66,7,col_vals_not_null  col_vals_not_null(),embarked,—,,✓,1309,1307 1.00,2 0.00,—,—,—,CSV
#4CA64C66,8,col_vals_between  col_vals_between(),age,"[0, 70]",,✓,1309,1303 1.00,6 0.00,—,—,—,CSV
#4CA64C66,9,col_vals_between  col_vals_between(),fare,"[0, 500]",,✓,1309,1304 1.00,5 0.00,—,—,—,CSV
#4CA64C,10,col_vals_in_set  col_vals_in_set(),pclass,"1, 2, 3",,✓,1309,1309 1.00,0 0.00,—,—,—,—


TODO: Use the first sample to "define" the thresholds and then process the remaining samples with thresholds and actions.

https://posit-dev.github.io/pointblank/user-guide/thresholds.html

https://posit-dev.github.io/pointblank/user-guide/actions.html

In [30]:
for i, s in enumerate(sample):
    validation = validate_titanic_data_sample(s)
    display(f"Sample {i+1} - size: {len(s)} rows")
    display(validation)

'Sample 1 - size: 262 rows'

Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation
Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas
Unnamed: 0_level_2,Unnamed: 1_level_2,STEP,COLUMNS,VALUES,TBL,EVAL,UNITS,PASS,FAIL,W,E,C,EXT
#4CA64C,1,col_vals_not_null  col_vals_not_null(),survived,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,2,col_vals_not_null  col_vals_not_null(),pclass,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,3,col_vals_not_null  col_vals_not_null(),sex,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C66,4,col_vals_not_null  col_vals_not_null(),age,—,,✓,262,213 0.81,49 0.19,—,—,—,CSV
#4CA64C,5,col_vals_not_null  col_vals_not_null(),ticket,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,6,col_vals_not_null  col_vals_not_null(),fare,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,7,col_vals_not_null  col_vals_not_null(),embarked,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,8,col_vals_between  col_vals_between(),age,"[0, 70]",,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C66,9,col_vals_between  col_vals_between(),fare,"[0, 500]",,✓,262,260 0.99,2 0.01,—,—,—,CSV
#4CA64C,10,col_vals_in_set  col_vals_in_set(),pclass,"1, 2, 3",,✓,262,262 1.00,0 0.00,—,—,—,—


'Sample 2 - size: 262 rows'

Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation
Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas
Unnamed: 0_level_2,Unnamed: 1_level_2,STEP,COLUMNS,VALUES,TBL,EVAL,UNITS,PASS,FAIL,W,E,C,EXT
#4CA64C,1,col_vals_not_null  col_vals_not_null(),survived,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,2,col_vals_not_null  col_vals_not_null(),pclass,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,3,col_vals_not_null  col_vals_not_null(),sex,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C66,4,col_vals_not_null  col_vals_not_null(),age,—,,✓,262,210 0.80,52 0.20,—,—,—,CSV
#4CA64C,5,col_vals_not_null  col_vals_not_null(),ticket,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,6,col_vals_not_null  col_vals_not_null(),fare,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,7,col_vals_not_null  col_vals_not_null(),embarked,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C66,8,col_vals_between  col_vals_between(),age,"[0, 70]",,✓,262,259 0.99,3 0.01,—,—,—,CSV
#4CA64C,9,col_vals_between  col_vals_between(),fare,"[0, 500]",,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,10,col_vals_in_set  col_vals_in_set(),pclass,"1, 2, 3",,✓,262,262 1.00,0 0.00,—,—,—,—


'Sample 3 - size: 262 rows'

Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation
Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas
Unnamed: 0_level_2,Unnamed: 1_level_2,STEP,COLUMNS,VALUES,TBL,EVAL,UNITS,PASS,FAIL,W,E,C,EXT
#4CA64C,1,col_vals_not_null  col_vals_not_null(),survived,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,2,col_vals_not_null  col_vals_not_null(),pclass,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,3,col_vals_not_null  col_vals_not_null(),sex,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C66,4,col_vals_not_null  col_vals_not_null(),age,—,,✓,262,200 0.76,62 0.24,—,—,—,CSV
#4CA64C,5,col_vals_not_null  col_vals_not_null(),ticket,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,6,col_vals_not_null  col_vals_not_null(),fare,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C66,7,col_vals_not_null  col_vals_not_null(),embarked,—,,✓,262,261 1.00,1 0.00,—,—,—,CSV
#4CA64C66,8,col_vals_between  col_vals_between(),age,"[0, 70]",,✓,262,260 0.99,2 0.01,—,—,—,CSV
#4CA64C66,9,col_vals_between  col_vals_between(),fare,"[0, 500]",,✓,262,261 1.00,1 0.00,—,—,—,CSV
#4CA64C,10,col_vals_in_set  col_vals_in_set(),pclass,"1, 2, 3",,✓,262,262 1.00,0 0.00,—,—,—,—


'Sample 4 - size: 262 rows'

Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation
Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas
Unnamed: 0_level_2,Unnamed: 1_level_2,STEP,COLUMNS,VALUES,TBL,EVAL,UNITS,PASS,FAIL,W,E,C,EXT
#4CA64C,1,col_vals_not_null  col_vals_not_null(),survived,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,2,col_vals_not_null  col_vals_not_null(),pclass,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,3,col_vals_not_null  col_vals_not_null(),sex,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C66,4,col_vals_not_null  col_vals_not_null(),age,—,,✓,262,211 0.81,51 0.19,—,—,—,CSV
#4CA64C,5,col_vals_not_null  col_vals_not_null(),ticket,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C66,6,col_vals_not_null  col_vals_not_null(),fare,—,,✓,262,261 1.00,1 0.00,—,—,—,CSV
#4CA64C,7,col_vals_not_null  col_vals_not_null(),embarked,—,,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C,8,col_vals_between  col_vals_between(),age,"[0, 70]",,✓,262,262 1.00,0 0.00,—,—,—,—
#4CA64C66,9,col_vals_between  col_vals_between(),fare,"[0, 500]",,✓,262,261 1.00,1 0.00,—,—,—,CSV
#4CA64C,10,col_vals_in_set  col_vals_in_set(),pclass,"1, 2, 3",,✓,262,262 1.00,0 0.00,—,—,—,—


'Sample 5 - size: 261 rows'

Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation
Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas,Example ValidationPandas
Unnamed: 0_level_2,Unnamed: 1_level_2,STEP,COLUMNS,VALUES,TBL,EVAL,UNITS,PASS,FAIL,W,E,C,EXT
#4CA64C,1,col_vals_not_null  col_vals_not_null(),survived,—,,✓,261,261 1.00,0 0.00,—,—,—,—
#4CA64C,2,col_vals_not_null  col_vals_not_null(),pclass,—,,✓,261,261 1.00,0 0.00,—,—,—,—
#4CA64C,3,col_vals_not_null  col_vals_not_null(),sex,—,,✓,261,261 1.00,0 0.00,—,—,—,—
#4CA64C66,4,col_vals_not_null  col_vals_not_null(),age,—,,✓,261,212 0.81,49 0.19,—,—,—,CSV
#4CA64C,5,col_vals_not_null  col_vals_not_null(),ticket,—,,✓,261,261 1.00,0 0.00,—,—,—,—
#4CA64C,6,col_vals_not_null  col_vals_not_null(),fare,—,,✓,261,261 1.00,0 0.00,—,—,—,—
#4CA64C66,7,col_vals_not_null  col_vals_not_null(),embarked,—,,✓,261,260 1.00,1 0.00,—,—,—,CSV
#4CA64C66,8,col_vals_between  col_vals_between(),age,"[0, 70]",,✓,261,260 1.00,1 0.00,—,—,—,CSV
#4CA64C66,9,col_vals_between  col_vals_between(),fare,"[0, 500]",,✓,261,260 1.00,1 0.00,—,—,—,CSV
#4CA64C,10,col_vals_in_set  col_vals_in_set(),pclass,"1, 2, 3",,✓,261,261 1.00,0 0.00,—,—,—,—


In [24]:
for i in [4, 5, 6]:
    display(validation.get_data_extracts(i))

{4:       _row_num_  pclass  survived  \
 15           16       1         0   
 37           38       1         1   
 40           41       1         0   
 46           47       1         0   
 59           60       1         1   
 ...         ...     ...       ...   
 1293       1294       3         0   
 1297       1298       3         0   
 1302       1303       3         0   
 1303       1304       3         0   
 1305       1306       3         0   
 
                                                    name     sex  age  sibsp  \
 15                                  Baumann, Mr. John D    male  NaN      0   
 37        Bradley, Mr. George ("George Arthur Brayton")    male  NaN      0   
 40                            Brewe, Dr. Arthur Jackson    male  NaN      0   
 46                                Cairns, Mr. Alexander    male  NaN      0   
 59    Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...  female  NaN      0   
 ...                                                 ...   

{5: Empty DataFrame
 Columns: [_row_num_, pclass, survived, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked, boat, body, home.dest, id]
 Index: []}

{6:       _row_num_  pclass  survived                name   sex   age  sibsp  \
 1225       1226       3         0  Storey, Mr. Thomas  male  60.5      0   
 
       parch ticket  fare cabin embarked  boat  body home.dest    id  
 1225      0   3701   NaN  None        S  None   261      None  1226  }