In [None]:
import math
from pathlib import Path

import duckdb
import pandas as pd
import pointblank as pb

In [None]:
# Remote CSV that get_or_create_db loads into the `titanic` table by default.
TITANIC_URL = 'https://hbiostat.org/data/repo/titanic3.csv'

In [None]:
def get_or_create_db(db_path='titanic.duckdb', url=TITANIC_URL, force_create=False):
    """Open the DuckDB database at ``db_path``, building the ``titanic`` table if needed.

    The table is (re)built from the CSV at ``url`` when ``force_create`` is true
    or when the database does not contain a ``titanic`` table yet. The second
    condition also covers a database file left behind by an earlier build that
    failed part-way (for example because the download failed).

    Parameters
    ----------
    db_path : str
        Path of the persistent DuckDB database file. It is created if missing.
    url : str
        Location of the CSV to load. Defaults to ``TITANIC_URL``.
    force_create : bool
        When true, drop and reload the ``titanic`` table even if it exists.

    Returns
    -------
    duckdb.DuckDBPyConnection
        An open read-write connection. The caller is responsible for closing it.

    Raises
    ------
    Exception
        Any error raised by DuckDB while reading the CSV or creating the table
        is re-raised after the connection has been closed.
    """
    # duckdb.connect creates the database file when it does not exist yet, so a
    # single connection serves both the "create" and the "reuse" path.
    con = duckdb.connect(database=db_path, read_only=False)
    try:
        # Decide on the table, not on the file: an existing file does not
        # guarantee that a previous build actually completed.
        table_exists = con.execute(
            "SELECT 1 FROM information_schema.tables "
            "WHERE table_schema = 'main' AND table_name = 'titanic'"
        ).fetchone() is not None

        if force_create or not table_exists:
            # Double any single quote so the URL cannot end the SQL string literal.
            quoted_url = str(url).replace("'", "''")
            # One statement loads the CSV and adds the surrogate `id` column.
            # OR REPLACE makes a forced rebuild start from fresh data instead of
            # re-deriving from (and clashing with) the existing table.
            con.execute(
                "CREATE OR REPLACE TABLE titanic AS "
                "SELECT *, ROW_NUMBER() OVER () AS id "
                f"FROM read_csv('{quoted_url}')"
            )
    except Exception:
        # Do not leak the connection (and its file lock) when the build fails.
        con.close()
        raise

    return con

In [None]:
# Open the DuckDB connection shared by the rest of the notebook,
# using the default database path and source URL.
con = get_or_create_db()

In [None]:
# Row count of the titanic table, shown as the cell output.
con.sql("SELECT COUNT(*) FROM titanic")

In [None]:
# Peek at ten rows; the query has no ORDER BY, so which ten rows appear is up to DuckDB.
con.sql("SELECT * FROM titanic LIMIT 10")

In [None]:
def create_random_samples(con, n_sample, seed=None):
    """Randomly partition the ``titanic`` table into ``n_sample`` DataFrames.

    All rows are fetched once, shuffled, and cut into consecutive slices whose
    sizes differ by at most one row. Every row lands in exactly one sample, and
    no sample is empty unless the table has fewer rows than ``n_sample``.

    Parameters
    ----------
    con : duckdb.DuckDBPyConnection
        Connection to a database containing a ``titanic`` table.
    n_sample : int
        Number of partitions to produce. Must be at least 1.
    seed : int, optional
        Passed to ``DataFrame.sample`` as ``random_state``. Supply a value to
        get the same partitioning on every run. The default ``None`` keeps the
        shuffle non-deterministic.

    Returns
    -------
    list of pandas.DataFrame
        ``n_sample`` frames. Their index is a slice of the shuffled frame's
        0..N-1 positional index.

    Raises
    ------
    ValueError
        If ``n_sample`` is smaller than 1.
    """
    if n_sample < 1:
        raise ValueError(f"n_sample must be at least 1, got {n_sample!r}")

    # A single query is enough: the row count is the length of the fetched frame.
    all_rows = con.sql("SELECT * FROM titanic").fetchdf()
    shuffled_rows = all_rows.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Balanced split: the first `remainder` samples receive one extra row.
    # Ceil-sized chunks would instead leave trailing samples empty whenever
    # ceil(total / n_sample) * (n_sample - 1) >= total.
    base_size, remainder = divmod(len(shuffled_rows), n_sample)

    samples = []
    start = 0
    for i in range(n_sample):
        stop = start + base_size + (1 if i < remainder else 0)
        samples.append(shuffled_rows.iloc[start:stop])
        start = stop

    return samples

In [None]:
n_sample = 5  # Number of random partitions to split the table into
# `sample` is a list holding one DataFrame per partition (n_sample entries).
sample = create_random_samples(con, n_sample)

# Display the first partition to verify
sample[0]

In [None]:
# Fetch the original dataframe in id order. Without ORDER BY the row order of a
# SELECT is not guaranteed, and the comparison below is position-sensitive.
original_df = con.sql("SELECT * FROM titanic ORDER BY id").fetchdf()

# Concatenate all samples and restore id order with a fresh 0..N-1 index,
# so the result lines up row-for-row with original_df.
concatenated_samples = pd.concat(sample).sort_values('id').reset_index(drop=True)

# Check that partitioning neither lost, duplicated nor altered any row
are_identical = original_df.equals(concatenated_samples)

print(f"Are the original dataframe and concatenated samples identical? {'Yes' if are_identical else 'No'}")


In [None]:
# Render pointblank's preview of the full table as the cell output.
pb.preview(original_df)

In [None]:
# Build the validation plan step by step. Steps are added in the same order as
# they should appear in the report: existence, non-null, range, membership.
validation = pb.Validate(data=original_df, label="Example Validation")

# Columns that must be present in the table.
for column in ("name", "age"):
    validation = validation.col_exists(column)

# Columns that must not contain missing values.
for column in ("survived", "pclass", "sex", "ticket", "fare", "embarked"):
    validation = validation.col_vals_not_null(column)

# Numeric ranges; missing ages are tolerated, missing fares are not.
validation = validation.col_vals_between("age", 0, 70, na_pass=True)
validation = validation.col_vals_between("fare", 0, 500)

# Categorical columns restricted to a fixed set of values.
for column, allowed in (
    ("pclass", {1, 2, 3}),
    ("embarked", {"C", "Q", "S"}),
    ("survived", {0, 1}),
    ("sex", {"male", "female"}),
):
    validation = validation.col_vals_in_set(column, allowed)

# Run every step against the data.
validation = validation.interrogate()

In [None]:
# Display the interrogated validation as the cell output.
validation