# Exploring pyarrow and SQLite

In [1]:
import pyarrow as pa

import utils

In [2]:
# Paths of the three SQLite files this notebook writes and compares.
# They are relative paths, so they resolve against the current working directory.

# Written in two steps: created from the first table, then appended to with the second.
ARROW_CONSECUTIVE_DATABASE_FILENAME = "databases/arrow_consecutive.sqlite"
# Written in one step from the concatenation of both tables.
ARROW_CONCATENATED_DATABASE_FILENAME = "databases/arrow_concatenated.sqlite"
# Built by the sqlite3 command-line tool, without going through pyarrow.
SQLITE_DATABASE_FILENAME = "databases/direct_sqlite.sqlite"

In [3]:
# Two equivalent ways of declaring a field are shown on purpose.
name_field = pa.field("name", pa.string())  # explicit pa.field; the name=/type= keywords may be omitted
age_field = ("age", pa.int8())              # plain (name, type) tuple; implicitly converted to pa.field

# N.B. bool_ works in schema, but fails with the sqlite driver with:
# NotSupportedError: ADBC_STATUS_NOT_IMPLEMENTED (2): Column 2 has unsupported type bool
schema = pa.schema([name_field, age_field])

In [4]:
# Columns are given positionally, in the order of the fields in `schema` (name, age).
first_batch_columns = [
    ["Alice", "Bob", "Carol"],
    [42, 46, 11],
]
second_batch_columns = [
    ["Dave", "Erin"],
    [73, 49],
]

# Both tables share the same schema.
data_1 = pa.table(first_batch_columns, schema=schema)
data_2 = pa.table(second_batch_columns, schema=schema)

## Create a pyarrow table, write it to SQLite, then append more data

In [5]:
# Both writes target the same database file, so build its URI once.
consecutive_uri = f"file:{ARROW_CONSECUTIVE_DATABASE_FILENAME}"

# Write data_1 in "create" mode, then data_2 in "append" mode, to the "people" table.
rows_created = utils.write_data_to_db(consecutive_uri, "people", data_1, mode="create")
rows_appended = utils.write_data_to_db(consecutive_uri, "people", data_2, mode="append")

# Report whatever write_data_to_db returned (row counts, judging by the recorded output).
print(f"{rows_created} rows created, {rows_appended} rows appended")

3 rows created, 2 rows appended


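## "Concatenate" two pyarrow tables into a new one, write to SQLite
`pa.concat_tables` creates a new table that references the existing data instead of copying it. Writing this concatenated table to SQLite produces a database with the same contents as the commands above, although the resulting file is not byte-for-byte identical (see the comparison further down).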

In [6]:
# Combine both batches into a single table before writing.
concatenated_data = pa.concat_tables([data_1, data_2])

# Pass `mode` by keyword, matching the other write_data_to_db calls in this notebook.
# NOTE(review): assumes the fourth positional parameter of write_data_to_db is `mode` - confirm in utils.
concatenated_rows_created = utils.write_data_to_db(
    f"file:{ARROW_CONCATENATED_DATABASE_FILENAME}",
    "people",
    concatenated_data,
    mode="create",
)

print(f"{concatenated_rows_created} rows created")

5 rows created


## Create an equivalent database directly in SQLite

In [7]:
# SQL script that rebuilds the "people" table from scratch: drop any existing
# table, recreate it, then insert one row per person.
sqlite_statements = [
    "DROP TABLE IF EXISTS people;",
    "CREATE TABLE people (name, age);",
    "INSERT INTO people (name, age) VALUES ('Alice', 42);",
    "INSERT INTO people (name, age) VALUES ('Bob', 46);",
    "INSERT INTO people (name, age) VALUES ('Carol', 11);",
    "INSERT INTO people (name, age) VALUES ('Dave', 73);",
    "INSERT INTO people (name, age) VALUES ('Erin', 49);",
]

# Each statement already ends with ";", so they are joined without a separator.
sqlite_command = "".join(sqlite_statements)

# Run the SQL through the sqlite3 command-line tool. IPython substitutes the
# brace-wrapped Python variables into the line before handing it to the shell.
!sqlite3 "{SQLITE_DATABASE_FILENAME}" "{sqlite_command}" ".exit"

## Comparing all SQLite files
The database files are not byte-for-byte identical, as `diff` reports:

In [8]:
!diff databases/arrow_concatenated.sqlite databases/arrow_consecutive.sqlite
!diff databases/arrow_concatenated.sqlite databases/direct_sqlite.sqlite

Binary files databases/arrow_concatenated.sqlite and databases/arrow_consecutive.sqlite differ
Binary files databases/arrow_concatenated.sqlite and databases/direct_sqlite.sqlite differ


`sqldiff` prints nothing below, meaning it finds no differences in content between the databases:

In [9]:
!sqldiff databases/arrow_concatenated.sqlite databases/arrow_consecutive.sqlite
!sqldiff databases/arrow_concatenated.sqlite databases/direct_sqlite.sqlite

We can also check via ADBC whether the records contained in the databases are identical.

In [10]:
# Compare the concatenated-write database against the create-then-append one.
# URIs are built from the filename constants rather than repeating the paths.
utils.database_equality(
    f"file:{ARROW_CONCATENATED_DATABASE_FILENAME}",
    f"file:{ARROW_CONSECUTIVE_DATABASE_FILENAME}",
)

True

In [11]:
# Compare the concatenated-write database against the one built by the sqlite3 CLI.
# URIs are built from the filename constants rather than repeating the paths.
utils.database_equality(
    f"file:{ARROW_CONCATENATED_DATABASE_FILENAME}",
    f"file:{SQLITE_DATABASE_FILENAME}",
)

True

The database files are not byte-for-byte copies of each other, but their contents are identical for practical purposes.