# How does pyarrow handle metadata?

In [1]:
import adbc_driver_manager.dbapi
import adbc_driver_sqlite.dbapi
import pyarrow as pa

from typing import Any, Callable

In [2]:
SAMPLE_SQLITE_DATABASE_FILENAME = "databases/sample.sqlite"
RECREATED_DATABASE_FILENAME = "databases/recreated.sqlite"

In [3]:
def execute_on_sqlite(database_uri: str, my_function: Callable, *args: Any, **kwargs: Any) -> Any:
    """Run `my_function` with a cursor on the SQLite database at `database_uri`.

    A connection is opened through `adbc_driver_sqlite.dbapi.connect`, a
    cursor is created from it and handed to `my_function` as its first
    positional argument. After the cursor has been closed the connection is
    committed. If `my_function` raises, the exception propagates and no
    commit is issued.

    Args:
        database_uri: value passed unchanged to `adbc_driver_sqlite.dbapi.connect`
        my_function: callable invoked as `my_function(cursor, *args, **kwargs)`
        *args: extra positional arguments forwarded to `my_function`
        **kwargs: extra keyword arguments forwarded to `my_function`

    Returns:
        Whatever `my_function` returned.
    """
    with adbc_driver_sqlite.dbapi.connect(database_uri) as conn:
        # The cursor context ends before the commit, exactly one call per cursor.
        with conn.cursor() as cur:
            outcome = my_function(cur, *args, **kwargs)
        conn.commit()
    return outcome

In [4]:
def insert_data_into_table(cursor: adbc_driver_manager.dbapi.Cursor,
                           table_name: str,
                           data: pa.lib.Table,
                           mode: str) -> int:
    """Ingest a pyarrow table into the database table `table_name`.

    With `mode == "create"` any existing table of that name is dropped first,
    so the subsequent ingest always starts from a non-existent table. Any
    other `mode` value is passed on to `cursor.adbc_ingest` without a
    preceding drop.

    Args:
        cursor: database cursor used for both the drop and the ingest
        table_name: name of the table to create or append to; it is
            interpolated into the DROP statement as-is
        data: pyarrow table holding the rows to ingest
        mode: "create" a new table, or "append" to an existing one

    Returns:
        The value returned by `cursor.adbc_ingest` (number of rows inserted).
    """
    creating_table = mode == "create"
    if creating_table:
        drop_statement = f"DROP TABLE IF EXISTS {table_name}"
        cursor.execute(drop_statement)

    return cursor.adbc_ingest(table_name, data, mode=mode)

## Population of the Spine database with SQLite
This is a practical example and part of the "Load SpineOpt Template" operation.

In [5]:
sqlite_command = "DROP TABLE IF EXISTS alembic_version;" + \
"CREATE TABLE alembic_version (" + \
"	version_num VARCHAR(32) NOT NULL," + \
"	CONSTRAINT alembic_version_pkc PRIMARY KEY (version_num)" + \
"); " + \
"INSERT INTO alembic_version(rowid,version_num) VALUES(1,'989fccf80441');"


!sqlite3 "{SAMPLE_SQLITE_DATABASE_FILENAME}" "{sqlite_command}" ".exit"

## Re-creation attempt with pyarrow

In [6]:
# Arrow schema with a single non-nullable string column, `version_num`.
version_num_field = pa.field("version_num", pa.string(), nullable=False)
schema = pa.schema([version_num_field])

In [7]:
# One column (matching `schema`) holding a single version string.
data = pa.table([["989fccf80441"]], schema=schema)

In [8]:
# Let ADBC create `alembic_version` in the recreated database from the Arrow table.
rows_inserted = execute_on_sqlite(
    f"file:{RECREATED_DATABASE_FILENAME}",
    insert_data_into_table,
    "alembic_version",
    data,
    mode="create",
)
print(f"{rows_inserted} row(s) inserted")

1 row(s) inserted


## Comparison (pure arrow)

Not surprisingly, an inspection with `sqlitebrowser` shows that the schema differs. The field type is not transferred properly, and neither is `nullable`. There seems to be no way to create constraints, set primary keys or set SQLite-specific data types through `adbc_ingest`.

https://arrow.apache.org/adbc/0.5.1/driver/flight_sql.html#metadata

## Create database first, then populate with pyarrow

In [9]:
RECREATED_SQLITE_THEN_ARROW_DATABASE_FILENAME = "databases/recreated_sqlite_then_arrow.sqlite"

In [10]:
sqlite_command = "DROP TABLE IF EXISTS alembic_version;" + \
"CREATE TABLE alembic_version (" + \
"	version_num VARCHAR(32) NOT NULL," + \
"	CONSTRAINT alembic_version_pkc PRIMARY KEY (version_num)" + \
"); "

!sqlite3 "{RECREATED_SQLITE_THEN_ARROW_DATABASE_FILENAME}" "{sqlite_command}" ".exit"

In [11]:
# Append the Arrow table to the `alembic_version` table that sqlite3 created above.
rows_inserted = execute_on_sqlite(
    f"file:{RECREATED_SQLITE_THEN_ARROW_DATABASE_FILENAME}",
    insert_data_into_table,
    "alembic_version",
    data,
    mode="append",
)
print(f"{rows_inserted} row(s) inserted")

1 row(s) inserted


## Comparison (SQLite, then Arrow)

Creating the table with SQLite first and then populating it with Arrow data via ADBC keeps the schema intact.

In [12]:
!sqldiff databases/recreated_sqlite_then_arrow.sqlite databases/sample.sqlite 