# How does pyarrow handle metadata?

In [1]:
import adbc_driver_manager.dbapi
import adbc_driver_sqlite.dbapi
import pyarrow as pa

from typing import Any, Callable

In [2]:
# Reference database: built further down with the `sqlite3` command-line tool.
SAMPLE_SQLITE_DATABASE_FILENAME = "databases/sample.sqlite"
# Database that is re-created through pyarrow/ADBC ingestion only.
RECREATED_DATABASE_FILENAME = "databases/recreated.sqlite"

In [3]:
def execute_on_sqlite(database_uri: str, my_function: Callable, *args: Any, **kwargs: Any) -> Any:
    """Run `my_function` against the SQLite database at `database_uri`.

    A connection and a cursor are opened for the duration of the call. The
    cursor is handed to `my_function` as its first argument, followed by
    `*args` and `**kwargs`. After the cursor's `with` block has exited,
    `connection.commit()` is called.

    Args:
        database_uri: URI of the SQLite database to connect to
        my_function: callable invoked as `my_function(cursor, *args, **kwargs)`
        *args: positional arguments forwarded to `my_function`
        **kwargs: keyword arguments forwarded to `my_function`

    Returns:
        Whatever `my_function` returns.
    """
    with adbc_driver_sqlite.dbapi.connect(database_uri) as connection:
        with connection.cursor() as cursor:
            outcome = my_function(cursor, *args, **kwargs)
        # Commit only after the callback has finished with its cursor.
        connection.commit()
    return outcome

In [4]:
def insert_data_into_table(cursor: adbc_driver_manager.dbapi.Cursor,
                           table_name: str,
                           data: pa.lib.Table,
                           mode: str) -> int:
    """Create a table and fill it with data or append data to an existing one.

    With `mode == "create"` any existing table called `table_name` is dropped
    first, so the call can be repeated. The data is then handed to
    `cursor.adbc_ingest` with the given `mode`.

    Args:
        cursor: database cursor
        table_name: name of the table to create or append to
        data: pa table
        mode: "create" a new table, or "append" to an existing one; the value
            is forwarded unchanged to `cursor.adbc_ingest`

    Returns:
        Number of rows inserted, as reported by `cursor.adbc_ingest`
    """

    if mode == "create":
        # Quote the name as an SQL identifier (embedded double quotes are
        # doubled) so that names containing spaces, keywords or quotes do not
        # break or alter the DROP statement.
        quoted_table_name = '"' + table_name.replace('"', '""') + '"'
        cursor.execute(f"DROP TABLE IF EXISTS {quoted_table_name}")

    return cursor.adbc_ingest(table_name, data, mode=mode)

In [5]:
def database_equality(uri_database_1: str,
                      uri_database_2: str) -> bool:
    """Check whether two SQLite databases report identical ADBC object metadata.

    For each database the result of `Connection.adbc_get_objects()` is read
    into a pyarrow table, and the two tables are compared with `Table.equals`.
    Only that metadata is compared; this function does not read any rows of
    the user tables.

    Args:
        uri_database_1: URI of the first SQLite database
        uri_database_2: URI of the second SQLite database

    Returns:
        True if both databases yield equal `adbc_get_objects()` results,
        False otherwise.
    """

    def read_objects(database_uri: str) -> pa.lib.Table:
        # Materialize the reader while the connection is still open, rather
        # than relying on the stream staying readable after the connection
        # has been closed.
        with adbc_driver_sqlite.dbapi.connect(database_uri) as connection:
            return connection.adbc_get_objects().read_all()

    objects_1 = read_objects(uri_database_1)
    objects_2 = read_objects(uri_database_2)
    return objects_1.equals(objects_2)

## Population of the Spine database with SQLite
This is a practical example and part of the "Load SpineOpt Template" operation.

In [6]:
sqlite_command = "DROP TABLE IF EXISTS alembic_version;" + \
"CREATE TABLE alembic_version (" + \
"	version_num VARCHAR(32) NOT NULL," + \
"	CONSTRAINT alembic_version_pkc PRIMARY KEY (version_num)" + \
"); " + \
"INSERT INTO alembic_version(rowid,version_num) VALUES(1,'989fccf80441');"


!sqlite3 "{SAMPLE_SQLITE_DATABASE_FILENAME}" "{sqlite_command}" ".exit"

## Re-creation attempt with pyarrow

In [7]:
# One non-nullable string column named like the column of `alembic_version`.
schema = pa.schema([pa.field("version_num", pa.string(), nullable=False)])

In [8]:
# Single-row table: one column (given as a list of values) matching `schema`.
version_num_values = ["989fccf80441"]
data = pa.table([version_num_values], schema=schema)

In [9]:
# Ingest `data` into a freshly created `alembic_version` table.
rows_inserted = execute_on_sqlite(
    f"file:{RECREATED_DATABASE_FILENAME}",
    insert_data_into_table,
    "alembic_version",
    data,
    mode="create",
)
print(f"{rows_inserted} row(s) inserted")

1 row(s) inserted


### Comparison (pure arrow)

Not surprisingly, an inspection with `sqlitebrowser` shows that the schema differs. The field type is not transferred properly, and neither is `nullable`. There seems to be no way to create constraints, set primary keys, or set SQLite-specific data types.

https://arrow.apache.org/adbc/0.5.1/driver/flight_sql.html#metadata

`sqldiff` does not show differences though. Maybe because the content is the same?

In [10]:
# Compare the reference database with the arrow-only re-creation using the
# `sqldiff` command-line tool.
!sqldiff {SAMPLE_SQLITE_DATABASE_FILENAME} {RECREATED_DATABASE_FILENAME}

### Comparison with ADBC (pure arrow)

In [11]:
# Materialize the metadata while the connection is still open instead of
# reading the stream after the connection has been closed.
with adbc_driver_sqlite.dbapi.connect("file:" + RECREATED_DATABASE_FILENAME) as connection:
    objects = connection.adbc_get_objects().read_all()

objects.to_pydict()

{'catalog_name': ['main'],
 'catalog_db_schemas': [[{'db_schema_name': None,
    'db_schema_tables': [{'table_name': 'alembic_version',
      'table_type': 'table',
      'table_columns': [{'column_name': 'version_num',
        'ordinal_position': 1,
        'remarks': None,
        'xdbc_data_type': None,
        'xdbc_type_name': '',
        'xdbc_column_size': None,
        'xdbc_decimal_digits': None,
        'xdbc_num_prec_radix': None,
        'xdbc_nullable': 1,
        'xdbc_column_def': None,
        'xdbc_sql_data_type': None,
        'xdbc_datetime_sub': None,
        'xdbc_char_octet_length': None,
        'xdbc_is_nullable': 'YES',
        'xdbc_scope_catalog': None,
        'xdbc_scope_schema': None,
        'xdbc_scope_table': None,
        'xdbc_is_autoincrement': None,
        'xdbc_is_generatedcolumn': None}],
      'table_constraints': []}]}]]}

In [12]:
# Materialize the metadata while the connection is still open instead of
# reading the stream after the connection has been closed.
with adbc_driver_sqlite.dbapi.connect("file:" + SAMPLE_SQLITE_DATABASE_FILENAME) as connection:
    objects = connection.adbc_get_objects().read_all()

objects.to_pydict()

{'catalog_name': ['main'],
 'catalog_db_schemas': [[{'db_schema_name': None,
    'db_schema_tables': [{'table_name': 'alembic_version',
      'table_type': 'table',
      'table_columns': [{'column_name': 'version_num',
        'ordinal_position': 1,
        'remarks': None,
        'xdbc_data_type': None,
        'xdbc_type_name': 'VARCHAR(32)',
        'xdbc_column_size': None,
        'xdbc_decimal_digits': None,
        'xdbc_num_prec_radix': None,
        'xdbc_nullable': 0,
        'xdbc_column_def': None,
        'xdbc_sql_data_type': None,
        'xdbc_datetime_sub': None,
        'xdbc_char_octet_length': None,
        'xdbc_is_nullable': 'NO',
        'xdbc_scope_catalog': None,
        'xdbc_scope_schema': None,
        'xdbc_scope_table': None,
        'xdbc_is_autoincrement': None,
        'xdbc_is_generatedcolumn': None}],
      'table_constraints': [{'constraint_name': None,
        'constraint_type': 'PRIMARY KEY',
        'constraint_column_names': ['version_num'],
  

In [13]:
# Compare the ADBC metadata of the reference and the arrow-only database.
db_equal_only_adbc = database_equality(
    f"file:{SAMPLE_SQLITE_DATABASE_FILENAME}",
    f"file:{RECREATED_DATABASE_FILENAME}",
)

print(f"Can we re-create an SQlite database with only arrow? {db_equal_only_adbc}")

Can we re-create an SQlite database with only arrow? False


## Create database first, then populate with pyarrow

In [14]:
# Database whose schema is created with the sqlite3 CLI and whose rows are
# then appended through ADBC.
RECREATED_SQLITE_THEN_ARROW_DATABASE_FILENAME = "databases/recreated_sqlite_then_arrow.sqlite"

In [15]:
sqlite_command = "DROP TABLE IF EXISTS alembic_version;" + \
"CREATE TABLE alembic_version (" + \
"	version_num VARCHAR(32) NOT NULL," + \
"	CONSTRAINT alembic_version_pkc PRIMARY KEY (version_num)" + \
"); "

!sqlite3 "{RECREATED_SQLITE_THEN_ARROW_DATABASE_FILENAME}" "{sqlite_command}" ".exit"

In [16]:
# Append `data` to the table that was created with the sqlite3 CLI.
rows_inserted = execute_on_sqlite(
    f"file:{RECREATED_SQLITE_THEN_ARROW_DATABASE_FILENAME}",
    insert_data_into_table,
    "alembic_version",
    data,
    mode="append",
)
print(f"{rows_inserted} row(s) inserted")

1 row(s) inserted


### Comparison (SQLite, then arrow)

Creating the table with SQLite, then populating it with Arrow data via ADBC, keeps the schema intact.

`sqlitebrowser` shows the exact same schema. As expected, `sqldiff` finds no differences either.

In [17]:
!sqldiff databases/recreated_sqlite_then_arrow.sqlite databases/sample.sqlite 

### Comparison with ADBC (SQLite, then arrow)

In [18]:
# Compare the ADBC metadata of the reference and the SQLite-schema/arrow-data database.
db_equal_sqlite_and_adbc = database_equality(
    f"file:{SAMPLE_SQLITE_DATABASE_FILENAME}",
    f"file:{RECREATED_SQLITE_THEN_ARROW_DATABASE_FILENAME}",
)

print(f"Can we re-create an SQlite database with an SQlite schema and arrow data? {db_equal_sqlite_and_adbc}")

Can we re-create an SQlite database with an SQlite schema and arrow data? True
