# Ingestion Experimentation notebook

This notebook exists mainly to experiment with different configuration methods for the ingestion pipeline.

## Configuration

Set the fields below to control how the data is ingested.

In [1]:
from lib import notebook_constants as nc
from lib.service.database.defaults import instance_2_config

# Notebook-wide switches.
#
# drop_raw_nsw_valuer_general_entries:
#     When True, the table `nsw_valuer_general.raw_entries` is dropped.
#     Dropping it makes sense if you are short on space and have no need
#     to debug the data; keep it around if you want to inspect raw values.
# reinitialise_container:
#     When True, the container and database initialisation steps further
#     down the notebook are run; when False they are skipped.
GLOBAL_FLAGS = dict(
    drop_raw_nsw_valuer_general_entries=True,
    reinitialise_container=True,
)

# Database configuration handed to both the container and `DatabaseService`.
db_service_config = instance_2_config

# Name of the Docker container and tag of the image it is created from.
docker_container_name = 'gnaf_db_test'
docker_image_tag = '20240908_19_53'

## Download Static Files

In [2]:
import logging
from lib.service.io import IoService
from lib.tasks.fetch_static_files import initialise, get_session

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

io_service = IoService.create(None)
async with get_session(io_service) as session:
    environment = await initialise(io_service, session)

land_value_dis = environment.land_value
w_sale_price = environment.sale_price_weekly
a_sale_price = environment.sale_price_annual
gnaf_dis = environment.gnaf

2024-10-02 19:49:18,642 - INFO - Checking Target "abs_main_structures.zip"
2024-10-02 19:49:18,642 - INFO - Checking Target "non_abs_shape.zip"
2024-10-02 19:49:18,643 - INFO - Checking Target "g-naf_aug24_allstates_gda2020_psv_1016.zip"
2024-10-02 19:49:18,643 - INFO - Checking Target "nswvg_lv_01_Oct_2024.zip"
2024-10-02 19:49:18,644 - INFO - Checking Target "nswvg_wps_01_Jan_2024.zip"
2024-10-02 19:49:18,644 - INFO - Checking Target "nswvg_wps_08_Jan_2024.zip"
2024-10-02 19:49:18,645 - INFO - Checking Target "nswvg_wps_15_Jan_2024.zip"
2024-10-02 19:49:18,645 - INFO - Checking Target "nswvg_wps_22_Jan_2024.zip"
2024-10-02 19:49:18,646 - INFO - Checking Target "nswvg_wps_29_Jan_2024.zip"
2024-10-02 19:49:18,646 - INFO - Checking Target "nswvg_wps_05_Feb_2024.zip"
2024-10-02 19:49:18,646 - INFO - Checking Target "nswvg_wps_12_Feb_2024.zip"
2024-10-02 19:49:18,647 - INFO - Checking Target "nswvg_wps_19_Feb_2024.zip"
2024-10-02 19:49:18,647 - INFO - Checking Target "nswvg_wps_26_Feb_202

## Create Docker Container and Database

In [3]:
from lib.gnaf_db import GnafContainer, GnafImage
from lib.gnaf.init_schema import init_target_schema
from lib.service.database import DatabaseService

if GLOBAL_FLAGS['reinitialise_container']:
    image = GnafImage.create(tag=docker_image_tag)
    image.prepare()
    
    container = GnafContainer.create(container_name=docker_container_name, image=image)
    container.clean()
    container.prepare(db_service_config)
    container.start()
else:
    print('skipping container initialisation')

db_service = DatabaseService(db_service_config)
await db_service.wait_till_running()

if GLOBAL_FLAGS['reinitialise_container']:
    await init_target_schema(gnaf_dis.publication, io_service, db_service)
else:
    print('skipping DB initialisation')
    raise Exception()

2024-10-02 19:49:24,687 - INFO - running ./_out_zip/g-naf_aug24_allstates_gda2020_psv_1016/G-NAF/Extras/GNAF_TableCreation_Scripts/create_tables_ansi.sql
2024-10-02 19:49:24,718 - INFO - running ./_out_zip/g-naf_aug24_allstates_gda2020_psv_1016/G-NAF/Extras/GNAF_TableCreation_Scripts/add_fk_constraints.sql
2024-10-02 19:49:24,766 - INFO - running sql/move_gnaf_to_schema.sql


## Init DB Schema

In [4]:
from lib.tasks.update_schema import update_schema, UpdateSchemaConfig

await update_schema(
    UpdateSchemaConfig(apply=True),
    db_service,
    io_service,
)

2024-10-02 19:49:24,785 - INFO - initalising nsw_vg db schema
2024-10-02 19:49:24,795 - INFO - running sql/meta/schema/001_APPLY_init.sql
2024-10-02 19:49:24,803 - INFO - running sql/abs/schema/001_APPLY_init.sql
2024-10-02 19:49:24,804 - INFO - running sql/nsw_lrs/schema/001_APPLY_init.sql
2024-10-02 19:49:24,807 - INFO - running sql/nsw_environment/schema/001_APPLY_init.sql
2024-10-02 19:49:24,812 - INFO - running sql/nsw_property/schema/001_APPLY_init.sql
2024-10-02 19:49:24,822 - INFO - running sql/nsw_vg/schema/001_APPLY_init.sql
2024-10-02 19:49:24,824 - INFO - running sql/nsw_vg/schema/002_APPLY_create_raw.sql
2024-10-02 19:49:24,834 - INFO - running sql/nsw_vg/schema/003_APPLY_shared_tables.sql
2024-10-02 19:49:24,846 - INFO - running sql/nsw_vg/schema/004_APPLY_land_value_tables.sql


## Ingest ABS Data

In [5]:
from lib.tasks.ingest_abs import ingest_all

# Run the `ingest_all` task from `lib.tasks.ingest_abs` against the database service.
await ingest_all(db_service)

2024-10-02 19:49:28,271 - INFO - Populated abs_main_structures.state with 10/10 rows.
2024-10-02 19:49:31,105 - INFO - Populated abs_main_structures.gccsa with 35/35 rows.
2024-10-02 19:49:34,893 - INFO - Populated abs_main_structures.sa4 with 108/108 rows.
2024-10-02 19:49:39,466 - INFO - Populated abs_main_structures.sa3 with 359/359 rows.
2024-10-02 19:49:45,869 - INFO - Populated abs_main_structures.sa2 with 2473/2473 rows.
2024-10-02 19:50:01,536 - INFO - Populated abs_main_structures.sa1 with 61845/61845 rows.
2024-10-02 19:50:47,701 - INFO - Populated abs_main_structures.meshblock with 368286/368286 rows.
2024-10-02 19:51:00,971 - INFO - Populated non_abs_main_structures.localities with 15353/15353 rows.
2024-10-02 19:51:05,245 - INFO - Populated non_abs_main_structures.state_electoral_division_2021 with 452/452 rows.
2024-10-02 19:51:09,633 - INFO - Populated non_abs_main_structures.state_electoral_division_2022 with 452/452 rows.
2024-10-02 19:51:14,090 - INFO - Populated non_

## Ingest NSW Valuer General Land Values

In [6]:
from lib.tasks.nsw_vg.ingest_land_values import ingest_land_values, NswVgLandValueIngestionConfig

await ingest_land_values(
    NswVgLandValueIngestionConfig(
        keep_raw=not GLOBAL_FLAGS['drop_raw_nsw_valuer_general_entries'],
    ),
    db_service,
    environment.land_value.latest,
)

2024-10-02 19:51:52,723 - INFO - Step 1: Ingest raw files
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
  df = pd.read_csv(full_file_path, encoding='utf-8')
2024-10-02 19:55:22,034 - INFO - Step 2: Create Valuer General Tables
2024-10-02 19:57:06,265 - INFO - district 128
2024-10-02 19:57:06,276 - INFO - suburb 5074
2024-10-02 19:57:06,297 - INFO - street

## Ingest Gnaf

In [None]:
from lib.tasks.ingest_gnaf import ingest_gnaf
# Run the `ingest_gnaf` task for the G-NAF publication against the database service.
await ingest_gnaf(gnaf_dis.publication, db_service)

2024-10-02 20:13:12,368 - INFO - Populating from Authority_Code_STREET_TYPE_AUT_psv.psv
2024-10-02 20:13:12,370 - INFO - Populating from Authority_Code_ADDRESS_CHANGE_TYPE_AUT_psv.psv
2024-10-02 20:13:12,374 - INFO - Populating from Authority_Code_GEOCODE_TYPE_AUT_psv.psv
2024-10-02 20:13:12,380 - INFO - Populating from Authority_Code_MB_MATCH_CODE_AUT_psv.psv
2024-10-02 20:13:12,380 - INFO - Populating from Authority_Code_FLAT_TYPE_AUT_psv.psv
2024-10-02 20:13:12,381 - INFO - Populating from Authority_Code_GEOCODED_LEVEL_TYPE_AUT_psv.psv
2024-10-02 20:13:12,383 - INFO - Populating from Authority_Code_GEOCODE_RELIABILITY_AUT_psv.psv
2024-10-02 20:13:12,385 - INFO - Populating from Authority_Code_LOCALITY_CLASS_AUT_psv.psv
2024-10-02 20:13:12,385 - INFO - Populating from Authority_Code_LEVEL_TYPE_AUT_psv.psv
2024-10-02 20:13:12,385 - INFO - Populating from Authority_Code_STREET_SUFFIX_AUT_psv.psv
2024-10-02 20:13:12,390 - INFO - Populating from Authority_Code_ADDRESS_ALIAS_TYPE_AUT_psv.

## Done

In [None]:
async with db_service.async_connect() as c, c.cursor() as cursor:
    for schema in ['nsw_valuer_general', 'gnaf', 'abs_main_structures', 'non_abs_main_structures']:
        # Get the list of all tables
        cursor.execute(f"""
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = '{schema}'
        """)
        tables = cursor.fetchall()
    
        # Get row count for each table
        for table in tables:
            await cursor.execute(f'SELECT COUNT(*) FROM {schema}.{table[0]}')
            count = cursor.fetchone()[0]
            print(f"Table {schema}.{table[0]} has {count} rows")