The purpose of this notebook is to build a catalog/index of FIM maps using BigQuery

In [5]:
import os
import typer

import pandas as pd
from pathlib import Path
from typing import Optional
from datetime import datetime
from google.cloud import storage
from google.cloud import bigquery

from typing import Tuple, Optional

from rich.progress import Progress

In [6]:
def upload_to_gcs(local_file, bucket_name, dest_blob_name):
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(dest_blob_name)
    blob.upload_from_filename(local_file)
    print(f"Uploaded to gs://{bucket_name}/{dest_blob_name}")

In [7]:
def upload_directory_to_gcs_with_progress(local_dir, bucket_name, gcs_prefix=""):
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    # Collect all files to upload
    file_list = []
    for root, _, files in os.walk(local_dir):
        for file_name in files:
            local_path = os.path.join(root, file_name)
            relative_path = os.path.relpath(local_path, local_dir)
            blob_path = os.path.join(gcs_prefix, relative_path).replace("\\", "/")
            file_list.append((local_path, blob_path))

    with Progress() as progress:
        task = progress.add_task("Uploading", total=len(file_list), filename="")
    
        for local_path, blob_path in file_list:
            progress.update(task, filename=os.path.basename(local_path))
            blob = bucket.blob(blob_path)
            blob.upload_from_filename(local_path)
            progress.advance(task)

## Build BigQuery Catalog

In [11]:
gcs_bucket = 'com_res_fim_output'
prefix = '' # TODO: replace with ''
extension = ".cog"

# Connect to the GCS bucket and all the files matching the extension "cog" in
# the all subdirectories. Save these as a list of paths.
print(
    f"Gathering COG files in {gcs_bucket} with prefix '{prefix}' and extension '{extension}'...",
    end="",
)
client = storage.Client()
bucket = client.bucket(gcs_bucket)
blobs = client.list_blobs(bucket, prefix=prefix)
matching_files = []
for blob in blobs:
    if blob.name.endswith(extension):
        matching_files.append(f"gs://{gcs_bucket}/{blob.name}")
print("done")

print(f'Found {len(matching_files)} matching files')

Gathering COG files in com_res_fim_output with prefix '' and extension '.cog'...done
Found 224497 matching files


In [12]:
%%time 

# organize the matching files by reach id
# extract FIM attributes and save them for building
# that catalog later on.
items = []
for url in matching_files:
    filename_parts = url.split('/')[-1].split('__')
    item_id = url.split('/')[-1].replace(extension, "")
    reach_id = int(filename_parts[0])
    stage = float('.'.join(filename_parts[1].split('_')[0:2]))
    flow = float(filename_parts[2].split('_')[0])

    dat = {
        'item_id': item_id,
        'reach_id': reach_id,
        'stage': stage,
        'flow': flow,
        "asset_url": url,
        "public_url": f'https://storage.googleapis.com/{matching_files[0].replace("gs://", "")}'
           }
    
    items.append(dat)

df = pd.DataFrame(items)


CPU times: user 931 ms, sys: 81 ms, total: 1.01 s
Wall time: 1.07 s


In [13]:
df.to_json("fim_catalog_index.jsonl", orient="records", lines=True)

In [14]:
upload_to_gcs("fim_catalog_index.jsonl", "com_res_fim_output", "fim_catalog_index.jsonl")

Uploaded to gs://com_res_fim_output/fim_catalog_index.jsonl


In [15]:
def load_stac_to_bigquery(gcs_uri: str, table_id: str):
    client = bigquery.Client()

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        autodetect=True,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )

    load_job = client.load_table_from_uri(gcs_uri, table_id, job_config=job_config)
    load_job.result()  # Wait for job to complete
    print(f"Loaded {load_job.output_rows} rows to {table_id}")

In [16]:
load_stac_to_bigquery("gs://com_res_fim_output/fim_catalog_index.jsonl", "com-res.flood_data.fim_catalog")

Loaded 224497 rows to com-res.flood_data.fim_catalog


Test searching for a specific file.

In [80]:
%%time

conn = sqlite3.connect("fim_catalog_index.db")
c = conn.cursor()

reach_id = 8584888
stage = 12.5
flow = 3668

results = c.execute("""
    SELECT cog_path
    FROM inundation
    WHERE reach_id = ? AND stage = ? AND flow = ?
""", (reach_id, stage, flow)).fetchall()

for row in results:
    print(row[0])

conn.close()

gs://com_res_fim_output/flood_11010001/11010001_inundation/8584888/8584888__12_5_m__3668_cms_inundation.cog
CPU times: user 1.26 ms, sys: 1.1 ms, total: 2.37 ms
Wall time: 1.18 ms


Test searching for a range of files

In [82]:
%%time

conn = sqlite3.connect("fim_catalog_index.db")
c = conn.cursor()

reach_id = 8584888

results = c.execute("""
    SELECT cog_path
    FROM inundation
    WHERE reach_id = ? 
""", (reach_id,)).fetchall()

for row in results:
    print(row[0])

conn.close()

gs://com_res_fim_output/flood_11010001/11010001_inundation/8584888/8584888__0_5_m__13_cms_inundation.cog
gs://com_res_fim_output/flood_11010001/11010001_inundation/8584888/8584888__1_0_m__40_cms_inundation.cog
gs://com_res_fim_output/flood_11010001/11010001_inundation/8584888/8584888__1_5_m__78_cms_inundation.cog
gs://com_res_fim_output/flood_11010001/11010001_inundation/8584888/8584888__2_0_m__125_cms_inundation.cog
gs://com_res_fim_output/flood_11010001/11010001_inundation/8584888/8584888__2_5_m__180_cms_inundation.cog
gs://com_res_fim_output/flood_11010001/11010001_inundation/8584888/8584888__3_0_m__244_cms_inundation.cog
gs://com_res_fim_output/flood_11010001/11010001_inundation/8584888/8584888__3_5_m__315_cms_inundation.cog
gs://com_res_fim_output/flood_11010001/11010001_inundation/8584888/8584888__4_0_m__394_cms_inundation.cog
gs://com_res_fim_output/flood_11010001/11010001_inundation/8584888/8584888__4_5_m__485_cms_inundation.cog
gs://com_res_fim_output/flood_11010001/11010001_i

Unfortunately, this won't work if we want to access the database remotely :( 

## Query BigQuery

Query a single item

In [17]:
from google.cloud import bigquery

client = bigquery.Client(project="com-res")

In [31]:
%%time 

query = """
SELECT *
FROM `com-res.flood_data.fim_catalog`
WHERE reach_id = @reach_id
ORDER BY stage ASC
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("reach_id", "INT64", 8584970),
    ]
)

query_job = client.query(query, job_config=job_config)

results = dict(files = [],
           flows_cms = [],
           stages_m = [])

for row in query_job:
    results['files'].append(row['public_url'])
    results['stages_m'].append(row['stage'])
    results['flows_cms'].append(row['flow'])

CPU times: user 11.8 ms, sys: 2.96 ms, total: 14.8 ms
Wall time: 930 ms


In [32]:
results

{'files': ['https://storage.googleapis.com/com_res_fim_output/flood_01080103/01080103_inundation/9325928/9325928__0_5_m__36_cms_inundation.cog',
  'https://storage.googleapis.com/com_res_fim_output/flood_01080103/01080103_inundation/9325928/9325928__0_5_m__36_cms_inundation.cog',
  'https://storage.googleapis.com/com_res_fim_output/flood_01080103/01080103_inundation/9325928/9325928__0_5_m__36_cms_inundation.cog',
  'https://storage.googleapis.com/com_res_fim_output/flood_01080103/01080103_inundation/9325928/9325928__0_5_m__36_cms_inundation.cog',
  'https://storage.googleapis.com/com_res_fim_output/flood_01080103/01080103_inundation/9325928/9325928__0_5_m__36_cms_inundation.cog',
  'https://storage.googleapis.com/com_res_fim_output/flood_01080103/01080103_inundation/9325928/9325928__0_5_m__36_cms_inundation.cog',
  'https://storage.googleapis.com/com_res_fim_output/flood_01080103/01080103_inundation/9325928/9325928__0_5_m__36_cms_inundation.cog',
  'https://storage.googleapis.com/com_r