# Gather Connectivity Search `PathCount` Table

Negar mentioned needing data from a PostgreSQL database archive,
`connectivity-search-pg_dump.sql.gz`, which was created as part of the
project described at
<https://github.com/greenelab/connectivity-search-backend/blob/main/README.md>.
The archive is available at <https://zenodo.org/records/3978766>.
Only the `PathCount` table is needed from this archive, so that single
metapaths can be extracted one at a time (a requirement for other work).

In [1]:
import gzip
import pathlib

import duckdb
import requests
from pyarrow import parquet

from hetionet_utils.sql import extract_and_write_sql_block

# make sure the local data directory is present before anything is written
pathlib.Path("data").mkdir(exist_ok=True)

# download link for the postgresql dump archive
url = (
    "https://zenodo.org/records/3978766"
    "/files/connectivity-search-pg_dump.sql.gz"
    "?download=1"
)

# where the downloaded archive is stored locally
sql_file = "data/connectivity-search-pg_dump.sql.gz"

# number of tables we expect to find inside the dump
expected_table_count = 15

# name of the single table we want to pull out of the archive
target_table_name = "public.dj_hetmech_app_pathcount"

# file used for the duckdb database
duckdb_filename = "data/connectivity-search.duckdb"

In [2]:
# gather postgresql database archive

archive_path = pathlib.Path(sql_file)

# if the file doesn't exist, download it
if not archive_path.exists():
    # Download into a temporary sibling file first. If the transfer is
    # interrupted, no file is left at `sql_file`, so a later run does not
    # mistake a truncated archive for a complete one.
    partial_path = archive_path.with_name(archive_path.name + ".part")

    # Download the file in streaming mode. The context manager releases the
    # connection when done, and the (connect, read) timeout keeps a stalled
    # server from hanging the notebook indefinitely.
    with requests.get(url, stream=True, timeout=(10, 300)) as response:
        # Check if the request was successful
        response.raise_for_status()

        # Write the response content to a file in chunks
        with partial_path.open("wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)

    # Only a fully written download is moved to the final location
    partial_path.replace(archive_path)

archive_path.exists()

True

In [3]:
# list the tables defined in the dump by printing each
# table creation line found in the archive
count = 0
with gzip.open(sql_file, "rt") as dump:
    # lazily keep only the lines which start a table definition
    create_lines = (row for row in dump if "CREATE TABLE" in row)
    for count, row in enumerate(create_lines, start=1):
        print(row)
        # once the expected number of tables has been shown we stop,
        # which avoids reading the remainder of the archive
        if count == expected_table_count:
            break

CREATE TABLE public.auth_group (

CREATE TABLE public.auth_group_permissions (

CREATE TABLE public.auth_permission (

CREATE TABLE public.auth_user (

CREATE TABLE public.auth_user_groups (

CREATE TABLE public.auth_user_user_permissions (

CREATE TABLE public.dj_hetmech_app_degreegroupedpermutation (

CREATE TABLE public.dj_hetmech_app_metanode (

CREATE TABLE public.dj_hetmech_app_metapath (

CREATE TABLE public.dj_hetmech_app_node (

CREATE TABLE public.dj_hetmech_app_pathcount (

CREATE TABLE public.django_admin_log (

CREATE TABLE public.django_content_type (

CREATE TABLE public.django_migrations (

CREATE TABLE public.django_session (



In [4]:
# file which will receive the create table statement
create_table_file = f"create_table.{target_table_name}.sql"

# pull the create table statement for the target table out of the
# archive, reading from the `CREATE TABLE` line through the closing `;`
extract_and_write_sql_block(
    sql_file=sql_file,
    sql_start=f"CREATE TABLE {target_table_name}",
    sql_end=";",
    output_file=create_table_file,
)

True

In [5]:
# load the create table statement from disk and display it
create_sql = pathlib.Path(create_table_file).read_text()

print(create_sql)

CREATE TABLE public.dj_hetmech_app_pathcount (
    id integer NOT NULL,
    path_count integer NOT NULL,
    dwpc double precision NOT NULL,
    p_value double precision,
    metapath_id character varying(20) NOT NULL,
    source_id integer NOT NULL,
    target_id integer NOT NULL,
    dgp_id integer NOT NULL,
    CONSTRAINT dj_hetmech_app_pathcount_path_count_check CHECK ((path_count >= 0))
);



In [6]:
# file which will receive the table's data
copy_data_file = f"copy_data.{target_table_name}.sql"

# pull the data for the target table out of the archive, reading
# from the `COPY` line through the `\.` line
extract_and_write_sql_block(
    sql_file=sql_file,
    sql_start=f"COPY {target_table_name}",
    sql_end="\\.",
    output_file=copy_data_file,
)

True

In [7]:
# remove the first and last lines of the copy file
# as these are the header and data termination lines
# which have no actual values.
# Each of the two lines is only removed when it really is the `COPY ...`
# header or the `\.` terminator, so running this cell a second time
# does not strip actual data rows from the file.
input_file = pathlib.Path(copy_data_file)
# Temporary file with .tmp extension
temp_file = input_file.with_suffix(".tmp")

with input_file.open("r") as infile, temp_file.open("w") as outfile:
    # One-line buffer: a line is written only after the following line has
    # been read, which leaves the final line available for inspection.
    prev_line = None
    for line_number, line in enumerate(infile):
        # Skip the header, but only if the first line is the COPY statement
        if line_number == 0 and line.startswith("COPY "):
            continue
        # Write the previous line
        if prev_line is not None:
            outfile.write(prev_line)
        # Update the previous line buffer
        prev_line = line

    # The last line is in `prev_line`; it is dropped when it is the data
    # termination marker and kept when it is a data row.
    if prev_line is not None and prev_line.strip() != "\\.":
        outfile.write(prev_line)

# Replace the original file with the temporary file
temp_file.replace(input_file)

PosixPath('copy_data.public.dj_hetmech_app_pathcount.sql')

In [8]:
# create the table within the duckdb database,
# removing the `public.` qualifier from the PostgreSQL statement first
duckdb_create_sql = create_sql.replace("public.", "")
with duckdb.connect(duckdb_filename) as ddb:
    ddb.execute(duckdb_create_sql)

In [9]:
# copy the data from the file to duckdb database
# as tab-delimited file.
# PostgreSQL's text-format COPY data writes NULL values as `\N`, so that
# marker is declared through NULLSTR; the table has a nullable column
# (`p_value`) whose NULLs would otherwise not be recognized as NULL.
with duckdb.connect(duckdb_filename) as ddb:
    ddb.execute(
        f"""
        COPY dj_hetmech_app_pathcount
        FROM '{copy_data_file}'
        (DELIMITER '\t', HEADER false, NULLSTR '\\N');
        """
    )

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [10]:
# export the table to parquet for simpler use

# query which selects every row and column of the table
pathcount_query = """
        SELECT *
        FROM dj_hetmech_app_pathcount
        """

# run the query and gather the result through `.arrow()`
with duckdb.connect(duckdb_filename) as ddb:
    tbl_pathcount = ddb.execute(pathcount_query).arrow()

# write the result to parquet, using zstd as it
# compresses more than snappy
parquet.write_table(
    tbl_pathcount,
    "data/dj_hetmech_app_pathcount.parquet",
    compression="zstd",
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))