# Gather Connectivity Search `PathCount` Table

Negar mentioned needing data from a PostgreSQL database archive, `connectivity-search-pg_dump.sql.gz`, which was created as part of https://github.com/greenelab/connectivity-search-backend/blob/main/README.md .
The archive is available under https://zenodo.org/records/3978766 . Only the `PathCount` Table is needed in order to extract single metapaths at a time (needed for other work).
                                                                                                                  This PR was mentioned as a resource in case it's needed greenelab/connectivity-search-backend#79 .
                                                                                                                                                                                                           

In [5]:
import gzip
import pathlib
from typing import List, Optional

import duckdb
import requests
from pyarrow import parquet

from hetionet_utils.sql import extract_and_write_sql_block

# create the data dir
pathlib.Path("data").mkdir(exist_ok=True)

# url for source data
url = "https://zenodo.org/records/3978766/files/connectivity-search-pg_dump.sql.gz?download=1"
# local archive file location
sql_file = "data/connectivity-search-pg_dump.sql.gz"
# table which is targeted within the sql archive above
target_table_name = "public.dj_hetmech_app_pathcount"
# duckdb filename
duckdb_filename = "data/connectivity-search.duckdb"

In [6]:
# gather postgresql database archive

# if the file doesn't exist, download it
if not pathlib.Path(sql_file).exists():
    # Download the file in streaming mode
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Check if the request was successful

    # Write the response content to a file in chunks
    with open(output_file, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                file.write(chunk)

pathlib.Path(sql_file).exists()

True

In [7]:
# show the tables
count = 0
with gzip.open(sql_file, "rt") as f:
    for line in f:
        # seek table creation lines
        if "CREATE TABLE" in line:
            print(line)
            count += 1
            # there are roughly 15 tables
            # so we break here to avoid further processing
            if count == 15:
                break

CREATE TABLE public.auth_group (

CREATE TABLE public.auth_group_permissions (

CREATE TABLE public.auth_permission (

CREATE TABLE public.auth_user (

CREATE TABLE public.auth_user_groups (

CREATE TABLE public.auth_user_user_permissions (

CREATE TABLE public.dj_hetmech_app_degreegroupedpermutation (

CREATE TABLE public.dj_hetmech_app_metanode (

CREATE TABLE public.dj_hetmech_app_metapath (

CREATE TABLE public.dj_hetmech_app_node (

CREATE TABLE public.dj_hetmech_app_pathcount (

CREATE TABLE public.django_admin_log (

CREATE TABLE public.django_content_type (

CREATE TABLE public.django_migrations (

CREATE TABLE public.django_session (



In [8]:
# gather the create table statement
extract_and_write_sql_block(
    sql_file=sql_file,
    sql_start=f"CREATE TABLE {target_table_name}",
    sql_end=";",
    output_file=(create_table_file := f"create_table.{target_table_name}.sql"),
)

'create_table.public.dj_hetmech_app_pathcount.sql'

In [12]:
# show the create table statement
with open(create_table_file, "r") as create_file:
    print("".join(create_file.readlines()))

CREATE TABLE public.dj_hetmech_app_pathcount (
    id integer NOT NULL,
    path_count integer NOT NULL,
    dwpc double precision NOT NULL,
    p_value double precision,
    metapath_id character varying(20) NOT NULL,
    source_id integer NOT NULL,
    target_id integer NOT NULL,
    dgp_id integer NOT NULL,
    CONSTRAINT dj_hetmech_app_pathcount_path_count_check CHECK ((path_count >= 0))
);



In [18]:
# gather the data for the table
# note: we avoid printing the file below as it is
# relatively large at ~14.3 GB.
extract_and_write_sql_block(
    sql_file=sql_file,
    sql_start=f"COPY {target_table_name}",
    sql_end="\\.",
    output_file=f"copy_data.{target_table_name}.sql",
)

'copy_data.public.dj_hetmech_app_pathcount.sql'

In [27]:
# create the table
with duckdb.connect(duckdb_filename) as ddb:
    ddb.execute(create_sql.replace("public.", ""))

CatalogException: Catalog Error: Table with name "dj_hetmech_app_pathcount" already exists!

In [26]:
# remove the first and last line of the data to avoid conflicts with a load
# note: we use sed from a macos terminal, which may vary from system to system.
# sed was used here to help avoid unnecessary data duplication and complexity
# in processing through python.
!sed -i '' '1d;$d' copy_data.public.dj_hetmech_app_pathcount.sql

In [28]:
# copy the data from the file to duckdb database
# as tab-delimited file.
with duckdb.connect(duckdb_filename) as ddb:
    ddb.execute(
        """
        COPY dj_hetmech_app_pathcount
        FROM 'copy_data.public.dj_hetmech_app_pathcount.sql'
        (DELIMITER '\t', HEADER false);
        """
    )

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [31]:
# read and export data to parquet for ease of use
with duckdb.connect(duckdb_filename) as ddb:
    tbl_pathcount = ddb.execute(
        """
        SELECT *
        FROM dj_hetmech_app_pathcount
        """
    ).arrow()

parquet.write_table(
    table=tbl_pathcount,
    where="data/dj_hetmech_app_pathcount.parquet",
    # compress with zstd for higher compression than snappy
    compression="zstd",
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))