# Biological Process and Gene Metapath Data Gathering

- Each value from `BP.csv` is a source and each value from `Gene.csv` is a target. 
- Each source + target pairing may have a metapath which is found within `metapaths.csv`.
- For each pair metapath we need the DWPC and p-value stored in a table for reference.
- Ignore metapaths found within `metapaths_ignore.csv`.

In [33]:
import pathlib
from itertools import product
from typing import Generator, Iterator, Tuple

import lancedb
import pyarrow as pa
import pyarrow.csv as csv

import hetionet_utils.database

In [34]:
# create results folder
pathlib.Path("data/results").mkdir(exist_ok=True)

# Initialize your LanceDB database and table
db = lancedb.connect("data/results/bioprocess_and_gene_metapaths")
table_name = "bioprocess_gene_metapath_combinations"

# If the table does not exist, create it with an initial empty DataFrame
if table_name not in db.table_names():
    db.create_table(
        table_name, pd.DataFrame(columns=["source_id", "target_id", "metapath"])
    )

table = db.open_table(table_name)

In [35]:
def load_arrow_table(file_path: str, column_name: str) -> pa.Table:
    """Loads a single-column Arrow Table from a CSV file.

    Args:
        file_path (str): The path to the CSV file.
        column_name (str): The name of the column to extract.

    Returns:
        pa.Table: Arrow Table with the specified column.
    """
    table = csv.read_csv(file_path)
    return table.select([column_name])


def generate_combinations(
    table_bioprocesses: pa.Table, table_genes: pa.Table, table_metapaths: pa.Table
) -> Generator[Tuple[str, str, str], None, None]:
    """Generates all possible combinations of IDs from three Arrow tables.

    Args:
        table_bioprocesses (pa.Table): Arrow Table containing bioprocess IDs in an 'id' column.
        table_genes (pa.Table): Arrow Table containing gene IDs in an 'id' column.
        table_metapaths (pa.Table): Arrow Table containing metapath values in a 'metapath' column.

    Yields:
        Tuple[str, str, str]: A tuple with a bioprocess ID, a gene ID, and a metapath value.
    """
    for combo in product(
        table_bioprocesses["id"].to_pylist(),
        table_genes["id"].to_pylist(),
        table_metapaths["metapath"].to_pylist(),
    ):
        yield combo


def process_in_chunks(
    generator: Iterator[Tuple[str, str, str]], chunk_size: int = 1000
) -> Iterator[pa.Table]:
    """Processes combinations from a generator in smaller chunks as Arrow Tables.

    Args:
        generator (Iterator[Tuple[str, str, str]]): A generator that yields tuples of combinations.
        chunk_size (int, optional): The number of rows per chunk. Defaults to 1000.

    Yields:
        pa.Table: An Arrow Table containing a chunk of combinations with columns ['source_id', 'target_id', 'metapath'].
    """
    chunk = []
    for i, combo in enumerate(generator):
        # Convert each combination to a tuple of strings to avoid None values
        combo = tuple(str(x) if x is not None else "" for x in combo)
        chunk.append(combo)

        if (i + 1) % chunk_size == 0:
            # Create Arrow Table from the chunk
            yield pa.table(
                {
                    "source_id": [row[0] for row in chunk],
                    "target_id": [row[1] for row in chunk],
                    "metapath": [row[2] for row in chunk],
                }
            )
            chunk = []

    # Yield any remaining combinations as an Arrow Table
    if chunk:
        yield pa.table(
            {
                "source_id": [row[0] for row in chunk],
                "target_id": [row[1] for row in chunk],
                "metapath": [row[2] for row in chunk],
            }
        )

In [36]:
# gather metapaths which are not in the metapaths_ignore.csv
df_metapaths = pd.read_csv("data/sources/metapaths.csv")
df_metapaths_ignore = pd.read_csv("data/sources/metapaths_ignore.csv")
df_metapaths = df_metapaths[
    ~df_metapaths["metapath"].isin(df_metapaths_ignore["metapath"])
]
df_metapaths.head()

Unnamed: 0,metapath
5,BPpGdAdG
6,BPpGdAeG
7,BPpGdAuG
8,BPpGeAdG
9,BPpGeAeG


In [37]:
# read the biological processes and genes
df_bioprocesses = pd.read_csv("data/sources/BP.csv")
df_genes = pd.read_csv("data/sources/Gene.csv")

print("bioprocesses:\n", df_bioprocesses.head(), "\n\n", "genes:\n", df_genes.head())

bioprocesses:
            id                                 name
0  GO:0000002     mitochondrial genome maintenance
1  GO:0000012           single strand break repair
2  GO:0000018      regulation of DNA recombination
3  GO:0000019  regulation of mitotic recombination
4  GO:0000022           mitotic spindle elongation 

 genes:
       id  name
0      1  A1BG
1     10  NAT2
2    100   ADA
3   1000  CDH2
4  10000  AKT3


In [49]:
# Load input CSV files into Arrow Tables
table_bioprocesses = load_arrow_table("data/sources/BP.csv", "id")
table_genes = load_arrow_table("data/sources/Gene.csv", "id")
table_metapaths = pa.Table.from_pandas(df_metapaths)

# Generate combinations
generator = generate_combinations(table_bioprocesses, table_genes, table_metapaths)

count = 1
# Process and print chunks
for chunk_table in process_in_chunks(generator):
    # add the chunk to the table
    print(f"Adding chunk {count}")
    print(type(chunk_table))
    print(chunk_table.schema)
    table.add(chunk_table)
    count += 1
    break

Adding chunk 1
<class 'pyarrow.lib.Table'>
source_id: string
target_id: string
metapath: string


ArrowNotImplementedError: Unsupported cast from string to null using function cast_null

In [47]:
chunk_df

Unnamed: 0,source_id,target_id,metapath
0,GO:0000002,1,BPpGdAdG
1,GO:0000002,1,BPpGdAeG
2,GO:0000002,1,BPpGdAuG
3,GO:0000002,1,BPpGeAdG
4,GO:0000002,1,BPpGeAeG
...,...,...,...
95,GO:0000002,100,BPpGdAeG
96,GO:0000002,100,BPpGdAuG
97,GO:0000002,100,BPpGeAdG
98,GO:0000002,100,BPpGeAeG


In [18]:
chunk_df.to_csv("testing.csv")

In [None]:
# After inserting all chunks, show the shape of the table
num_rows = table.count()
num_columns = len(table.schema().names)

print(f"Table shape: ({num_rows}, {num_columns})")