# BioMedGraphica Pathway

## 1. Data Access  
### Direct Download Links  
**Reactome**: Can be downloaded directly via the link without the need for registration. [Link](https://reactome.org/download/current/ReactomePathways.txt)  
**KEGG**: Fetch data by R  
**Pathway Ontology**: Can be downloaded directly via the link without the need for registration. [Link](https://download.rgd.mcw.edu/ontology/pathway/pathway.obo)  
**ComPath**: Can be downloaded directly via the link without the need for registration. [Link](https://compath.scai.fraunhofer.de/export_mappings)  

### KEGG API

In [None]:
from bioservices import KEGG
import pandas as pd

k = KEGG()
k.organism = "hsa"

# human
pathway_ids = k.pathwayIds

def format_value(value):
    if isinstance(value, list):
        return ';'.join(format_value(item) for item in value)
    elif isinstance(value, dict):
        return ';'.join(f"{k}: {v}" for k, v in value.items())
    else:
        return str(value)

def format_for_dataframe(data):
    return {key: format_value(value) for key, value in data.items()}

full_pathway = pd.DataFrame()

for pid in pathway_ids:
    try:
        pathway_info = k.get(pid)  
        dict_data = k.parse(pathway_info)
        formatted_dict_data = format_for_dataframe(dict_data) 
        df = pd.DataFrame([formatted_dict_data])
        full_pathway = pd.concat([full_pathway, df], ignore_index=True)
    except Exception as e:
        print(f"Error processing pathway {pid}: {e}")

print(full_pathway.head())

full_pathway.to_csv('full_kegg_pathways.csv', index=False)

### WikiPathways API

In [1]:
import requests
import pandas as pd

url = "https://webservice.wikipathways.org/listPathways"
params = {
    "format": "json"
}

response = requests.get(url, params=params)
data = response.json()

pathways = data['pathways']

human_pathways = []

for pathway in pathways:
    if pathway['species'] == "Homo sapiens":
        human_pathways.append({
            'id': pathway['id'],
            'name': pathway['name'],
            'url': pathway['url'],
            'revision': pathway['revision']
        })

df = pd.DataFrame(human_pathways)
df.to_csv('human_pathways.csv', index=False)

## 2. Load Data

### 2.1 Reactome

In [17]:
df_reactome = pd.read_csv('ReactomePathways.txt', sep='\t', header=None, names=['reactome_id', 'Reactome_Name', 'species'])
df_reactome = df_reactome[df_reactome['species'] == 'Homo sapiens']
df_reactome = df_reactome.drop(columns=['species'])
df_reactome

Unnamed: 0,reactome_id,Reactome_Name
10624,R-HSA-164843,2-LTR circle formation
10625,R-HSA-9909438,3-Methylcrotonyl-CoA carboxylase deficiency
10626,R-HSA-9916722,3-hydroxyisobutyryl-CoA hydrolase deficiency
10627,R-HSA-9914274,3-methylglutaconic aciduria
10628,R-HSA-73843,5-Phosphoribose 1-diphosphate biosynthesis
...,...,...
13370,R-HSA-9703009,tamatinib-resistant FLT3 mutants
13371,R-HSA-9702636,tandutinib-resistant FLT3 mutants
13372,R-HSA-199992,trans-Golgi Network Vesicle Budding
13373,R-HSA-192814,vRNA Synthesis


### 2.2 KEGG

In [7]:
df_kegg = pd.read_csv('full_kegg_pathways.csv')

df_kegg_filter = df_kegg[['ENTRY', 'NAME']]
df_kegg_filter.columns = ['kegg_id', 'Name']
df_kegg_filter['Name'] = df_kegg_filter['Name'].str.split(' - ').str[0]
df_kegg_filter['kegg_id'] = df_kegg_filter['kegg_id'].str.split(' ').str[0]
df_kegg_filter = df_kegg_filter.rename(columns={'Name':'KEGG_Name'})
df_kegg_filter

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_kegg_filter['Name'] = df_kegg_filter['Name'].str.split(' - ').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_kegg_filter['kegg_id'] = df_kegg_filter['kegg_id'].str.split(' ').str[0]


Unnamed: 0,kegg_id,KEGG_Name
0,hsa01100,Metabolic pathways
1,hsa01200,Carbon metabolism
2,hsa01210,2-Oxocarboxylic acid metabolism
3,hsa01212,Fatty acid metabolism
4,hsa01230,Biosynthesis of amino acids
...,...,...
360,hsa04934,Cushing syndrome
361,hsa01521,EGFR tyrosine kinase inhibitor resistance
362,hsa01524,Platinum drug resistance
363,hsa01523,Antifolate resistance


### 2.3 WikiPathways

In [18]:
df_wikipathways = pd.read_csv('human_pathways.csv')

df_wikipathways_filter = df_wikipathways.drop(columns=['revision','url'])
df_wikipathways_filter = df_wikipathways_filter.rename(columns={'name':'WikiPathways_Name'})
df_wikipathways_filter

Unnamed: 0,id,WikiPathways_Name
0,WP100,Glutathione metabolism
1,WP106,Alanine and aspartate metabolism
2,WP107,Translation factors
3,WP111,Electron transport chain: OXPHOS system in mit...
4,WP117,"GPCRs, other"
...,...,...
1529,WP734,Serotonin receptor 4/6/7 and NR3C signaling
1530,WP75,Toll-like receptor signaling
1531,WP78,TCA cycle (aka Krebs or citric acid cycle)
1532,WP80,Nucleotide GPCRs


### 2.4 Pathway Ontology

In [8]:
# https://download.rgd.mcw.edu/ontology/pathway/pathway.obo 
# Define the file path
file_path = 'pathway.obo'

# Read the file
with open(file_path, 'r') as file:
    content = file.readlines()

# Initialize a list to store term dictionaries
terms = []

# Initialize a dictionary to store current term attributes
current_term = {}

# Iterate through each line in the file to extract term attributes
for line in content:
    line = line.strip()
    if line == "[Term]":
        # If we encounter a new term, save the previous term
        if current_term:
            terms.append(current_term)
            current_term = {}
    elif line:
        key_value = line.split(": ", 1)
        if len(key_value) == 2:
            key, value = key_value
            current_term[key] = value

# Append the last term
if current_term:
    terms.append(current_term)

# Convert the list of dictionaries to a DataFrame
df_po = pd.DataFrame(terms)
df_po = df_po.drop(columns=['format-version', 'data-version', 'date', 'saved-by', 'auto-generated-by', 'default-namespace', 'ontology'])
df_po.reset_index().drop('index', axis=1)
df_po = df_po.drop(0)
df_po = df_po.drop(1)
df_po = df_po.drop(df_po.index[-1])

df_po_filter = df_po[['id', 'name', 'def']]
df_po_filter = df_po_filter.reset_index().drop('index', axis=1)
df_po_filter = df_po_filter.rename(columns={'name':'PO_Name'})
df_po_filter

Unnamed: 0,id,PO_Name,def
0,PW:0000001,pathway,"""A pathway is a set of inter-connected reacti..."
1,PW:0000002,classic metabolic pathway,"""The various, enzyme-controlled, series of rea..."
2,PW:0000003,signaling pathway,"""The pathways where a signal - hormone, neurot..."
3,PW:0000004,regulatory pathway,"""The pathways that control the processes by wh..."
4,PW:0000005,carbohydrate metabolic pathway,"""Those metabolic reactions and pathways involv..."
...,...,...,...
2672,PW:0002680,NADPH regeneration pathway,"""Those metabolic reactions involved in the gen..."
2673,PW:0002681,very long-chain fatty acid beta degradation pa...,"""Fatty acid degradation via beta oxidation, ac..."
2674,PW:0002682,muscle contraction pathway,"""A pathway in which force is generated within ..."
2675,PW:0002683,striated muscle contraction pathway,"""A pathway in which force is generated within ..."


In [9]:
import re
# Convert NaNs to empty strings for processing
df_po_filter['def'] = df_po_filter['def'].fillna('')

# Function to extract text within square brackets
def extract_square_brackets(text):
    match = re.search(r'\[(.*?)\]', text)
    if match:
        return match.group(1)
    return None

df_po_xref = df_po_filter.copy()
# Apply the function to create a new column 'brackets_content'
df_po_xref['xref'] = df_po_xref['def'].apply(extract_square_brackets)
# Remove the square brackets content from the 'def' column
df_po_xref['def'] = df_po_xref['def'].apply(lambda x: re.sub(r'\[.*?\]', '', x))
# Remove leading and trailing quotes from the 'def' column
df_po_xref['def'] = df_po_xref['def'].str.replace('"', '')

df_po_xref

Unnamed: 0,id,PO_Name,def,xref
0,PW:0000001,pathway,A pathway is a set of inter-connected reactio...,
1,PW:0000002,classic metabolic pathway,"The various, enzyme-controlled, series of reac...","GO:0008152, http://www.onelook.com/ ""OneLook"",..."
2,PW:0000003,signaling pathway,"The pathways where a signal - hormone, neurotr...","GO:0007165, OneLook:www.onelook.com, Reactome:..."
3,PW:0000004,regulatory pathway,The pathways that control the processes by whi...,OneLook:www.onelook.com
4,PW:0000005,carbohydrate metabolic pathway,Those metabolic reactions and pathways involve...,"GO:0005975, OneLook:www.onelook.com, Reactome:..."
...,...,...,...,...
2672,PW:0002680,NADPH regeneration pathway,Those metabolic reactions involved in the gene...,"GO:0006740, Reactome:R-HSA-389542"
2673,PW:0002681,very long-chain fatty acid beta degradation pa...,"Fatty acid degradation via beta oxidation, act...","GO:0140493, Reactome:R-HSA-390247"
2674,PW:0002682,muscle contraction pathway,A pathway in which force is generated within m...,"GO:0006936, Reactome:R-HSA-397014"
2675,PW:0002683,striated muscle contraction pathway,A pathway in which force is generated within s...,"GO:0006941, Reactome:R-HSA-390522"


In [10]:
# Function to filter 'brackets_content' for KEGG and Reactome entries only
def filter_kegg_reactome(text):
    if isinstance(text, str):  # Check if the text is a string
        items = text.split(',')
        filtered_items = [item.strip() for item in items if 'KEGG' in item or 'Reactome' in item]
        return ', '.join(filtered_items)
    return ''

# Apply the function to filter the 'xref' column
df_po_xref['xref'] = df_po_xref['xref'].apply(filter_kegg_reactome)

# Display the modified DataFrame
df_po_xref

Unnamed: 0,id,PO_Name,def,xref
0,PW:0000001,pathway,A pathway is a set of inter-connected reactio...,
1,PW:0000002,classic metabolic pathway,"The various, enzyme-controlled, series of reac...",Reactome:R-HSA-1430728
2,PW:0000003,signaling pathway,"The pathways where a signal - hormone, neurotr...",Reactome:R-HSA-162582
3,PW:0000004,regulatory pathway,The pathways that control the processes by whi...,
4,PW:0000005,carbohydrate metabolic pathway,Those metabolic reactions and pathways involve...,Reactome:R-HSA-71387
...,...,...,...,...
2672,PW:0002680,NADPH regeneration pathway,Those metabolic reactions involved in the gene...,Reactome:R-HSA-389542
2673,PW:0002681,very long-chain fatty acid beta degradation pa...,"Fatty acid degradation via beta oxidation, act...",Reactome:R-HSA-390247
2674,PW:0002682,muscle contraction pathway,A pathway in which force is generated within m...,Reactome:R-HSA-397014
2675,PW:0002683,striated muscle contraction pathway,A pathway in which force is generated within s...,Reactome:R-HSA-390522


In [11]:
# Function to extract KEGG entries
def extract_kegg(text):
    if isinstance(text, str) and 'KEGG' in text:
        items = [item.strip() for item in text.split(',') if 'KEGG' in item]
        return ', '.join(items)
    return None

# Function to extract Reactome entries
def extract_reactome(text):
    if isinstance(text, str) and 'Reactome' in text:
        items = [item.strip() for item in text.split(',') if 'Reactome' in item]
        return ', '.join(items)
    return None

df_po_xref_split = df_po_xref.copy()
# Create separate columns for KEGG and Reactome
df_po_xref_split['KEGG'] = df_po_xref_split['xref'].apply(extract_kegg)
df_po_xref_split['Reactome'] = df_po_xref_split['xref'].apply(extract_reactome)

# Display the modified DataFrame
df_po_xref_split

Unnamed: 0,id,PO_Name,def,xref,KEGG,Reactome
0,PW:0000001,pathway,A pathway is a set of inter-connected reactio...,,,
1,PW:0000002,classic metabolic pathway,"The various, enzyme-controlled, series of reac...",Reactome:R-HSA-1430728,,Reactome:R-HSA-1430728
2,PW:0000003,signaling pathway,"The pathways where a signal - hormone, neurotr...",Reactome:R-HSA-162582,,Reactome:R-HSA-162582
3,PW:0000004,regulatory pathway,The pathways that control the processes by whi...,,,
4,PW:0000005,carbohydrate metabolic pathway,Those metabolic reactions and pathways involve...,Reactome:R-HSA-71387,,Reactome:R-HSA-71387
...,...,...,...,...,...,...
2672,PW:0002680,NADPH regeneration pathway,Those metabolic reactions involved in the gene...,Reactome:R-HSA-389542,,Reactome:R-HSA-389542
2673,PW:0002681,very long-chain fatty acid beta degradation pa...,"Fatty acid degradation via beta oxidation, act...",Reactome:R-HSA-390247,,Reactome:R-HSA-390247
2674,PW:0002682,muscle contraction pathway,A pathway in which force is generated within m...,Reactome:R-HSA-397014,,Reactome:R-HSA-397014
2675,PW:0002683,striated muscle contraction pathway,A pathway in which force is generated within s...,Reactome:R-HSA-390522,,Reactome:R-HSA-390522


In [12]:
df_po_xref_split = df_po_xref_split.drop(columns=['xref'])
df_po_xref_split['Reactome'] = df_po_xref_split['Reactome'].str.replace('Reactome:', '')
df_po_xref_split['KEGG'] = df_po_xref_split['KEGG'].str.replace('KEGG:', '')
df_po_xref_split['KEGG'] = df_po_xref_split['KEGG'].str.replace('map', '')
df_po_xref_split['KEGG'] = df_po_xref_split['KEGG'].apply(lambda x: 'hsa' + x if x != None else None)
df_po_xref_split

Unnamed: 0,id,PO_Name,def,KEGG,Reactome
0,PW:0000001,pathway,A pathway is a set of inter-connected reactio...,,
1,PW:0000002,classic metabolic pathway,"The various, enzyme-controlled, series of reac...",,R-HSA-1430728
2,PW:0000003,signaling pathway,"The pathways where a signal - hormone, neurotr...",,R-HSA-162582
3,PW:0000004,regulatory pathway,The pathways that control the processes by whi...,,
4,PW:0000005,carbohydrate metabolic pathway,Those metabolic reactions and pathways involve...,,R-HSA-71387
...,...,...,...,...,...
2672,PW:0002680,NADPH regeneration pathway,Those metabolic reactions involved in the gene...,,R-HSA-389542
2673,PW:0002681,very long-chain fatty acid beta degradation pa...,"Fatty acid degradation via beta oxidation, act...",,R-HSA-390247
2674,PW:0002682,muscle contraction pathway,A pathway in which force is generated within m...,,R-HSA-397014
2675,PW:0002683,striated muscle contraction pathway,A pathway in which force is generated within s...,,R-HSA-390522


In [13]:
import re

# Define a function to check if the KEGG entry is in the correct format
def validate_kegg(kegg_entry):
    if pd.isna(kegg_entry):
        return kegg_entry
    elif re.match(r'^[a-z]{3}\d{5}$', kegg_entry):
        return kegg_entry
    else:
        return None

# Apply the function to the KEGG column
df_po_xref_split['KEGG'] = df_po_xref_split['KEGG'].apply(validate_kegg)
df_po_xref_split = df_po_xref_split.drop(columns=['def'])
df_po_xref_split

Unnamed: 0,id,PO_Name,KEGG,Reactome
0,PW:0000001,pathway,,
1,PW:0000002,classic metabolic pathway,,R-HSA-1430728
2,PW:0000003,signaling pathway,,R-HSA-162582
3,PW:0000004,regulatory pathway,,
4,PW:0000005,carbohydrate metabolic pathway,,R-HSA-71387
...,...,...,...,...
2672,PW:0002680,NADPH regeneration pathway,,R-HSA-389542
2673,PW:0002681,very long-chain fatty acid beta degradation pa...,,R-HSA-390247
2674,PW:0002682,muscle contraction pathway,,R-HSA-397014
2675,PW:0002683,striated muscle contraction pathway,,R-HSA-390522


### 2.5 ComPath

In [19]:
# https://compath.scai.fraunhofer.de/export_mappings 
df_mapping = pd.read_csv('curated_mappings.tsv', sep='\t', header=None, names=['name1', 'id1', 'source1', 'relation', 'name2', 'id2', 'source2'])
df_mapping['id1'] = df_mapping['id1'].str.replace('path:', '')
df_mapping['id2'] = df_mapping['id2'].str.replace('path:', '')
df_mapping

Unnamed: 0,name1,id1,source1,relation,name2,id2,source2
0,2-Oxocarboxylic acid metabolism - Homo sapiens...,hsa01210,kegg,isPartOf,Amino Acid metabolism,WP3925,wikipathways
1,AMPK signaling pathway - Homo sapiens (human),hsa04152,kegg,equivalentTo,AMP-activated Protein Kinase (AMPK) Signaling,WP1403,wikipathways
2,Leptin and adiponectin,WP3934,wikipathways,isPartOf,Adipocytokine signaling pathway - Homo sapiens...,hsa04920,kegg
3,"Alanine, aspartate and glutamate metabolism - ...",hsa00250,kegg,isPartOf,Amino Acid metabolism,WP3925,wikipathways
4,Alanine and aspartate metabolism,WP106,wikipathways,isPartOf,"Alanine, aspartate and glutamate metabolism - ...",hsa00250,kegg
...,...,...,...,...,...,...,...
1587,"GPCRs, Class B Secretin-like",WP334,wikipathways,isPartOf,"GPCRs, Other",WP117,wikipathways
1588,Monoamine GPCRs,WP58,wikipathways,isPartOf,"GPCRs, Class A Rhodopsin-like",WP455,wikipathways
1589,MAPK Cascade,WP422,wikipathways,isPartOf,MAPK Signaling Pathway,WP382,wikipathways
1590,Aryl Hydrocarbon Receptor,WP2586,wikipathways,equivalentTo,Aryl Hydrocarbon Receptor Pathway,WP2873,wikipathways


In [20]:
df_mapping_equivalent = df_mapping[df_mapping['relation'] == 'equivalentTo']
df_mapping_equivalent = df_mapping_equivalent.drop(df_mapping_equivalent[df_mapping_equivalent['source1'] == df_mapping_equivalent['source2']].index)
df_mapping_equivalent

Unnamed: 0,name1,id1,source1,relation,name2,id2,source2
1,AMPK signaling pathway - Homo sapiens (human),hsa04152,kegg,equivalentTo,AMP-activated Protein Kinase (AMPK) Signaling,WP1403,wikipathways
7,Allograft rejection - Homo sapiens (human),hsa05330,kegg,equivalentTo,Allograft Rejection,WP2328,wikipathways
8,Alzheimer disease - Homo sapiens (human),hsa05010,kegg,equivalentTo,Alzheimers Disease,WP2059,wikipathways
10,Amyotrophic lateral sclerosis (ALS) - Homo sap...,hsa05014,kegg,equivalentTo,Amyotrophic lateral sclerosis (ALS),WP2447,wikipathways
11,Apoptosis - Homo sapiens (human),hsa04210,kegg,equivalentTo,Apoptosis,WP254,wikipathways
...,...,...,...,...,...,...,...
1546,VEGFA-VEGFR2 Pathway,R-HSA-4420097,reactome,equivalentTo,VEGFA-VEGFR2 Signaling Pathway,WP3888,wikipathways
1555,Vitamin D (calciferol) metabolism,R-HSA-196791,reactome,equivalentTo,Vitamin D Metabolism,WP1531,wikipathways
1558,Signaling by WNT,R-HSA-195721,reactome,equivalentTo,Wnt Signaling Pathway,WP428,wikipathways
1573,MicroRNA (miRNA) biogenesis,R-HSA-203927,reactome,equivalentTo,miRNA Biogenesis,WP2338,wikipathways


In [21]:
df_mapping_equivalent_kegg = df_mapping_equivalent[df_mapping_equivalent['source1'] == 'kegg']

df_mapping_equivalent_kegg_wiki = df_mapping_equivalent_kegg[df_mapping_equivalent_kegg['source2'] == 'wikipathways']
df_mapping_equivalent_kegg_wiki = df_mapping_equivalent_kegg_wiki[['id1', 'id2']]
df_mapping_equivalent_kegg_wiki = df_mapping_equivalent_kegg_wiki.rename(columns={'id1': 'kegg_id', 'id2': 'wikipathway_id'})

df_mapping_equivalent_kegg_reactome = df_mapping_equivalent_kegg[df_mapping_equivalent_kegg['source2'] == 'reactome']
df_mapping_equivalent_kegg_reactome = df_mapping_equivalent_kegg_reactome[['id1', 'id2']]
df_mapping_equivalent_kegg_reactome = df_mapping_equivalent_kegg_reactome.rename(columns={'id1': 'kegg_id', 'id2': 'reactome_id'})

df_mapping_equivalent_reactome = df_mapping_equivalent[df_mapping_equivalent['source1'] == 'reactome']
df_mapping_equivalent_reactome_wiki = df_mapping_equivalent_reactome[['id1', 'id2']]
df_mapping_equivalent_reactome_wiki = df_mapping_equivalent_reactome_wiki.rename(columns={'id1': 'reactome_id', 'id2': 'wikipathway_id'})

## 3. Merge Data

In [15]:
# check duplicates inside the dataframe
def merge_column(df, column1, column2, new_column):
    df[column1] = df[column1].fillna('')
    df[column2] = df[column2].fillna('')
    df[new_column] = df.apply(lambda row: f"{row[column1]} {row[column2]}".strip(), axis=1)

    expanded_rows = df[new_column].str.split(expand=True).stack().reset_index(level=1, drop=True)
    expanded_rows.name = new_column

    df = df.drop(columns=[new_column]).join(expanded_rows)
    df.drop(columns=[column1, column2], inplace=True)
    df.drop_duplicates(inplace=True)
    
    return df

def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

### 3.1 Pathway Ontology + KEGG

In [16]:
df_po_kegg = pd.merge(df_po_xref_split, df_kegg_filter, left_on='KEGG', right_on='kegg_id', how='outer')
df_po_kegg = merge_column(df_po_kegg, 'KEGG', 'kegg_id', 'KEGG_ID')
df_po_kegg

Unnamed: 0,id,PO_Name,Reactome,KEGG_Name,KEGG_ID
0,PW:0000025,glycolysis/gluconeogenesis pathway,,Glycolysis / Gluconeogenesis,hsa00010
1,PW:0000640,glycolysis pathway,R-HSA-70171,Glycolysis / Gluconeogenesis,hsa00010
2,PW:0000641,gluconeogenesis pathway,R-HSA-70263,Glycolysis / Gluconeogenesis,hsa00010
3,PW:0000026,citric acid cycle pathway,R-HSA-71403,Citrate cycle (TCA cycle),hsa00020
4,PW:0000045,pentose phosphate pathway,R-HSA-71336,Pentose phosphate pathway,hsa00030
...,...,...,...,...,...
2895,PW:0002680,NADPH regeneration pathway,R-HSA-389542,,
2896,PW:0002681,very long-chain fatty acid beta degradation pa...,R-HSA-390247,,
2897,PW:0002682,muscle contraction pathway,R-HSA-397014,,
2898,PW:0002683,striated muscle contraction pathway,R-HSA-390522,,


### 3.2 Add Reactome

In [22]:
df_po_kegg_reactome = pd.merge(df_po_kegg, df_reactome, left_on='Reactome', right_on='reactome_id', how='outer')
df_po_kegg_reactome = merge_column(df_po_kegg_reactome, 'Reactome', 'reactome_id', 'Reactome_ID')
df_po_kegg_reactome

Unnamed: 0,id,PO_Name,KEGG_Name,KEGG_ID,Reactome_Name,Reactome_ID
0,PW:0000516,interleukin-6 signaling pathway,,,Interleukin-6 signaling,R-HSA-1059683
1,PW:0000009,apoptotic cell death pathway,Apoptosis,hsa04210,Apoptosis,R-HSA-109581
2,PW:0000475,hemostasis pathway,,,Hemostasis,R-HSA-109582
3,PW:0000104,intrinsic apoptotic pathway,Apoptosis,hsa04210,Intrinsic Pathway for Apoptosis,R-HSA-109606
4,,,,,PKB-mediated events,R-HSA-109703
...,...,...,...,...,...,...
5336,PW:0002639,heme transport pathway,,,,
5337,PW:0002640,acrolein response pathway,,,,
5338,PW:0002641,Toll-like receptor 1 signaling pathway,,,,
5339,PW:0002646,Toll-like receptor 6 signaling pathway,,,,


### 3.3 Use equivalent_kegg_reactome to combine

In [23]:
df_po_kegg_reactome['KEGG_ID'] = df_po_kegg_reactome['KEGG_ID'].str.strip()
df_po_kegg_reactome['Reactome_ID'] = df_po_kegg_reactome['Reactome_ID'].str.strip()

# Create dictionaries for quick look-up
kegg_to_reactome = dict(zip(df_mapping_equivalent_kegg_reactome['kegg_id'], df_mapping_equivalent_kegg_reactome['reactome_id']))
reactome_to_kegg = dict(zip(df_mapping_equivalent_kegg_reactome['reactome_id'], df_mapping_equivalent_kegg_reactome['kegg_id']))

# Fill missing KEGG ID using Reactome ID
for idx, row in df_po_kegg_reactome.iterrows():
    if pd.isna(row['KEGG_ID']) and row['Reactome_ID'] in reactome_to_kegg:
        df_po_kegg_reactome.at[idx, 'KEGG_ID'] = reactome_to_kegg[row['Reactome_ID']]
        
# Fill missing Reactome ID using KEGG ID
for idx, row in df_po_kegg_reactome.iterrows():
    if pd.isna(row['Reactome_ID']) and row['KEGG_ID'] in kegg_to_reactome:
        df_po_kegg_reactome.at[idx, 'Reactome_ID'] = kegg_to_reactome[row['KEGG_ID']]

df_po_kegg_reactome

Unnamed: 0,id,PO_Name,KEGG_Name,KEGG_ID,Reactome_Name,Reactome_ID
0,PW:0000516,interleukin-6 signaling pathway,,,Interleukin-6 signaling,R-HSA-1059683
1,PW:0000009,apoptotic cell death pathway,Apoptosis,hsa04210,Apoptosis,R-HSA-109581
2,PW:0000475,hemostasis pathway,,,Hemostasis,R-HSA-109582
3,PW:0000104,intrinsic apoptotic pathway,Apoptosis,hsa04210,Intrinsic Pathway for Apoptosis,R-HSA-109606
4,,,,,PKB-mediated events,R-HSA-109703
...,...,...,...,...,...,...
5336,PW:0002639,heme transport pathway,,,,
5337,PW:0002640,acrolein response pathway,,,,
5338,PW:0002641,Toll-like receptor 1 signaling pathway,,,,
5339,PW:0002646,Toll-like receptor 6 signaling pathway,,,,


In [24]:
# Function to fill missing values for rows with identical KEGG_ID and Reactome_ID
def fill_missing_values_for_identical_ids(df):
    df_copy = df.copy()
    for idx, row in df.iterrows():
        if pd.notna(row['KEGG_ID']) and pd.notna(row['Reactome_ID']):
            matching_rows = df[(df['KEGG_ID'] == row['KEGG_ID']) & (df['Reactome_ID'] == row['Reactome_ID'])]
            for column in df.columns:
                if pd.isna(row[column]):
                    for _, match_row in matching_rows.iterrows():
                        if pd.notna(match_row[column]):
                            df_copy.at[idx, column] = match_row[column]
                            break
    return df_copy

# Apply the function to fill missing values
df_po_kegg_reactome_compath = fill_missing_values_for_identical_ids(df_po_kegg_reactome)
df_po_kegg_reactome_compath = df_po_kegg_reactome_compath.drop_duplicates()
df_po_kegg_reactome_compath

Unnamed: 0,id,PO_Name,KEGG_Name,KEGG_ID,Reactome_Name,Reactome_ID
0,PW:0000516,interleukin-6 signaling pathway,,,Interleukin-6 signaling,R-HSA-1059683
1,PW:0000009,apoptotic cell death pathway,Apoptosis,hsa04210,Apoptosis,R-HSA-109581
2,PW:0000475,hemostasis pathway,,,Hemostasis,R-HSA-109582
3,PW:0000104,intrinsic apoptotic pathway,Apoptosis,hsa04210,Intrinsic Pathway for Apoptosis,R-HSA-109606
4,,,,,PKB-mediated events,R-HSA-109703
...,...,...,...,...,...,...
5336,PW:0002639,heme transport pathway,,,,
5337,PW:0002640,acrolein response pathway,,,,
5338,PW:0002641,Toll-like receptor 1 signaling pathway,,,,
5339,PW:0002646,Toll-like receptor 6 signaling pathway,,,,


### 3.4 Add WikiPathways

In [28]:
df_wikipathways_kegg = pd.merge(df_wikipathways_filter, df_mapping_equivalent_kegg_wiki, left_on='id', right_on='wikipathway_id', how='outer')
df_wikipathways_kegg = merge_column(df_wikipathways_kegg, 'id', 'wikipathway_id', 'wikipathway_ID')

df_wikipathways_kegg_null = df_wikipathways_kegg[df_wikipathways_kegg['kegg_id'].isnull()]
df_wikipathways_kegg_full = df_wikipathways_kegg.dropna(subset=['kegg_id'])

df_combined_filtered = df_wikipathways_kegg_full.groupby('kegg_id').agg({
    'WikiPathways_Name': lambda x: ';'.join(x.dropna()),
    'wikipathway_ID': lambda x: ';'.join(x.dropna())
}).reset_index()

df_wikipathways_kegg_final = pd.concat([df_combined_filtered, df_wikipathways_kegg_null], axis=0)
df_wikipathways_kegg_final

Unnamed: 0,kegg_id,WikiPathways_Name,wikipathway_ID
0,hsa00010,Glycolysis and gluconeogenesis,WP534
1,hsa00020,TCA cycle (aka Krebs or citric acid cycle),WP78
2,hsa00030,Pentose phosphate metabolism,WP134
3,hsa00061,Fatty acid biosynthesis,WP357
4,hsa00072,,WP311
...,...,...,...
1528,,Monoamine transport,WP727
1529,,Serotonin receptor 2 and ELK-SRF/GATA4 signaling,WP732
1530,,Serotonin receptor 4/6/7 and NR3C signaling,WP734
1533,,Nucleotide GPCRs,WP80


In [29]:
df_wikipathways_kegg_final_null = df_wikipathways_kegg_final[df_wikipathways_kegg_final['kegg_id'].isnull()]
df_wikipathways_kegg_final_full = df_wikipathways_kegg_final.dropna(subset=['kegg_id'])

df_po_kegg_reactome_compath_wikipathway = pd.merge(df_po_kegg_reactome_compath, df_wikipathways_kegg_final_full, left_on='KEGG_ID', right_on='kegg_id', how='outer')
df_po_kegg_reactome_compath_wikipathway = merge_column(df_po_kegg_reactome_compath_wikipathway, 'KEGG_ID', 'kegg_id', 'KEGG_id')
df_po_kegg_reactome_compath_wikipathway

Unnamed: 0,id,PO_Name,KEGG_Name,Reactome_Name,Reactome_ID,WikiPathways_Name,wikipathway_ID,KEGG_id
0,PW:0000640,glycolysis pathway,Glycolysis / Gluconeogenesis,Glycolysis,R-HSA-70171,Glycolysis and gluconeogenesis,WP534,hsa00010
1,PW:0000641,gluconeogenesis pathway,Glycolysis / Gluconeogenesis,Gluconeogenesis,R-HSA-70263,Glycolysis and gluconeogenesis,WP534,hsa00010
2,PW:0000025,glycolysis/gluconeogenesis pathway,Glycolysis / Gluconeogenesis,,,Glycolysis and gluconeogenesis,WP534,hsa00010
3,PW:0000026,citric acid cycle pathway,Citrate cycle (TCA cycle),Citric acid cycle (TCA cycle),R-HSA-71403,TCA cycle (aka Krebs or citric acid cycle),WP78,hsa00020
4,PW:0000045,pentose phosphate pathway,Pentose phosphate pathway,Pentose phosphate pathway,R-HSA-71336,Pentose phosphate metabolism,WP134,hsa00030
...,...,...,...,...,...,...,...,...
5308,PW:0002639,heme transport pathway,,,,,,
5309,PW:0002640,acrolein response pathway,,,,,,
5310,PW:0002641,Toll-like receptor 1 signaling pathway,,,,,,
5311,PW:0002646,Toll-like receptor 6 signaling pathway,,,,,,


In [30]:
df_wikipathways_kegg_final_null = df_wikipathways_kegg_final_null.rename(columns={'kegg_id': 'KEGG_id'})
df_po_kegg_reactome_compath_wikipathway_final = pd.concat([df_po_kegg_reactome_compath_wikipathway, df_wikipathways_kegg_final_null], axis=0)
df_po_kegg_reactome_compath_wikipathway_final

Unnamed: 0,id,PO_Name,KEGG_Name,Reactome_Name,Reactome_ID,WikiPathways_Name,wikipathway_ID,KEGG_id
0,PW:0000640,glycolysis pathway,Glycolysis / Gluconeogenesis,Glycolysis,R-HSA-70171,Glycolysis and gluconeogenesis,WP534,hsa00010
1,PW:0000641,gluconeogenesis pathway,Glycolysis / Gluconeogenesis,Gluconeogenesis,R-HSA-70263,Glycolysis and gluconeogenesis,WP534,hsa00010
2,PW:0000025,glycolysis/gluconeogenesis pathway,Glycolysis / Gluconeogenesis,,,Glycolysis and gluconeogenesis,WP534,hsa00010
3,PW:0000026,citric acid cycle pathway,Citrate cycle (TCA cycle),Citric acid cycle (TCA cycle),R-HSA-71403,TCA cycle (aka Krebs or citric acid cycle),WP78,hsa00020
4,PW:0000045,pentose phosphate pathway,Pentose phosphate pathway,Pentose phosphate pathway,R-HSA-71336,Pentose phosphate metabolism,WP134,hsa00030
...,...,...,...,...,...,...,...,...
1528,,,,,,Monoamine transport,WP727,
1529,,,,,,Serotonin receptor 2 and ELK-SRF/GATA4 signaling,WP732,
1530,,,,,,Serotonin receptor 4/6/7 and NR3C signaling,WP734,
1533,,,,,,Nucleotide GPCRs,WP80,


### 3.5 Final Data Cleaning

In [31]:
df = df_po_kegg_reactome_compath_wikipathway_final.copy()

# filter by id column
po_empty = df[df['id'].isnull()]
po_exist = df.dropna(subset=['id'])

po_groupby = po_exist.groupby('id').agg(lambda x: ';'.join(sorted(set(x.dropna().astype(str))))).replace('',pd.NA).reset_index()

# filter by Reactome column
reactome_empty = po_empty[po_empty['Reactome_ID'].isnull()]
reactome_exist = po_empty.dropna(subset=['Reactome_ID'])

reactome_groupby = reactome_exist.groupby('Reactome_ID').agg(lambda x: ';'.join(sorted(set(x.dropna().astype(str))))).replace('',pd.NA).reset_index()

# filter by wikiPathways column
wiki_empty = reactome_empty[reactome_empty['wikipathway_ID'].isnull()]
wiki_exist = reactome_empty.dropna(subset=['wikipathway_ID'])

wiki_groupby = wiki_exist.groupby('wikipathway_ID').agg(lambda x: ';'.join(sorted(set(x.dropna().astype(str))))).replace('',pd.NA).reset_index()

# filter by KEGG column
kegg_empty = wiki_empty[wiki_empty['KEGG_id'].isnull()]
kegg_exist = wiki_empty.dropna(subset=['KEGG_id'])

kegg_groupby = kegg_exist.groupby('KEGG_id').agg(lambda x: ';'.join(sorted(set(x.dropna().astype(str))))).replace('',pd.NA).reset_index()

# merge all the dataframes
df_final = pd.concat([po_groupby, reactome_groupby, wiki_groupby, kegg_groupby, kegg_empty], axis=0)
df_final

Unnamed: 0,id,PO_Name,KEGG_Name,Reactome_Name,Reactome_ID,WikiPathways_Name,wikipathway_ID,KEGG_id
0,PW:0000001,pathway,,,,,,
1,PW:0000002,classic metabolic pathway,Metabolic pathways,Metabolism,R-HSA-1430728,,,hsa01100
2,PW:0000003,signaling pathway,,Signal Transduction,R-HSA-162582,,,
3,PW:0000004,regulatory pathway,,,,,,
4,PW:0000005,carbohydrate metabolic pathway,Carbon metabolism,Metabolism of carbohydrates,R-HSA-71387,,,hsa01200
...,...,...,...,...,...,...,...,...
179,,,Graft-versus-host disease,,,,,hsa05332
180,,,Primary immunodeficiency,,,,,hsa05340
181,,,Diabetic cardiomyopathy,,,,,hsa05415
182,,,Lipid and atherosclerosis,,,,,hsa05417


## 4. BioMedGraphica ID

In [47]:
biomedgraphica_pathway = df_final.sort_values(by=['id', 'Reactome_ID', 'wikipathway_ID','KEGG_id'], na_position='last')
biomedgraphica_pathway.reset_index(drop=True, inplace=True)

max_length = len(str(len(biomedgraphica_pathway)))
biomedgraphica_pathway['BioMedGraphica_ID'] = ['BMG_PW' + str(i).zfill(max_length) for i in range(1, len(biomedgraphica_pathway) + 1)]

biomedgraphica_pathway.rename(columns={'id': 'PO_ID', 'wikipathway_ID': 'WikiPathways_ID',
                                    'KEGG_id': 'KEGG_ID', 'Name': 'Pathway_Name'}, inplace=True)
columns = ['BioMedGraphica_ID'] + [col for col in biomedgraphica_pathway.columns if col != 'BioMedGraphica_ID']  # re-order columns
biomedgraphica_pathway = biomedgraphica_pathway[columns]
biomedgraphica_pathway

Unnamed: 0,BioMedGraphica_ID,PO_ID,PO_Name,KEGG_Name,Reactome_Name,Reactome_ID,WikiPathways_Name,WikiPathways_ID,KEGG_ID
0,BMG_PW0001,PW:0000001,pathway,,,,,,
1,BMG_PW0002,PW:0000002,classic metabolic pathway,Metabolic pathways,Metabolism,R-HSA-1430728,,,hsa01100
2,BMG_PW0003,PW:0000003,signaling pathway,,Signal Transduction,R-HSA-162582,,,
3,BMG_PW0004,PW:0000004,regulatory pathway,,,,,,
4,BMG_PW0005,PW:0000005,carbohydrate metabolic pathway,Carbon metabolism,Metabolism of carbohydrates,R-HSA-71387,,,hsa01200
...,...,...,...,...,...,...,...,...,...
6788,BMG_PW6789,,,Graft-versus-host disease,,,,,hsa05332
6789,BMG_PW6790,,,Primary immunodeficiency,,,,,hsa05340
6790,BMG_PW6791,,,Diabetic cardiomyopathy,,,,,hsa05415
6791,BMG_PW6792,,,Lipid and atherosclerosis,,,,,hsa05417


In [48]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Pathway.csv'
biomedgraphica_pathway.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Pathway\BioMedGraphica_Pathway.csv


## 5. Description

In [1]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway' / 'BioMedGraphica_Pathway.csv'
biomedgraphica_pathway = pd.read_csv(target_dir, dtype=str)

### 5.1 From Pathway Ontology

In [2]:
import csv

def parse_obo_to_csv(obo_file_path, csv_file_path):
    with open(obo_file_path, 'r') as obo_file:
        lines = obo_file.readlines()

    terms = []
    current_term = {}
    is_in_term_block = False

    for line in lines:
        line = line.strip()

        if line == "[Term]":
            # Save the previous term if it exists
            if current_term:
                terms.append(current_term)
            # Start a new term
            current_term = {}
            is_in_term_block = True
        elif is_in_term_block and line == "":
            # End of the current term block
            if current_term:
                terms.append(current_term)
            current_term = {}
            is_in_term_block = False
        elif is_in_term_block:
            # Parse lines within a term block
            if line.startswith("id: "):
                current_term['id'] = line.split("id: ")[1]
            elif line.startswith("def: "):
                current_term['def'] = line.split("def: ")[1]

    # Add the last term if it exists
    if current_term:
        terms.append(current_term)

    # Write to CSV
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["id", "def"])
        writer.writeheader()
        for term in terms:
            writer.writerow(term)

obo_file = "pathway.obo"
csv_file = "po_def.csv"

parse_obo_to_csv(obo_file, csv_file)

print(f"Finished: {csv_file}")

Finished: po_def.csv


In [3]:
po_def = pd.read_csv('po_def.csv')
po_def['def'] = po_def['def'].str.replace('"', '')
po_def

Unnamed: 0,id,def
0,PW:0000000,
1,PW:0000001,A pathway is a set of inter-connected reactio...
2,PW:0000002,"The various, enzyme-controlled, series of reac..."
3,PW:0000003,"The pathways where a signal - hormone, neurotr..."
4,PW:0000004,The pathways that control the processes by whi...
...,...,...
2674,PW:0002681,"Fatty acid degradation via beta oxidation, act..."
2675,PW:0002682,A pathway in which force is generated within m...
2676,PW:0002683,A pathway in which force is generated within s...
2677,PW:0002684,A pathway in which force is generated within s...


In [4]:
bmg_po = biomedgraphica_pathway[['BioMedGraphica_ID','PO_ID']]

pathway_description_po = pd.merge(bmg_po, po_def, left_on='PO_ID', right_on='id', how='left')
pathway_description_po.drop(columns=['id', 'PO_ID'], inplace=True)
pathway_description_po.rename(columns={'def':'PO'}, inplace=True)
pathway_description_po

Unnamed: 0,BioMedGraphica_ID,PO
0,BMG_PW0001,A pathway is a set of inter-connected reactio...
1,BMG_PW0002,"The various, enzyme-controlled, series of reac..."
2,BMG_PW0003,"The pathways where a signal - hormone, neurotr..."
3,BMG_PW0004,The pathways that control the processes by whi...
4,BMG_PW0005,Those metabolic reactions and pathways involve...
...,...,...
6788,BMG_PW6789,
6789,BMG_PW6790,
6790,BMG_PW6791,
6791,BMG_PW6792,


### 5.2 From KEGG

In [None]:
from bioservices import KEGG
import pandas as pd

k = KEGG()
k.organism = "hsa"

# human
pathway_ids = k.pathwayIds

def format_value(value):
    if isinstance(value, list):
        return ';'.join(format_value(item) for item in value)
    elif isinstance(value, dict):
        return ';'.join(f"{k}: {v}" for k, v in value.items())
    else:
        return str(value)

def format_for_dataframe(data):
    return {key: format_value(value) for key, value in data.items()}

full_pathway = pd.DataFrame()

for pid in pathway_ids:
    try:
        pathway_info = k.get(pid)  
        dict_data = k.parse(pathway_info)
        formatted_dict_data = format_for_dataframe(dict_data) 
        df = pd.DataFrame([formatted_dict_data])
        full_pathway = pd.concat([full_pathway, df], ignore_index=True)
    except Exception as e:
        print(f"Error processing pathway {pid}: {e}")

print(full_pathway.head())

full_pathway.to_csv('full_kegg_pathways.csv', index=False)

In [5]:
kegg_def = pd.read_csv('full_kegg_pathways.csv')
kegg_def = kegg_def[['ENTRY', 'DESCRIPTION']]
kegg_def['ENTRY'] = kegg_def['ENTRY'].str.split(' ').str[0]
kegg_def

Unnamed: 0,ENTRY,DESCRIPTION
0,hsa01100,
1,hsa01200,Carbon metabolism is the most basic aspect of ...
2,hsa01210,"2-Oxocarboxylic acids, also called 2-oxo acids..."
3,hsa01212,
4,hsa01230,This map presents a modular architecture of th...
...,...,...
360,hsa04934,Cushing syndrome (CS) is a rare disorder resul...
361,hsa01521,EGFR is a tyrosine kinase that participates in...
362,hsa01524,"Platinum-based drugs cisplatin, carboplatin an..."
363,hsa01523,"Since the 1940s, antifolates have played a piv..."


In [6]:
bmg_kegg = biomedgraphica_pathway[['BioMedGraphica_ID','KEGG_ID']]
pathway_description_kegg = pd.merge(bmg_kegg, kegg_def, left_on='KEGG_ID', right_on='ENTRY', how='left')
pathway_description_kegg.drop(columns=['ENTRY', 'KEGG_ID'], inplace=True)
pathway_description_kegg.rename(columns={'DESCRIPTION':'KEGG'}, inplace=True)
pathway_description_kegg

Unnamed: 0,BioMedGraphica_ID,KEGG
0,BMG_PW0001,
1,BMG_PW0002,
2,BMG_PW0003,
3,BMG_PW0004,
4,BMG_PW0005,Carbon metabolism is the most basic aspect of ...
...,...,...
6788,BMG_PW6789,Graft-versus-host disease (GVHD) is a lethal c...
6789,BMG_PW6790,Primary immunodeficiencies (PIs) are a heterog...
6790,BMG_PW6791,Diabetic cardiomyopathy has been defined as le...
6791,BMG_PW6792,Atherosclerosis is a chronic inflammatory dise...


### 5.3 From Reactome

In [43]:
reactome = biomedgraphica_pathway['Reactome_ID'].dropna().unique()
reactome = pd.DataFrame(reactome, columns=['Reactome_ID'])
reactome

Unnamed: 0,Reactome_ID
0,R-HSA-1430728
1,R-HSA-162582
2,R-HSA-71387
3,R-HSA-5683057
4,R-HSA-195721
...,...
2757,R-HSA-9925561
2758,R-HSA-9926550
2759,R-HSA-9927353
2760,R-HSA-9927354


In [11]:
import requests
import pandas as pd
from tqdm import tqdm

def get_pathway_summation(pathway_id):
    url = f"https://reactome.org/ContentService/data/query/{pathway_id}/Summation"
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text.strip()
    except Exception as e:
        return f"Error: {e}"

summations = []
print("Fetching Summations...")
for pathway_id in tqdm(reactome["Reactome_ID"], desc="Progress", unit="pathway"):
    summation = get_pathway_summation(pathway_id)
    summations.append(summation)

reactome["Summation"] = summations

output_file = "reactome_def.csv"
reactome.to_csv(output_file, index=False)
print(f"Results saved to '{output_file}'")

Fetching Summations...


Progress: 100%|██████████| 2762/2762 [06:48<00:00,  6.75pathway/s]

Results saved to 'reactome_def.csv'





In [7]:
import re

def extract_content(summation):
    match = re.match(r"^\d+\s+(.*?)\s+Summation$", summation)
    if match:
        return match.group(1)
    return summation

reactome_df = pd.read_csv('reactome_def.csv')
reactome_df = reactome_df[~reactome_df['Summation'].str.contains('Error', case = False, na = False)]
reactome_df['Summation'] = reactome_df['Summation'].apply(extract_content)
reactome_df

Unnamed: 0,Reactome_ID,Summation
0,R-HSA-1430728,Metabolic processes in human cells generate en...
1,R-HSA-162582,Signal transduction is a process in which extr...
2,R-HSA-71387,Starches and sugars are major constituents of ...
3,R-HSA-5683057,The mitogen activated protein kinases (MAPKs) ...
4,R-HSA-195721,WNT signaling pathways control a wide range of...
...,...,...
2757,R-HSA-9925561,"The exocrine pancreas, which comprises more th..."
2758,R-HSA-9926550,Studies in melanoma cells have identified MITF...
2759,R-HSA-9927353,BTLA (B and T Lymphocyte Attenuator) is a co-i...
2760,R-HSA-9927354,ICOS (Inducible T-cell COStimulator) is a crit...


In [8]:
bmg_reactome = biomedgraphica_pathway[['BioMedGraphica_ID','Reactome_ID']]
pathway_description_reactome = pd.merge(bmg_reactome, reactome_df, on = 'Reactome_ID', how = 'left')
pathway_description_reactome = pathway_description_reactome.drop(columns=['Reactome_ID'])
pathway_description_reactome = pathway_description_reactome.rename(columns={'Summation': 'Reactome'})
pathway_description_reactome

Unnamed: 0,BioMedGraphica_ID,Reactome
0,BMG_PW0001,
1,BMG_PW0002,Metabolic processes in human cells generate en...
2,BMG_PW0003,Signal transduction is a process in which extr...
3,BMG_PW0004,
4,BMG_PW0005,Starches and sugars are major constituents of ...
...,...,...
6788,BMG_PW6789,
6789,BMG_PW6790,
6790,BMG_PW6791,
6791,BMG_PW6792,


### 5.4 Final Description

In [9]:
pathway_description = pd.merge(pathway_description_po, pathway_description_kegg)
pathway_description = pd.merge(pathway_description, pathway_description_reactome)
pathway_description

Unnamed: 0,BioMedGraphica_ID,PO,KEGG,Reactome
0,BMG_PW0001,A pathway is a set of inter-connected reactio...,,
1,BMG_PW0002,"The various, enzyme-controlled, series of reac...",,Metabolic processes in human cells generate en...
2,BMG_PW0003,"The pathways where a signal - hormone, neurotr...",,Signal transduction is a process in which extr...
3,BMG_PW0004,The pathways that control the processes by whi...,,
4,BMG_PW0005,Those metabolic reactions and pathways involve...,Carbon metabolism is the most basic aspect of ...,Starches and sugars are major constituents of ...
...,...,...,...,...
6788,BMG_PW6789,,Graft-versus-host disease (GVHD) is a lethal c...,
6789,BMG_PW6790,,Primary immunodeficiencies (PIs) are a heterog...,
6790,BMG_PW6791,,Diabetic cardiomyopathy has been defined as le...,
6791,BMG_PW6792,,Atherosclerosis is a chronic inflammatory dise...,


In [10]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Pathway_Description.csv'
pathway_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Pathway\BioMedGraphica_Pathway_Description.csv


### 5.5 Combined Description

In [11]:
comb_description = pathway_description.copy()

# add the column name at the beginning of the string
# first, we need to get the column names
column_names = comb_description.columns.tolist()
column_names = [col for col in column_names if col != 'BioMedGraphica_ID']
# then we can apply the function to each column
for col in column_names:
    comb_description[col] = comb_description[col].apply(lambda x: ' | '.join([f"{col}: {i}" for i in x.split(' | ')]) if pd.notna(x) else x)

# now we can merge the columns into one
comb_description['Description'] = comb_description[column_names].apply(lambda x: ' | '.join(x.dropna()), axis=1)
comb_description = comb_description[['BioMedGraphica_ID', 'Description']]
comb_description

Unnamed: 0,BioMedGraphica_ID,Description
0,BMG_PW0001,PO: A pathway is a set of inter-connected rea...
1,BMG_PW0002,"PO: The various, enzyme-controlled, series of ..."
2,BMG_PW0003,"PO: The pathways where a signal - hormone, neu..."
3,BMG_PW0004,PO: The pathways that control the processes by...
4,BMG_PW0005,PO: Those metabolic reactions and pathways inv...
...,...,...
6788,BMG_PW6789,KEGG: Graft-versus-host disease (GVHD) is a le...
6789,BMG_PW6790,KEGG: Primary immunodeficiencies (PIs) are a h...
6790,BMG_PW6791,KEGG: Diabetic cardiomyopathy has been defined...
6791,BMG_PW6792,KEGG: Atherosclerosis is a chronic inflammator...


In [12]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Pathway_Description_Combined.csv'
comb_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Pathway\BioMedGraphica_Pathway_Description_Combined.csv


## 6. File Generation

In [1]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway' / 'BioMedGraphica_Pathway.csv'
biomedgraphica_pathway = pd.read_csv(target_dir, dtype=str)

### 6.1 Name and ID

GUI Name

In [13]:
def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

gui_name = biomedgraphica_pathway.copy()
gui_name = merge_string_columns(gui_name, ['PO_Name', 'KEGG_Name', 'Reactome_Name', 'WikiPathways_Name'], 'Pathway_Name_List')
gui_name = gui_name[['BioMedGraphica_ID', 'Pathway_Name_List']]
gui_name

Unnamed: 0,BioMedGraphica_ID,Pathway_Name_List
0,BMG_PW0001,pathway
1,BMG_PW0002,Metabolic pathways | Metabolism | classic meta...
2,BMG_PW0003,signaling pathway | Signal Transduction
3,BMG_PW0004,regulatory pathway
4,BMG_PW0005,Metabolism of carbohydrates | Carbon metabolis...
...,...,...
6788,BMG_PW6789,Graft-versus-host disease
6789,BMG_PW6790,Primary immunodeficiency
6790,BMG_PW6791,Diabetic cardiomyopathy
6791,BMG_PW6792,Lipid and atherosclerosis


In [14]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Pathway_GUI_Name.csv'
gui_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Pathway\BioMedGraphica_Pathway_GUI_Name.csv


LLM Name and ID

In [15]:
llm_name_id = biomedgraphica_pathway.copy()

llm_name_id['PO_ID'] = llm_name_id['PO_ID'].apply(
    lambda x: ' | '.join(f"PO ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['KEGG_ID'] = llm_name_id['KEGG_ID'].apply(
    lambda x: ' | '.join(f"KEGG ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['Reactome_ID'] = llm_name_id['Reactome_ID'].apply(
    lambda x: ' | '.join(f"Reactome ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['WikiPathways_ID'] = llm_name_id['WikiPathways_ID'].apply(
    lambda x: ' | '.join(f"WikiPathways ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

column_order = ['BioMedGraphica_ID', 'PO_Name', 'PO_ID', 'Reactome_Name', 'Reactome_ID', 'WikiPathways_Name', 'WikiPathways_ID', 'KEGG_Name', 'KEGG_ID']
llm_name_id = llm_name_id[column_order] 
llm_name_id

Unnamed: 0,BioMedGraphica_ID,PO_Name,PO_ID,Reactome_Name,Reactome_ID,WikiPathways_Name,WikiPathways_ID,KEGG_Name,KEGG_ID
0,BMG_PW0001,pathway,PO ID:PW:0000001,,,,,,
1,BMG_PW0002,classic metabolic pathway,PO ID:PW:0000002,Metabolism,Reactome ID:R-HSA-1430728,,,Metabolic pathways,KEGG ID:hsa01100
2,BMG_PW0003,signaling pathway,PO ID:PW:0000003,Signal Transduction,Reactome ID:R-HSA-162582,,,,
3,BMG_PW0004,regulatory pathway,PO ID:PW:0000004,,,,,,
4,BMG_PW0005,carbohydrate metabolic pathway,PO ID:PW:0000005,Metabolism of carbohydrates,Reactome ID:R-HSA-71387,,,Carbon metabolism,KEGG ID:hsa01200
...,...,...,...,...,...,...,...,...,...
6788,BMG_PW6789,,,,,,,Graft-versus-host disease,KEGG ID:hsa05332
6789,BMG_PW6790,,,,,,,Primary immunodeficiency,KEGG ID:hsa05340
6790,BMG_PW6791,,,,,,,Diabetic cardiomyopathy,KEGG ID:hsa05415
6791,BMG_PW6792,,,,,,,Lipid and atherosclerosis,KEGG ID:hsa05417


In [16]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Pathway_LLM_Name_ID.csv'
llm_name_id.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Pathway\BioMedGraphica_Pathway_LLM_Name_ID.csv


LLM Name and ID Combined

In [17]:
llm_combined = llm_name_id.copy()

def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

llm_combined = merge_string_columns(llm_combined, llm_combined.columns[llm_combined.columns != 'BioMedGraphica_ID'], 'Names_and_IDs')
llm_combined

Unnamed: 0,BioMedGraphica_ID,Names_and_IDs
0,BMG_PW0001,PO ID:PW:0000001 | pathway
1,BMG_PW0002,Metabolism | Reactome ID:R-HSA-1430728 | Metab...
2,BMG_PW0003,signaling pathway | Reactome ID:R-HSA-162582 |...
3,BMG_PW0004,regulatory pathway | PO ID:PW:0000004
4,BMG_PW0005,PO ID:PW:0000005 | KEGG ID:hsa01200 | Metaboli...
...,...,...
6788,BMG_PW6789,Graft-versus-host disease | KEGG ID:hsa05332
6789,BMG_PW6790,Primary immunodeficiency | KEGG ID:hsa05340
6790,BMG_PW6791,KEGG ID:hsa05415 | Diabetic cardiomyopathy
6791,BMG_PW6792,Lipid and atherosclerosis | KEGG ID:hsa05417


In [18]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Pathway_LLM_Name_ID_Combined.csv'
llm_combined.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Pathway\BioMedGraphica_Pathway_LLM_Name_ID_Combined.csv


Display Name

In [19]:
display_name = biomedgraphica_pathway.copy()

display_name['BMG_Pathway_Name'] = display_name['PO_Name'].fillna(display_name['Reactome_Name']).fillna(display_name['WikiPathways_Name']).fillna(display_name['KEGG_Name'])
display_name = display_name[['BioMedGraphica_ID', 'BMG_Pathway_Name']]
display_name

Unnamed: 0,BioMedGraphica_ID,BMG_Pathway_Name
0,BMG_PW0001,pathway
1,BMG_PW0002,classic metabolic pathway
2,BMG_PW0003,signaling pathway
3,BMG_PW0004,regulatory pathway
4,BMG_PW0005,carbohydrate metabolic pathway
...,...,...
6788,BMG_PW6789,Graft-versus-host disease
6789,BMG_PW6790,Primary immunodeficiency
6790,BMG_PW6791,Diabetic cardiomyopathy
6791,BMG_PW6792,Lipid and atherosclerosis


In [20]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Pathway_Display_Name.csv'
display_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Pathway\BioMedGraphica_Pathway_Display_Name.csv
