## Versions

Python: 3.7.2

pandas: 1.3.5

Pip: 23.0.1

Biopython: 1.81

keggtools: 1.0.1

# Retrieve EC codes from Mgx data

Install Pandas package from pip:

> python -m pip install pandas

In [2]:
import pandas as pd

# Load the tab-separated Mgx table
Mgx = pd.read_csv('../analysis/data/mgxData', sep='\t')

# Cut every "Gene.Family" value at each ":"; the first piece (column 0 of the
# expanded frame) is the EC code, kept in Mgx_EC
gene_family_parts = Mgx["Gene.Family"].str.split(":", expand=True)
Mgx_EC = gene_family_parts[0]

# KEGG pathways from EC codes

Install Biopython package from pip:

> python -m pip install Biopython

## Downloading the dictionary of pathways (per enzyme)

This step can be skipped, as the dictionary is already stored in /data

In [None]:
from Bio.KEGG import REST
import json

# Build a dictionary mapping each EC number to the list of KEGG pathways it is
# linked to, then store it as JSON.
pathways_dict = {}
queried_ec_numbers = set()  # EC numbers already sent to KEGG
for ec_number in Mgx_EC:
    # Skip missing values (not strings) and EC numbers that were already
    # queried: repeating a request would only duplicate the stored pathways.
    if not isinstance(ec_number, str) or ec_number in queried_ec_numbers:
        continue
    queried_ec_numbers.add(ec_number)
    # against Bad request error from preliminary EC codes (e.g. "1.3.1.n3" -> Mgx_EC[430] )
    if ec_number.islower():
        continue
    link_lines = REST.kegg_link("pathway", ec_number).read().strip().split("\n")
    if link_lines == [""]:  # against no responses to request
        continue
    for link_line in link_lines:
        # The part before "\tpath:" is "<prefix>:<EC number>"; the part after
        # it is the linked pathway (presumably a KEGG pathway ID - confirm).
        ec_entry, linked_pathway = link_line.split("\tpath:")
        ec_key = ec_entry.split(":")[1]
        pathways_dict.setdefault(ec_key, []).append(linked_pathway)

with open("data/pathways_dict.json", "w") as file:
    json.dump(pathways_dict, file)  # encode dict into JSON

## Loading the dictionary of pathways (per enzyme)

In [3]:
import json

# Read the stored JSON file back into pathways_dict
with open("data/pathways_dict.json", "r") as json_file:
    # Decode the whole file content in one go
    pathways_dict = json.loads(json_file.read())

# Enrichment preparation

The remaining steps will be done in R (cf. enrichment_analysis.Rmd)

## Downloading the dictionary of enzymes (per pathway)

This step can also be skipped, as the dictionary is already stored in /data

Download the IDs of all EC pathways from KEGG.

In [1]:
import requests

# Define the URL for the KEGG REST API
url = 'http://rest.kegg.jp/list/pathway/ec'

# Send a GET request to the URL to retrieve the pathway IDs.
# Without a timeout the call could wait forever on an unresponsive server.
response = requests.get(url, timeout=60)
# Stop here on an HTTP error instead of parsing an error page as pathway IDs
response.raise_for_status()

# Parse the response to get the pathway IDs (first tab-separated field of each line)
all_pathway_ids = [
    line.split('\t')[0]
    for line in response.text.strip().split('\n')
    if line  # an empty response body would otherwise yield a bogus '' ID
]

Assign the related enzymes to each pathway (pathways without enzymes are filtered out)

In [None]:
import re
import json

# Map every KEGG pathway ID to the entries of its ENZYME section, then store
# the mapping as JSON.
ec_pathways = {}

# find the lines containing enzyme information (compiled once, reused for every pathway)
# NOTE(review): the pattern only matches when "COMPOUND" follows "ENZYME" in the
# entry text, so an entry with enzymes but no COMPOUND part is dropped as well -
# confirm this is intended.
enzyme_section = re.compile(r"ENZYME\s+(.*?)\s+COMPOUND", re.DOTALL)

for pathway_id in all_pathway_ids:
    pathway_url = f'http://rest.kegg.jp/get/{pathway_id}'
    # Without a timeout the call could wait forever on an unresponsive server
    pathway_response = requests.get(pathway_url, timeout=60)
    # Fail loudly on an HTTP error: otherwise the pathway would silently be
    # missing from the saved dictionary.
    pathway_response.raise_for_status()
    pathway_text = pathway_response.text.strip()

    enzyme_lines = enzyme_section.search(pathway_text)
    if enzyme_lines:
        # one enzyme per line, leading indentation removed
        enzymes = [s.lstrip() for s in enzyme_lines.group(1).strip().split('\n')]
        ec_pathways[pathway_id] = enzymes

with open("data/all_pathways_enzymes_dict.json", "w") as file:
    json.dump(ec_pathways, file)  # encode dict into JSON

## Loading the dictionary of enzymes (per pathway)

In [1]:
import json

# Read the stored JSON file back into all_pathways_enzymes_dict
with open("data/all_pathways_enzymes_dict.json", "r") as json_file:
    # Decode the whole file content in one go
    all_pathways_enzymes_dict = json.loads(json_file.read())

## Downloading the dictionary of compounds (per pathway)

This step can also be skipped, as the dictionary is already stored in /data

Assign the related compounds (metabolites) to each pathway

In [3]:
import re
import json

# Map every KEGG pathway ID to the entries of its COMPOUND section, then store
# the mapping as JSON.
mb_pathways = {}

# find the lines containing compound information (compiled once, reused for every pathway)
# NOTE(review): the pattern only matches when "REFERENCE" follows "COMPOUND" in
# the entry text, so an entry with compounds but no REFERENCE part is left out -
# confirm this is intended.
compound_section = re.compile(r"COMPOUND\s+(.*?)\s+REFERENCE", re.DOTALL)

for pathway_id in all_pathway_ids:
    pathway_url = f'http://rest.kegg.jp/get/{pathway_id}'
    # Without a timeout the call could wait forever on an unresponsive server
    pathway_response = requests.get(pathway_url, timeout=60)
    # Fail loudly on an HTTP error: otherwise the pathway would silently be
    # missing from the saved dictionary.
    pathway_response.raise_for_status()
    pathway_text = pathway_response.text.strip()

    compound_lines = compound_section.search(pathway_text)
    if compound_lines:
        # one compound per line, leading indentation removed
        compound = [s.lstrip() for s in compound_lines.group(1).strip().split('\n')]
        mb_pathways[pathway_id] = compound

with open("data/all_pathways_compounds_dict.json", "w") as file:
    json.dump(mb_pathways, file)  # encode dict into JSON

Associate each KEGG EC pathway with its name (requires the bioservices package: `python -m pip install bioservices`)

In [15]:
from bioservices import KEGG
import json

# initialize the KEGG API object and pathway name dictionary
kegg = KEGG()

pathway_names = {}

# Despite its name, ec_code holds a pathway ID: the keys of
# all_pathways_enzymes_dict are pathway IDs such as 'ec00010'.
for ec_code in all_pathways_enzymes_dict:
    # get the pathway information for the given pathway ID
    pathway_info = kegg.get(ec_code)
    # Anything but text means the lookup failed; stop with a clear message
    # instead of an obscure AttributeError on .split below.
    if not isinstance(pathway_info, str):
        raise RuntimeError(
            f"KEGG returned no text entry for {ec_code!r}: {pathway_info!r}"
        )
    # locate the NAME line instead of assuming it is always the second line
    name_line = next(
        (line for line in pathway_info.split('\n') if line.startswith('NAME')),
        None,
    )
    if name_line is None:
        raise ValueError(f"no NAME line found in the KEGG entry for {ec_code!r}")
    # drop the 'NAME' keyword and the surrounding whitespace
    pathway_name = name_line[len('NAME'):].strip()
    # add the new pathway name entry to the dict
    pathway_names[ec_code] = pathway_name

with open("data/ec_pathway_names_dict.json", "w") as file:
    json.dump(pathway_names, file)  # encode dict into JSON

{'ec00010': 'Glycolysis / Gluconeogenesis', 'ec00020': 'Citrate cycle (TCA cycle)', 'ec00030': 'Pentose phosphate pathway', 'ec00040': 'Pentose and glucuronate interconversions', 'ec00051': 'Fructose and mannose metabolism', 'ec00052': 'Galactose metabolism', 'ec00053': 'Ascorbate and aldarate metabolism', 'ec00500': 'Starch and sucrose metabolism', 'ec00520': 'Amino sugar and nucleotide sugar metabolism', 'ec00620': 'Pyruvate metabolism', 'ec00630': 'Glyoxylate and dicarboxylate metabolism', 'ec00640': 'Propanoate metabolism', 'ec00650': 'Butanoate metabolism', 'ec00660': 'C5-Branched dibasic acid metabolism', 'ec00562': 'Inositol phosphate metabolism', 'ec00190': 'Oxidative phosphorylation', 'ec00195': 'Photosynthesis', 'ec00710': 'Carbon fixation in photosynthetic organisms', 'ec00720': 'Carbon fixation pathways in prokaryotes', 'ec00680': 'Methane metabolism', 'ec00910': 'Nitrogen metabolism', 'ec00920': 'Sulfur metabolism', 'ec00061': 'Fatty acid biosynthesis', 'ec00062': 'Fatty a

## Loading the dictionary of compounds (per pathway)

In [16]:
import json

# Read the stored JSON file back into mb_pathways
with open("data/all_pathways_compounds_dict.json", "r") as json_file:
    # Decode the whole file content in one go
    mb_pathways = json.loads(json_file.read())