In [33]:
from lxml import html
import requests
import json
from rdkit import Chem
from rdkit.Chem import MACCSkeys
import csv

In [34]:
class KEGGCompound:
    """A single KEGG compound, identified by its KEGG compound ID.

    The compound's MOL file is downloaded lazily on first use and cached.
    The RDKit molecule parsed from it is cached as well, so repeated calls to
    getSMILES() / getMACCS() cost at most one HTTP request and one parse.

    Attributes:
        id: The KEGG compound ID that is appended to the MOL-file URL.
        pathways: Pathway objects this compound has been attached to via
            addPathway().
    """

    # Sentinel returned when RDKit cannot produce a result for this compound.
    UNHANDLED = "Unhandled"

    def __init__(self, id):
        self.id = id
        self.__molFileText = None
        self.__molFileFetched = False
        # Parsed RDKit molecule; a separate flag is needed because a failed
        # parse legitimately yields None and must not be retried every call.
        self.__mol = None
        self.__molParsed = False
        self.pathways = []

    #
    # Parse the MOL file once and remember the result (which may be None).
    #
    def __getMol(self):
        if not self.__molParsed:
            self.__mol = Chem.MolFromMolBlock(self.fetchMOLFileString())
            self.__molParsed = True
        return self.__mol

    #
    # Get the SMILES string associated with this compound.
    #
    def getSMILES(self):
        """Return the SMILES string for this compound.

        Returns:
            The SMILES string, or "Unhandled" when RDKit could not parse the
            MOL file or could not convert the molecule (the original code
            notes this happens for polymers).
        """
        mol = self.__getMol()
        if mol is None:
            return self.UNHANDLED
        try:
            return Chem.rdmolfiles.MolToSmiles(mol)
        except Exception:
            # Best-effort by design: report the compound as unhandled instead
            # of aborting the caller. Unlike a bare `except:`, this lets
            # KeyboardInterrupt / SystemExit propagate.
            return self.UNHANDLED

    #
    # Get the MACCS fingerprint associated with this compound.
    #
    def getMACCS(self):
        """Return the MACCS fingerprint of this compound as a bit string.

        Returns:
            The fingerprint bit string, or "Unhandled" when the MOL file
            could not be parsed (same sentinel as getSMILES()).
        """
        mol = self.__getMol()
        if mol is None:
            return self.UNHANDLED
        return MACCSkeys.FingerprintMol(mol).ToBitString()

    #
    # Download (once) and return the MOL file text for this compound.
    #
    def fetchMOLFileString(self):
        """Return the MOL file text for this compound, fetching it on first use.

        The response body is cached whatever it contains; the HTTP status is
        not checked here, so an error page simply fails to parse later and
        the compound is reported as "Unhandled".
        """
        if self.__molFileFetched:
            return self.__molFileText
        urlStr = "https://www.genome.jp/dbget-bin/www_bget?-f+m+compound+" + self.id
        # A timeout keeps one stalled connection from hanging the whole run.
        molFile = requests.get(urlStr, timeout=60)
        self.__molFileText = str(molFile.text)
        self.__molFileFetched = True
        return self.__molFileText

    #
    # Record that this compound belongs to the given pathway.
    #
    def addPathway(self, mod):
        """Append `mod` to this compound's list of pathways."""
        self.pathways.append(mod)

In [35]:
import requests

class KEGGPathway:
    """A KEGG pathway together with the compounds linked to it.

    Attributes:
        id: Pathway number as given by the caller (e.g. "00010"); it is
            inserted after "path:map" in the REST URL.
        name: Human-readable pathway name.
        className: Label of the pathway class this pathway belongs to.
        compounds: KEGGCompound objects linked to this pathway; empty until
            fetchCompounds() has been called.
    """

    def __init__(self, id, name, className):
        self.id = id
        self.name = name
        self.className = className
        self.compounds = []
        self.__compoundsFetched = False

    #
    # REST API URL to get compounds linked to this pathway
    #
    def getCompoundsURL(self):
        """Return the KEGG REST URL listing the compounds of this pathway.

        Surrounding whitespace in the ID is dropped: IDs scraped from the
        pathway page carry a trailing space, which would otherwise end up
        inside the URL.
        """
        pathwayID = str(self.id).strip()
        return f"https://rest.kegg.jp/link/compound/path:map{pathwayID}"

    #
    # Fetch a list of KEGGCompound objects associated with this pathway
    #
    def fetchCompounds(self):
        """Return the KEGGCompound objects linked to this pathway.

        The REST call is made only on the first invocation; later calls
        return the cached list.

        Raises:
            requests.HTTPError: if the REST call returns an error status.
        """
        if self.__compoundsFetched:
            return self.compounds

        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(self.getCompoundsURL(), timeout=60)
        response.raise_for_status()  # raise error if request fails

        # Each line looks like: "path:map00010\tcpd:C00022"
        retval = []
        for line in response.text.strip().splitlines():
            fields = line.split("\t")
            if len(fields) < 2:
                # Blank or malformed line: skip it rather than abort the
                # whole pathway with an IndexError.
                continue
            _prefix, separator, compoundID = fields[1].partition(":")
            compoundID = compoundID.strip()
            if not separator or not compoundID:
                continue
            retval.append(KEGGCompound(compoundID))

        self.compounds = retval
        self.__compoundsFetched = True
        return retval


In [36]:
def getPathways():
    """Scrape the KEGG pathway index page and return the selected pathways.

    Every <b> heading on the page whose first word is a class number starting
    with '1' is treated as a pathway class, except '1.', '1.0' and '1.12'
    (presumably the top-level heading and the overview/map classes -- TODO
    confirm). For each selected class, the <dl>/<dt> entries inside the first
    following sibling <div> are read as (pathway ID, pathway name) pairs.

    NOTE(review): the class label keeps only the second word of the heading
    (e.g. "Carbohydrate" from "1.1 Carbohydrate metabolism"), so headings
    that share that word get the same label -- confirm this is intended.

    Returns:
        A list of KEGGPathway objects, in page order.

    Raises:
        requests.HTTPError: if the index page cannot be fetched.
    """
    retval = []        # This will be a list of KEGGPathway objects.

    # Fail loudly on an HTTP error: otherwise an error page would parse to
    # zero pathways and the run would silently produce an empty result.
    page = requests.get("https://www.genome.jp/kegg/pathway.html", timeout=60)
    page.raise_for_status()
    tree = html.fromstring(page.content)

    for bNode in tree.xpath('//b'):
        headingTexts = bNode.xpath('text()')
        if not headingTexts:
            # <b> with no direct text (e.g. it only wraps another element).
            continue
        heading = headingTexts[0]
        headingWords = heading.split(' ')
        if len(headingWords) < 2:
            # Not of the form "<number> <name ...>".
            continue
        pathwayClassNum = headingWords[0]
        pathwayClassName = headingWords[1]

        # startswith() is safe on an empty string, unlike indexing [0].
        if not pathwayClassNum.startswith('1'):
            continue
        if pathwayClassNum in ('1.0', '1.', '1.12'):
            continue

        print("b node = " + heading)
        print("Pathway class number and name: " + pathwayClassNum + ", " + pathwayClassName)

        # We are interested in this pathway class.
        siblingDivs = bNode.xpath('following-sibling::div')
        if not siblingDivs:
            continue
        for pathwayIDNode in siblingDivs[0].xpath('dl/dt'):
            idTexts = pathwayIDNode.xpath('text()')
            nameTexts = pathwayIDNode.xpath('following-sibling::dd/a/text()')
            if not idTexts or not nameTexts:
                continue
            # The <dt> text carries trailing whitespace; strip it so the ID
            # is clean when used in URLs and written to the CSV.
            pathwayID = idTexts[0].strip()
            if not pathwayID:
                continue
            pathwayName = nameTexts[0]
            print("Pathway id and name: " + str(pathwayID) + ", " + str(pathwayName))
            retval.append(KEGGPathway(pathwayID, pathwayName, pathwayClassName))
    return(retval)

In [None]:
##################################################################
# MAIN
##################################################################

#
# Scrape the list of pathways of interest.
#
pathways = getPathways()
print("Number of pathways: " + str(len(pathways)))

#
# Write one CSV row per (pathway, compound) pair. A compound that belongs to
# several pathways therefore appears once for each of them.
#
# newline='' is what the csv module asks for when opening the output file;
# without it, extra blank rows appear on platforms that translate '\n'.
#
with open('keggdb2.csv', 'w', newline='') as csvfile:
    headers = ['ID', 'SMILES', 'MACCS', 'Pathway ID', 'Pathway name', 'Pathway class']
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()
    for mod in pathways:
        print(mod.className + ", " + mod.name + "," + mod.id)
        compounds = mod.fetchCompounds()
        for comp in compounds:
            comp.addPathway(mod)

            # Compute the SMILES once and reuse it for both the check and
            # the row, instead of converting the molecule twice.
            smiles = comp.getSMILES()
            if ("Unhandled" in smiles):
                # Skip this compound, as rdkit can't handle it.
                print("Skipping a compound because RDKit can't handle it.")
                continue

            writer.writerow({
                'ID': comp.id,
                'SMILES': smiles,
                'MACCS': comp.getMACCS(),
                'Pathway ID': mod.id,
                'Pathway name': mod.name,
                'Pathway class': mod.className,
            })


b node = 1.1 Carbohydrate metabolism
Pathway class number and name: 1.1, Carbohydrate
Pathway id and name: 00010 , Glycolysis / Gluconeogenesis
Pathway id and name: 00020 , Citrate cycle (TCA cycle)
Pathway id and name: 00030 , Pentose phosphate pathway
Pathway id and name: 00040 , Pentose and glucuronate interconversions
Pathway id and name: 00051 , Fructose and mannose metabolism
Pathway id and name: 00052 , Galactose metabolism
Pathway id and name: 00053 , Ascorbate and aldarate metabolism
Pathway id and name: 00500 , Starch and sucrose metabolism
Pathway id and name: 00620 , Pyruvate metabolism
Pathway id and name: 00630 , Glyoxylate and dicarboxylate metabolism
Pathway id and name: 00640 , Propanoate metabolism
Pathway id and name: 00650 , Butanoate metabolism
Pathway id and name: 00660 , C5-Branched dibasic acid metabolism
Pathway id and name: 00562 , Inositol phosphate metabolism
b node = 1.2 Energy metabolism
Pathway class number and name: 1.2, Energy
Pathway id and name: 00190