Modify this line to briefly discribe the functionality of pubmedpy.ipynb<br/><br/>Copyright (C) 2017  Martin Engqvist Lab<br/>This program is free software: you can redistribute it and/or modify<br/>it under the terms of the GNU General Public License as published by<br/>the Free Software Foundation, either version 3 of the License, or<br/>(at your option) any later version.<br/>This program is distributed in the hope that it will be useful,<br/>but WITHOUT ANY WARRANTY; without even the implied warranty of<br/>MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the<br/>GNU General Public License for more details.<br/>You should have received a copy of the GNU General Public License<br/>along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [7]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Load environmental variables from the project root directory ###
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

# now you can get the variables using their names

# Check whether a network drive has been specified
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE == 'None':
    pass
else:
    pass
    #mount network drive here

# set up directory paths
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path) # project root directory

DATA = join(PROJ, 'data') #data directory
RAW_EXTERNAL = join(DATA, 'raw_external') # external data raw directory
RAW_INTERNAL = join(DATA, 'raw_internal') # internal data raw directory
INTERMEDIATE = join(DATA, 'intermediate') # intermediate data directory
FINAL = join(DATA, 'final') # final data directory

RESULTS = join(PROJ, 'results') # output directory
FIGURES = join(RESULTS, 'figures') # figure output directory
PICTURES = join(RESULTS, 'pictures') # picture output directory


# make folders specific for certain data
folder_name = ''
if folder_name != '':
    #make folders if they don't exist
    if not exists(join(RAW_EXTERNAL, folder_name)):
        os.makedirs(join(RAW_EXTERNAL, folder_name))

    if not exists(join(INTERMEDIATE, folder_name)):
        os.makedirs(join(INTERMEDIATE, folder_name))

    if not exists(join(FINAL, folder_name)):
        os.makedirs(join(FINAL, folder_name))

print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [1]:
import json
from pubchempy import Compound, get_compounds
from rdkit import Chem
from rdkit.Chem import Descriptors

filepath = join(RAW_EXTERNAL,"BRENDA_data_2019_1","2019-04-02_substrate_cache.json")
with open(filepath, 'r') as f:
        data = json.loads(f.read())
        
def separate_nones(data):
    
    """Takes a dictionary as input and adds every entry giving a valid 'value' to a new dictionary, while appending all
       all entries that return "None" to a list. It then returns the SMILES dictionary (index 0) and the Nones list (index 1)"""
    
    print("Separating compounds without SMILES...")
    Nones = []
    Smiles = {}
    for key in data:
        if data[key] == None:
            Nones.append(key)
        else:
            Smiles[key] = data[key]
    print("Done.")
    return Smiles, Nones

def filter_uninteresting(nones): 
    
    """Filter out terms indicating that the "substrate" is not a molecule, or is a polymer."""
    
    unwanted_terms = ['protein', 'ase', 'factor', 'rna', 'dna', 'subunit', 'ribosom', 'receptor', 'active', 'cell', 'hormone']
    Uninteresting = [x for x in nones if any(word in x for word in unwanted_terms)]
    Still_interesting = []
    print("Filtering out uninteresting compounds...")
    for i in range(0,len(nones)):
        if nones[i] not in Uninteresting:
            Still_interesting.append(nones[i])
    print("Filtered out " + str(len(Uninteresting)) + " compounds. " + str(len(Still_interesting)) + " compounds of interest remain.")
    return Still_interesting    


def test_smiles(smiles):
    
    """Takes as input a dictionary of substrate names and their SMILES, converts to mol-format and returns a dictionary of substrate names and 
    and their respective molecular weight. Done with the intention of checking that rdkit can read the SMILES string. Outputs a list
    containing a dictionary with the compounds whose SMILES passed (index 0), and a list of compounds that did not. (index 1)"""
    
    print("Testing new SMILES...")
    PassedSmiles = {}
    InvalidSmiles = {}
    for key in smiles:
        try:
            mol = Chem.MolFromSmiles(smiles[key])
            PassedSmiles[key] = smiles[key]
        except:
            print("Invalid SMILES string.")
            InvalidSmiles[key] = smiles[key]
    print(str(len(PassedSmiles)) +" SMILES passed. " + str(len(RemainingNones)) + " unidentified substrates remain")    
    return PassedSmiles, InvalidSmiles

def test_remainingnones(remainingnones):
    """Takes the SMILES that failed to convert into mol-format in test_smiles and """

def smiles_from_pubchem(compound_names):
    
    """Takes a list of compound names and checks for matching entries in the PubMed database. If an entry exists, the 
    canonical (no isomery) SMILES key is extracted and added to a dictionary.  Compounds with no matches are added to a 
    list. This dictionary and this list are returned in index 0 and 1, respectively."""
    
    print("Fetching entries from PubChem...")
    NoSmiles = []
    NewSmiles = {}
    for i in range(0,len(compound_names)):
        x = get_compounds(compound_names[i],'name')    # Painfully slow.
        # Check that compound is listed on PubChem.
        if len(x) > 0:       
            for result in x: 
                # Fetch canonical smiles (ignore stereoisomers)
                NewSmiles[compound_names[i]] = result.canonical_smiles 
        else:
            NoSmiles.append(compound_names[i])
    print("Found " + str(len(NewSmiles)) + " new SMILES. Returning " + str(len(NoSmiles)) + " undetermined substrates.")
    return NewSmiles, NoSmiles
        
def salvage_and_test(incomplete_smiles_list):
    """Run the whole shaboom."""
    SORT = separate_nones(incomplete_smiles_list)
    SMILES = SORT[0]
    NONES = SORT[1]
  
    PUBCHEM = smiles_from_pubchem(NONES)
    NEW_SMILES = PUBCHEM[0]
    NONES = PUBCHEM[1]
    
    TESTED = test_smiles(NEW_SMILES,NONES)
    SMILES.update(TESTED[0])
    NONES = filter_uninteresting(TESTED[1])
    
    print("Number of SMILES: " + str(len(SMILES)))
    print("Number of untranslated compounds: " + str(len(NONES)))
    
    return SMILES, NONES
    
[Smiles, Nones] = salvage_and_test(data)




SyntaxError: invalid syntax (<ipython-input-1-940f47280e2b>, line 60)

In [2]:
# Found these by manual curation. 
Smiles["5'-dcmp"] = 'C1C(C(OC1N2C=CC(=NC2=O)N)COP(=O)(O)O)O'
Smiles["plugin glutamine"] = 'C(CC(=O)N)C(C(=O)O)N'
Smiles["5'-dimp"] = 'C1C(C(OC1N2C=NC3=C2N=CNC3=O)COP(=O)(O)O)O'
Smiles["linear maltohexaose"] = 'C(C1C(C(C(C(O1)OC2C(OC(C(C2O)O)OC3C(OC(C(C3O)O)OC4C(OC(C(C4O)O)OC5C(OC(C(C5O)O)OC6C(OC(C(C6O)O)O)CO)CO)CO)CO)CO)O)O)O)O'
Smiles["methionine (s)-sulfoxide"] = 'CS(=O)CCC(C(=O)O)N'
Smiles["tdp-3,4-didehydro-2,6-dideoxy-alpha-d-glucose"] = 'CC1C(=O)C(=O)CC(O1)OP(=O)([O-])OP(=O)([O-])OCC2C(CC(O2)N3C=C(C(=O)NC3=O)C)O'
Smiles["oyster glycogen"] = 'C(C1C(C(C(C(O1)OCC2C(C(C(C(O2)OC3C(OC(C(C3O)O)O)CO)O)O)OC4C(C(C(C(O4)CO)O)O)O)O)O)O)O'
Smiles["alpha-d-man-(1-2)-alpha-d-man-(1-2)-alpha-d-man-(1-3)-[alpha-d-man-(1-2)-alpha-d-man-(1-3)-[alpha-d-man-(1-2)-alpha-d-man-(1-6)]-alpha-d-man-(1-6)]-beta-d-man-(1-4)-beta-d-glcnac"] = 'CC(=O)NC1C(C(C(OC1O)CO)OC2C(C(C(C(O2)COC3C(C(C(C(O3)COC4C(C(C(C(O4)CO)O)O)OC5C(C(C(C(O5)CO)O)O)O)O)OC6C(C(C(C(O6)CO)O)O)OC7C(C(C(C(O7)CO)O)O)O)O)O)OC8C(C(C(C(O8)CO)O)O)OC9C(C(C(C(O9)CO)O)O)OC1C(C(C(C(O1)CO)O)O)O)O)O'
Smiles["beta-d-galactosyl-(1-&gt;4)-l-rhamnose"] = 'CC1C(C(C(C(O1)O)O)O)OC2C(C(C(C(O2)CO)O)O)O'
Smiles["cmp-alpha-n-acetylneuraminate"] = 'CC(=O)NC1C(CC(OC1C(C(CO)O)O)(C(=O)O)OP(=O)(O)OCC2C(C(C(O2)N3C=CC(=NC3=O)N)O)O)O'
Smiles["pyrogallate"] = 'COC1=C(C(=CC=C1)OC)O'
Smiles["nucleoside 5'-diphosphate"] = 'O[C@H]1[C@@H]([*])[C@H]([*])O[C@@H]1COP(O)(O)=O'
Smiles["n-formyl-l-met-leu-phe"] = 'CC(C)CC(C(=O)NC(CC1=CC=CC=C1)C(=O)O)NC(=O)C(CCSC)NC=O'
Smiles["hyaluronan"] = 'CC(=O)NC1C(C(C(OC1O)CO)O)OC2C(C(C(C(O2)C(=O)O)OC3C(C(C(C(O3)CO)O)OC4C(C(C(C(O4)C(=O)O)O)O)O)NC(=O)C)O)O.[Na+]'
Smiles["(+)-(s)-allyl-l-cysteine sulfoxide"] = 'C=CCS(=O)CC(C(=O)O)N'
Smiles["1,4-alpha-d-glucooligosaccharide"] = 'C1([R])(C(CO)OC(C(C1O)O)OC2(C(CO)OC(C(C2O)O)OC3(C(CO)OC(C(C3O)O)O[R])))'
Smiles["ala-d-glu"] = 'CC(C(=O)NC(CCC(=O)[O-])C(=O)[O-])[NH3+]'
Smiles["heparosan n-sulfate d-glucuronate"] = 'C(C1C(C(C(C(O1)O)NS(=O)(=O)[O-])O)OC2C(C(C(C(O2)C(=O)[O-])O)O)O)O'
Smiles["nucleoside 5'-triphosphate"] = 'C(OP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])C1(OC([a nucleobase])C([R])C(O)1)'
Smiles["3-hydroxy-4-trimethylaminobutyrate"] = 'C(C(O)CC(=O)[O-])[N+](C)(C)C'
Smiles["9-cis,11-trans-linoleic acid"] = 'CCCCCCC=CC=CCCCCCCCC(=O)O'
Smiles["ch2br-ch2cl"] = 'C(CBr)Cl'
Smiles["dynorphin a1-8"] = 'CCC(C)C(C(=O)O)NC(=O)C(CCCN=C(N)N)NC(=O)C(CCCN=C(N)N)NC(=O)C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)CNC(=O)CNC(=O)C(CC2=CC=C(C=C2)O)N'
Smiles["omega-n-(l-arginino)succinate"] = 'C(CC(C(=O)[O-])[NH3+])C[NH+]=C(N)NC(CC(=O)[O-])C(=O)[O-]'
Smiles["a 1-linoleoyl 2-acyl-sn-glycerol 3-phosphate"] = 'CCCCCC=CCC=CCCCCCCCC(=O)OCC(OC([R])=O)COP([O-])(=O)[O-]'
Smiles["5'-dump"] = 'C1C(C(OC1N2C=CC(=O)NC2=O)COP(=O)(O)O)O'

NameError: name 'Smiles' is not defined

In [14]:
import json
from rdkit import Chem
from rdkit.Chem import Descriptors

filepath = join(RAW_EXTERNAL,"BRENDA_data_2019_1","2019-04-02_substrate_cache.json")
with open(filepath, 'r') as f:
        data = json.loads(f.read())
        
def separate_nones(data):
    
    """Takes a dictionary as input and adds every entry giving a valid 'value' to a new dictionary, while appending all
       all entries that return "None" to a list. It then returns the SMILES dictionary (index 0) and the Nones list (index 1)"""
    
    print("Separating compounds without SMILES...")
    Nones = []
    Smiles = {}
    for key in data:
        if data[key] == None:
            Nones.append(key)
        else:
            Smiles[key] = data[key]
    print("Done.")
    return Smiles, Nones

def filter_uninteresting(nones): 
    
    """Filter out terms indicating that the "substrate" is not a molecule, or is a polymer."""
    
    unwanted_terms = ['protein', 'ase', 'factor', 'rna', 'dna', 'subunit', 'ribosom', 'receptor', 'active', 'cell', 'hormone']
    Uninteresting = [x for x in nones if any(word in x for word in unwanted_terms)]
    Still_interesting = []
    print("Filtering out uninteresting compounds...")
    for i in range(0,len(nones)):
        if nones[i] not in Uninteresting:
            Still_interesting.append(nones[i])
    print("Filtered out " + str(len(Uninteresting)) + " compounds. " + str(len(Still_interesting)) + " compounds of interest remain.")
    return Still_interesting    


def test_smiles(smiles):
    
    """Takes as input a dictionary of substrate names and their SMILES, converts to mol-format and returns a dictionary of substrate names and 
    and their respective molecular weight. Done with the intention of checking that rdkit can read the SMILES string. Outputs a list
    containing a dictionary with the compounds whose SMILES passed (index 0), and a list of compounds that did not. (index 1)"""
    
    print("Testing new SMILES...")
    PassedSmiles = {}
    InvalidSmiles = {}
    for key in smiles:
        try:
            mol = Chem.MolFromSmiles(smiles[key])
            PassedSmiles[key] = smiles[key]
        except:
            print("Invalid SMILES string.")
            InvalidSmiles[key] = smiles[key]
    print(str(len(PassedSmiles)) +" SMILES passed. " + str(len(InvalidSmiles)) + " unidentified substrates remain")    
    return PassedSmiles, InvalidSmiles
x=separate_nones(data)[0]
print(test_smiles(data)[1])


Separating compounds without SMILES...
Done.
Testing new SMILES...
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILE

Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMI

Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMI

Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMI

Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMI

Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMI

Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
Invalid SMILES string.
2403 SMILES passed. 3197 unidentified substrates remain
{'1,2-linked glucuronic acid of non-reducing xylose-oligosaccahrides': None, '(3e)-3-[(4s)-4-hydroxycyclohex-2-en-1-yl]-2-oxopropanoate': None, 'alpha-d-xyl-(1-&gt;6)-beta-d-glc-(1-&gt;4)-[alpha-d-xyl-(1-&gt;6)]-beta-d-glc-(1-&gt;4)-[alpha-d-xyl-(1-&gt;6)]-beta-d-glc-(1-&gt;4)-d-glc': None, '(3-hydroxy-phenylalkanoic acid