Modify this line to briefly discribe the functionality of pubmedpy.ipynb<br/><br/>Copyright (C) 2017  Martin Engqvist Lab<br/>This program is free software: you can redistribute it and/or modify<br/>it under the terms of the GNU General Public License as published by<br/>the Free Software Foundation, either version 3 of the License, or<br/>(at your option) any later version.<br/>This program is distributed in the hope that it will be useful,<br/>but WITHOUT ANY WARRANTY; without even the implied warranty of<br/>MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the<br/>GNU General Public License for more details.<br/>You should have received a copy of the GNU General Public License<br/>along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [244]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Load environmental variables from the project root directory ###
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

# now you can get the variables using their names

# Check whether a network drive has been specified
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE == 'None':
    pass
else:
    pass
    #mount network drive here

# set up directory paths
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path) # project root directory

DATA = join(PROJ, 'data') #data directory
RAW_EXTERNAL = join(DATA, 'raw_external') # external data raw directory
RAW_INTERNAL = join(DATA, 'raw_internal') # internal data raw directory
INTERMEDIATE = join(DATA, 'intermediate') # intermediate data directory
FINAL = join(DATA, 'final') # final data directory

RESULTS = join(PROJ, 'results') # output directory
FIGURES = join(RESULTS, 'figures') # figure output directory
PICTURES = join(RESULTS, 'pictures') # picture output directory


# make folders specific for certain data
folder_name = ''
if folder_name != '':
    #make folders if they don't exist
    if not exists(join(RAW_EXTERNAL, folder_name)):
        os.makedirs(join(RAW_EXTERNAL, folder_name))

    if not exists(join(INTERMEDIATE, folder_name)):
        os.makedirs(join(INTERMEDIATE, folder_name))

    if not exists(join(FINAL, folder_name)):
        os.makedirs(join(FINAL, folder_name))

print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [255]:
import json
from pubchempy import Compound
from rdkit import Chem
from rdkit.Chem import Descriptors

filepath = join(RAW_EXTERNAL,"BRENDA_data_2019_1","2019-04-02_substrate_cache.json")
with open(filepath, 'r') as f:
        data = json.loads(f.read())
        
def separate_nones(data):
    """Takes a dictionary as input and adds every entry giving a valid 'value' to a new dictionary, while appending all
       all entries that return "None" to a list. It then returns the SMILES dictionary (index 0) and the Nones list (index 1)"""
    print("Separating compounds without SMILES...")
    Nones = []
    Smiles = {}
    for key in data:
        if data[key] == None:
            Nones.append(key)
        else:
            Smiles[key] = data[key]
    print("Done.")
    return Smiles, Nones

def filter_uninteresting(nones): 
    """Filter out terms indicating that the "substrate" is not a molecule, or is a polymer."""
    unwanted_terms = ['protein', 'ase', 'factor', 'rna', 'dna', 'subunit', 'ribosom', 'receptor', 'active', 'cell', 'hormone']
    Uninteresting = [x for x in nones if any(word in x for word in unwanted_terms)]
    Still_interesting = []
    print("Filtering out uninteresting compounds...")
    for i in range(0,len(nones)):
        if nones[i] not in Uninteresting:
            Still_interesting.append(nones[i])
    print("Filtered out " + str(len(Uninteresting)) + " compounds. " + str(len(Still_interesting)) + " compounds of interest remain.")
    return Still_interesting    


def test_smiles(smiles,nones):
    """Takes as input a dictionary of substrate names and their SMILES, converts to mol-format and returns a dictionary of substrate names and 
    and their respective molecular weight. Done with the intention of checking that rdkit can read the SMILES string. Outputs a list
    containing a dictionary with the compounds whose SMILES passed (index 0), and a list of compounds that did not. (index 1)"""
    print("Testing new SMILES...")
    PassedSmiles = {}
    RemainingNones = nones
    for key in smiles:
        try:
            mol = Chem.MolFromSmiles(smiles[key])
            PassedSmiles[key] = smiles[key]
        except:
            print("Invalid SMILES string.")
            RemainingNones.append(key)
    print(str(len(PassedSmiles)) +" SMILES passed. " + str(len(RemainingNones)) + " unidentified substrates remain")    
    return PassedSmiles, RemainingNones

def smiles_from_pubchem(compound_names):
    """Takes a list of compound names and checks for matching entries in the PubMed database. If an entry exists, the 
    canonical (no isomery) SMILES key is extracted and added to a dictionary.  Compounds with no matches are added to a 
    list. This dictionary and this list are returned in index 0 and 1, respectively."""
    print("Fetching entries from PubChem...")
    NoSmiles = []
    NewSmiles = {}
    for i in range(0,len(compound_names)):
        x = get_compounds(compound_names[i],'name')    # Painfully slow.
        # Check that compound is listed on PubChem.
        if len(x) > 0:       
            for result in x: 
                NewSmiles[compound_names[i]] = result.canonical_smiles 
        else:
            NoSmiles.append(compound_names[i])
    print("Found " + str(len(NewSmiles)) + " new SMILES. Returning " + str(len(NoSmiles)) + " undetermined substrates.")
    return NewSmiles, NoSmiles
        
def salvage_and_test(incomplete_smiles_list):
    """Run the whole shaboom."""
    SMILES = separate_nones(incomplete_smiles_list)[0]
    NONES = separate_nones(incomplete_smiles_list)[1]
  
    PUBCHEM = smiles_from_pubchem(NONES)
    NEW_SMILES = PUBCHEM[0]
    NONES = PUBCHEM[1]
    
    TESTED = test_smiles(NEW_SMILES,NONES)
    SMILES.update(TESTED[0])
    NONES = filter_uninteresting(TESTED[1])
    
    print("Number of SMILES: " + str(len(SMILES)))
    print("Number of untranslated compounds: " + str(len(NONES)))
    
    return SMILES, NONES
    
[Smiles, Nones] = salvage_and_test(data)

# Found these by manual curation. 
Smiles["5'-dcmp"] = 'C1C(C(OC1N2C=CC(=NC2=O)N)COP(=O)(O)O)O'
Smiles["Plugin glutamine"] = 'C(CC(=O)N)C(C(=O)O)N'
Smiles["5'-dimp"] = 'C1C(C(OC1N2C=NC3=C2N=CNC3=O)COP(=O)(O)O)O'


Separating compounds without SMILES...
Done.
Separating compounds without SMILES...
Done.
Fetching entries from PubChem...
Found 436 new SMILES. Returning 2761 undetermined substrates.
Testing new SMILES...
436 SMILES passed.2761 unidentified substrates remain
Filtering out uninteresting compounds...
Filtered out 951 compounds. 1810 compounds of interest remain.
Number of SMILES: 2839
Number of untranslated compounds: 1810


In [265]:
print(Nones)


['1,2-linked glucuronic acid of non-reducing xylose-oligosaccahrides', '(3e)-3-[(4s)-4-hydroxycyclohex-2-en-1-yl]-2-oxopropanoate', 'alpha-d-xyl-(1-&gt;6)-beta-d-glc-(1-&gt;4)-[alpha-d-xyl-(1-&gt;6)]-beta-d-glc-(1-&gt;4)-[alpha-d-xyl-(1-&gt;6)]-beta-d-glc-(1-&gt;4)-d-glc', '(3-hydroxy-phenylalkanoic acid)n', 'neuropeptides', 'ompx', 'histone h3 n6-dimethyl-l-lysine4', 'beta-d-galactosyl-(1-&gt;4)-l-rhamnose', 'gum arabic', 'pyrogallate', "nucleoside 5'-diphosphate", 'alpha-chitin', 'cmp-alpha-n-acetylneuraminate', 'pro-mcasp-7', 'tdp-3,4-didehydro-2,6-dideoxy-alpha-d-glucose', 'orosomucoid', 'plugin glutamine', 'sidekick-1', 'oyster glycogen', 'oxidized rubredoxin', '(e/z)-isocapronaldoxime', 'n-formyl-l-met-leu-phe', 'glcman9glcnac', 'hyaluronan', 'flab2', 'lamp1', "5'-dcmp", 'phosphorylated-erbb-2', '(+)-(s)-allyl-l-cysteine sulfoxide', 'poly-sumo-2', 'fibronectin', 'dimethylated histone 3 lysine 9', 'enamelin', '1,4-alpha-d-glucooligosaccharide', 'man6-glycopeptide', '7-ethyl-10[4-(