This notebook is extracts all uniprot identifiers from the 1.1.3.15 BRENDA html page.<br/><br/>Copyright (C) 2017  Martin Engqvist Lab<br/>This program is free software: you can redistribute it and/or modify<br/>it under the terms of the GNU General Public License as published by<br/>the Free Software Foundation, either version 3 of the License, or<br/>(at your option) any later version.<br/>This program is distributed in the hope that it will be useful,<br/>but WITHOUT ANY WARRANTY; without even the implied warranty of<br/>MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the<br/>GNU General Public License for more details.<br/>You should have received a copy of the GNU General Public License<br/>along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Load environmental variables from the project root directory ###
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

# now you can get the variables using their names

# Check whether a network drive has been specified
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE == 'None':
    pass
else:
    pass
    #mount network drive here

# set up directory paths
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path) # project root directory

DATA = join(PROJ, 'data') #data directory
RAW_EXTERNAL = join(DATA, 'raw_external') # external data raw directory
RAW_INTERNAL = join(DATA, 'raw_internal') # internal data raw directory
INTERMEDIATE = join(DATA, 'intermediate') # intermediate data directory
FINAL = join(DATA, 'final') # final data directory

RESULTS = join(PROJ, 'results') # output directory
FIGURES = join(RESULTS, 'figures') # figure output directory
PICTURES = join(RESULTS, 'pictures') # picture output directory


# make folders specific for certain data
folder_name = 'similarity_data'
if folder_name != '':
#     #make folders if they don't exist
#     if not exists(join(RAW_EXTERNAL, folder_name)):
#         os.makedirs(join(RAW_EXTERNAL, folder_name))

#     if not exists(join(INTERMEDIATE, folder_name)):
#         os.makedirs(join(INTERMEDIATE, folder_name))

    if not exists(join(FINAL, folder_name)):
        os.makedirs(join(FINAL, folder_name))

print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [2]:
import re 

def get_all_uniprot_id():
    '''Use regex to get all the uniprot identifiers. Intended as an alternate method that does not depend on parsing the html.'''

    #read the html page
    filepath = join(RAW_EXTERNAL, 'BRENDA', '1.1.3.15.html')
    with open(filepath, 'r') as f:
        document = f.read()

    #http://www.uniprot.org/help/accession_numbers
    m = re.findall('([OPQ][0-9](?:[A-Z0-9]){3}[0-9])|([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})', document)

    uids = set([])
    for result in m:
        for uid in result:
            if uid == '':
                continue
                
            uids.add(uid)
    return list(uids)
    

data = get_all_uniprot_id()
    
data

['P05414',
 'K2V617',
 'Q10CE4',
 'Q56ZN0',
 'Q19U05',
 'A9QH69',
 'A9QH71',
 'Q5SM78',
 'Q24JJ8',
 'Q9NYQ3',
 'Q07523',
 'Q9LRR9',
 'Q9NYQ2',
 'B7FUG8',
 'Q9UJM8',
 'Q9WU19',
 'Q9LJH5',
 'Q9CG58']