This notebook is for obtaining Pfam domains for all BRENDA sequences.<br/><br/>Copyright (C) 2020-2021  Martin Engqvist Lab<br/>This program is free software: you can redistribute it and/or modify<br/>it under the terms of the GNU General Public License as published by<br/>the Free Software Foundation, either version 3 of the License, or<br/>(at your option) any later version.<br/>This program is distributed in the hope that it will be useful,<br/>but WITHOUT ANY WARRANTY; without even the implied warranty of<br/>MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the<br/>GNU General Public License for more details.<br/>You should have received a copy of the GNU General Public License<br/>along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Environment and project paths ###
# find_dotenv() walks up the directory tree until it finds a .env file;
# load_dotenv() then exports the entries of that file as environment variables.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Optional network drive. Mounting is not implemented, so the value is only
# read here and nothing else happens, whatever it is set to.
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE != 'None':
    pass  # mount network drive here

# The project root is the folder that holds the .env file.
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path)

# data tree
DATA = join(PROJ, 'data')
RAW_EXTERNAL = join(DATA, 'raw_external')  # raw data from external sources
RAW_INTERNAL = join(DATA, 'raw_internal')  # raw data produced in-house
INTERMEDIATE = join(DATA, 'intermediate')  # intermediate processing results
FINAL = join(DATA, 'final')                # final data sets

# results tree
RESULTS = join(PROJ, 'results')
FIGURES = join(RESULTS, 'figures')
PICTURES = join(RESULTS, 'pictures')


# Notebook-specific working folders, created on first use.
folder_name = 'brenda_domains'
if folder_name != '':
    for base_dir in (RAW_EXTERNAL, INTERMEDIATE):
        subfolder = join(base_dir, 'BRENDA_for_paper', folder_name)
        if not exists(subfolder):
            os.makedirs(subfolder)

print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [2]:
import os
import subprocess
from os.path import join, exists, getsize
import multiprocessing
import pandas as pd
import numpy as np

from urllib.request import urlopen
from urllib.error import URLError, HTTPError
import time
import re

from tqdm import tqdm

# Overview
I want to analyze, for each EC class, which uncharacterized sequences have the same domains as characterized ones. For this I download a Pfam flatfile and parse it.

### Download Pfam pre-calculated domain data (Version 33.1)
This file is large, roughly 250 GB when unzipped.

In [3]:
# Source archive and its local destination.
url = 'ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam33.1/Pfam-A.full.uniprot.gz'
outfile = join(RAW_EXTERNAL, 'BRENDA_for_paper', 'Pfam-A.full.uniprot.gz')

# Path of the decompressed flatfile. Only the trailing '.gz' is removed;
# str.replace would also rewrite a '.gz' occurring elsewhere in the path.
unzipped_file = outfile[:-len('.gz')]

if not exists(unzipped_file):
    # download the compressed flatfile unless an archive is already on disk
    if not exists(outfile):
        # Argument lists avoid shell-quoting problems with unusual paths, and
        # check=True stops the notebook instead of carrying on after a
        # failed transfer.
        try:
            subprocess.run(['wget', '-O', outfile, url], check=True)
        except subprocess.CalledProcessError:
            # A failed wget may leave an empty or partial archive behind;
            # remove it so that the next run downloads again.
            if exists(outfile):
                os.remove(outfile)
            raise

    # unzip file
    subprocess.run(['gunzip', outfile], check=True)

### Get all uniprot identifiers from fasta files

In [4]:
def get_identifiers():
    '''
    Collect the sequence identifiers from the clustered FASTA files.

    Every file in the 'ec_identity_clustering' folder whose name ends with
    '90_augmented.fasta' is scanned. On each header line (starting with '>')
    the text between the leading '>' and the first ';' is taken as the
    identifier.

    Returns a dictionary that maps every identifier to an empty list.
    The number of identifiers is printed.
    '''
    fasta_dir = join(INTERMEDIATE, 'BRENDA_for_paper', 'ec_identity_clustering')
    fasta_names = [name for name in sorted(os.listdir(fasta_dir))
                   if name.endswith('90_augmented.fasta')]

    identifiers = {}
    for name in fasta_names:
        with open(join(fasta_dir, name), 'r') as handle:
            for row in handle:
                # sequence rows carry no identifier
                if not row.startswith('>'):
                    continue

                header_fields = row.lstrip('>').rstrip().split(';')
                identifiers[header_fields[0]] = []

    print('{} identifiers'.format(len(identifiers)))

    return identifiers


data = get_identifiers()

5072360 identifiers


### Go through Pfam file to find the domains

In [5]:
def get_doms(data, filepath):
    '''
    Look up Pfam domains for a set of identifiers in a pre-computed Pfam
    flatfile.

    The file is read line by line. A line starting with '#=GF AC' opens a
    record and supplies its accession, a line starting with '//' closes the
    record, all other lines starting with '#' are ignored, and every
    remaining line is treated as a sequence row whose identifier is the text
    before the first '.'.

    Parameters
    ----------
    data : dict
        Maps identifier -> list. Accessions are appended to these lists, so
        the dictionary is modified in place.
    filepath : str
        Path to the Pfam flatfile.

    Returns
    -------
    dict
        The same dictionary with every list sorted. An accession is repeated
        when the identifier occurs on several rows of the same record.

    Raises
    ------
    ValueError
        If two record starts follow each other, a record end appears without
        a start, or a row for a requested identifier appears outside a record.
    '''
    all_uid = set(data.keys())
    matched = set()  # identifiers for which at least one domain was found
    pfam = None      # accession of the record being read, None between records

    with open(filepath, 'r', encoding="latin-1") as f: #latin-1 to deal with a small number of byte codes in the file
        for line_number, line in enumerate(f, start=1):
            # find beginning of a pfam domain record
            if line.startswith('#=GF AC'):
                if pfam is not None: # two starts in a row, something went wrong
                    raise ValueError(
                        'line {}: new record starts before {} was closed'.format(line_number, pfam))
                pfam = line.split()[2].rstrip()

            elif line.startswith('#'): # skip other annotation fields
                continue

            elif line.startswith('//'): # an end before a start, something went wrong
                if pfam is None:
                    raise ValueError(
                        'line {}: record end without a preceding "#=GF AC" line'.format(line_number))
                pfam = None

            else:
                uid = line.split('.')[0]
                if uid in all_uid:
                    if pfam is None:
                        raise ValueError(
                            'line {}: sequence row for {} outside of a record'.format(line_number, uid))
                    data[uid].append(pfam)
                    matched.add(uid)

    # sort the domains
    for uid in data.keys():
        data[uid] = sorted(data[uid])

    # Report identifiers, not appended rows: one identifier can match many
    # rows, which previously made the first number exceed the second.
    print('found {} of {}'.format(len(matched), len(all_uid)))
    return data



def write_file(data, filepath):
    '''
    Save an identifier -> domain-list dictionary as tab-separated text.

    One line is written per identifier, in sorted identifier order: the
    identifier, a tab, and the domains joined by commas. Lines are separated
    by newlines and there is no trailing newline.
    '''
    # Build all rows before opening the file, so that a formatting error
    # cannot leave a truncated file behind.
    rows = ['{}\t{}'.format(uid, ','.join(data[uid])) for uid in sorted(data.keys())]
    text = '\n'.join(rows)

    with open(filepath, 'w') as handle:
        handle.write(text)


        
def load_file(filepath):
    '''
    Read an identifier -> domain-list dictionary from tab-separated text.

    Each line holds an identifier, optionally followed by a tab and a
    comma-separated domain list. The domains are sorted and any suffix after
    the first '.' is dropped (PF01676.19 becomes PF01676). Identifiers
    without a domain column map to an empty list. Blank lines are skipped so
    that they do not create an entry with an empty identifier.

    Returns the dictionary.
    '''
    data =  {}
    with open(filepath, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            uid = parts[0]

            # A blank line would otherwise be stored as identifier '' with no
            # domains and be counted as a sequence lacking a prediction.
            if uid == '':
                continue

            if len(parts) == 1:
                data[uid] = []
            else:
                domains = [s.split('.')[0] for s in sorted(parts[1].split(','))] # convert PF01676.19 to PF01676
                data[uid] = domains
    return data
        
        
# Build the identifier -> domains table only once and keep it on disk;
# later runs skip the parsing of the FASTA files and the Pfam flatfile.
outfile = join(INTERMEDIATE, 'BRENDA_for_paper', 'dom_temp_data.tsv')
if not exists(outfile):
    pfam_flatfile = join(RAW_EXTERNAL, 'BRENDA_for_paper', 'Pfam-A.full.uniprot')

    # identifiers from the fasta files, then their domains from the pfam file
    data = get_doms(get_identifiers(), pfam_flatfile)

    # save data
    write_file(data, outfile)


# Always continue from the copy on disk, then report how many identifiers
# have no domain at all.
data = load_file(outfile)
n_without_domain = sum(1 for domains in data.values() if domains == [])
print('{} sequences missing domain prediction'.format(n_without_domain))

172358 sequences missing domain prediction


### Query UniProt for missing domains, but only for the characterized sequences

In [6]:
def get_characterized():
    '''
    Collect the identifiers of characterized sequences, grouped by EC number.

    Two tab-separated files are read, skipping the first line of each:
    'ec_data_uid_orgs.tsv' (BRENDA; three columns unpacked as ec, org, uid)
    and 'SwissProt-2020_02-protein-evidence.tsv' (four columns unpacked as
    uid, ec, org, orgid).

    Returns a dictionary mapping EC number -> set of identifiers.
    '''
    brenda_file = join(INTERMEDIATE, 'BRENDA_for_paper', 'parsed_info', 'ec_data_uid_orgs.tsv')
    swissprot_file = join(RAW_EXTERNAL, 'SwissProt-2020_02-protein-evidence.tsv')

    characterized = {}

    # characterized identifiers from BRENDA
    with open(brenda_file, 'r') as handle:
        handle.readline()  # first line is not data
        for row in handle:
            ec, org, uid = row.rstrip().split('\t')
            characterized.setdefault(ec, set([])).add(uid)

    # characterized identifiers from SwissProt
    with open(swissprot_file, 'r') as handle:
        handle.readline()  # first line is not data
        for row in handle:
            uid, ec, org, orgid = row.strip().split('\t')
            characterized.setdefault(ec, set([])).add(uid)

    return characterized


def dlfile(folder, filename, url):
    '''
    Download a web page if the corresponding file does not exist.

    Parameters
    ----------
    folder, filename : str
        Joined to form the local path of the download.
    url : str
        Address to fetch when the local file is absent.

    Returns
    -------
    bool
        True if the file was already on disk or has been downloaded,
        False if an HTTPError or URLError occurred (the error is printed).
    '''
    out_path = join(folder, filename)
    if exists(out_path):
        return True

    try:
        # The context manager closes the connection. The payload is read
        # completely before the local file is created, so a transfer that
        # fails half-way cannot leave an empty file behind that later runs
        # would mistake for a finished download.
        with urlopen(url) as response:
            print("downloading " + url)
            payload = response.read()

    #handle errors
    except HTTPError as e:
        print("HTTP Error:", e.code, url)
        return False

    except URLError as e:
        print("URL Error:", e.reason, url)
        return False

    # Open local file for writing
    with open(out_path, "wb") as local_file:
        local_file.write(payload)

    # short pause after every real download (presumably to throttle requests)
    time.sleep(0.5)
    return True
        

def get_pfam_from_file(filepath):
    '''
    Extract Pfam-style identifiers from a text file.

    The whole file is searched for 'PF' followed by five digits or 'CL'
    followed by four digits. Returns the set of unique matches.
    '''
    with open(filepath, 'r') as handle:
        text = handle.read()

    # a set comprehension removes repeated identifiers
    hits = re.findall('(PF[0-9]{5}|CL[0-9]{4})', text)
    return {hit for hit in hits if hit != ''}
        
    
    
def get_pfam_for_uids(uids, filepath):
    '''
    Collect Pfam identifiers for protein identifiers from UniProt and
    UniParc web pages.

    For every identifier up to three pages are fetched into the folder
    ``filepath`` with dlfile: the UniProt entry, the UniParc search overview,
    and the UniParc entry that the overview links to. An identifier is left
    out of the result as soon as one download fails or the overview contains
    no entry link.

    Returns a DataFrame with the columns 'uid' and 'pfam', where 'pfam' is
    the comma-joined, sorted union of the identifiers found on the UniProt
    and the UniParc page.
    '''
    uniprot_base = 'https://www.uniprot.org/uniprot/'

    found_uids = []
    found_pfam = []

    for uid in tqdm(uids):
        uniprot_name = '%s_uniprot.html' % uid
        overview_name = '%s_uniparc_overview.html' % uid
        uniparc_name = '%s_uniparc.html' % uid

        # UniProt entry page
        if dlfile(folder=filepath, filename=uniprot_name, url=uniprot_base + uid) is False:
            continue
        from_uniprot = get_pfam_from_file(join(filepath, uniprot_name))

        # UniParc search overview for the same identifier
        overview_url = 'https://www.uniprot.org/uniparc/?query=%s' % uid
        if dlfile(folder=filepath, filename=overview_name, url=overview_url) is False:
            continue

        # the overview links to the UniParc entry under an alternate identifier
        with open(join(filepath, overview_name), 'r') as handle:
            overview_html = handle.read()

        link = re.search('class="entryID"><a href="/uniparc/([a-zA-Z0-9]+)">', overview_html)
        if link is None:
            continue

        # UniParc entry page
        entry_url = 'https://www.uniprot.org/uniparc/%s' % link.group(1)
        if dlfile(folder=filepath, filename=uniparc_name, url=entry_url) is False:
            continue
        from_uniparc = get_pfam_from_file(join(filepath, uniparc_name))

        found_uids.append(uid)
        found_pfam.append(','.join(sorted(list(from_uniprot.union(from_uniparc)))))

    return pd.DataFrame({'uid': found_uids, 'pfam': found_pfam})




# pool the characterized identifiers of all EC classes into a single set
characterized = get_characterized()
uids = set([]).union(*characterized.values())

# which characterized identifiers were not present in the Pfam file?
missing = uids.difference(data.keys())
print(len(missing))

# download the pages for these identifiers into their own folder
filepath = join(RAW_EXTERNAL, 'pfam')
if not exists(filepath):
    os.makedirs(filepath)

df = get_pfam_for_uids(sorted(missing), filepath)

  0%|          | 5/18032 [00:00<06:39, 45.14it/s]

18032


100%|██████████| 18032/18032 [02:11<00:00, 137.57it/s]


### Add the missing domains to the main data dictionary

In [7]:
# Fold the downloaded annotations into the main dictionary. An empty 'pfam'
# string means no identifier was found on the pages and is stored as an
# empty list.
data_missing = df.to_dict(orient='list')
for key, value in zip(data_missing['uid'], data_missing['pfam']):
    data[key] = sorted(value.split(',')) if value != '' else []

# how many do not have a domain?
n_without_domain = sum(1 for domains in data.values() if domains == [])
print('{} sequences missing domain prediction'.format(n_without_domain))

173295 sequences missing domain prediction


### Save the results

In [8]:
# Final table: one line per identifier with its comma-joined domains.
outfile = join(FINAL, 'brenda_2019_2', 'all_brenda_domain_data.tsv')

# This notebook does not create the target folder anywhere else; make sure it
# exists so that the save does not fail on a fresh checkout.
os.makedirs(os.path.dirname(outfile), exist_ok=True)
write_file(data, outfile)