This notebook is for downloading up-to-date protein domain information for the 1.1.3.15 sequences.<br/><br/>Copyright (C) 2019  Martin Engqvist Lab<br/>This program is free software: you can redistribute it and/or modify<br/>it under the terms of the GNU General Public License as published by<br/>the Free Software Foundation, either version 3 of the License, or<br/>(at your option) any later version.<br/>This program is distributed in the hope that it will be useful,<br/>but WITHOUT ANY WARRANTY; without even the implied warranty of<br/>MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the<br/>GNU General Public License for more details.<br/>You should have received a copy of the GNU General Public License<br/>along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Environment variables and project directory layout ###
# Locate the project's .env file (the search walks up the directory tree until
# one is found) and export its entries as environment variables.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Optional network drive location read from the environment.
# Presumably the literal string 'None' marks "not configured" -- TODO confirm.
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE != 'None':
    pass  # placeholder: mount network drive here

# Directory paths, all derived from the folder that contains the .env file.
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path)  # project root directory

# Data directories: external raw, internal raw, intermediate and final.
DATA = join(PROJ, 'data')
RAW_EXTERNAL, RAW_INTERNAL, INTERMEDIATE, FINAL = (
    join(DATA, sub)
    for sub in ('raw_external', 'raw_internal', 'intermediate', 'final')
)

# Output directories: figures and pictures live under results.
RESULTS = join(PROJ, 'results')
FIGURES, PICTURES = (join(RESULTS, sub) for sub in ('figures', 'pictures'))

print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [2]:
import pandas as pd
import re
from urllib.request import urlopen
from urllib.error import URLError, HTTPError
import time

#### First define the download helper and load the identifiers whose UniProt and UniParc pages will be fetched

In [5]:

def dlfile(folder, filename, url, delay=1):
    '''
    Download a web page if the corresponding file does not exist.

    Parameters
    ----------
    folder : str
        Directory in which the downloaded page is stored.
    filename : str
        Name of the local file inside ``folder``.
    url : str
        Address to fetch.
    delay : float, optional
        Seconds to pause after a successful download (default 1).

    Returns
    -------
    None
        HTTP and URL errors are printed rather than raised; in that case
        no local file is created.
    '''
    out_path = join(folder, filename)

    # Nothing to do when the page was already saved by an earlier run
    if exists(out_path):
        return

    # Open the url
    try:
        # The context manager closes the connection even if reading fails.
        with urlopen(url) as response:
            print("downloading " + url)
            # Read everything before touching the disk: a failed read must not
            # leave an empty file behind that later runs would mistake for a
            # completed download.
            content = response.read()

    #handle errors
    except HTTPError as e:
        print("HTTP Error:", e.code, url)
        return

    except URLError as e:
        print("URL Error:", e.reason, url)
        return

    # Open local file for writing
    with open(out_path, "wb") as local_file:
        local_file.write(content)

    # pause between consecutive downloads
    time.sleep(delay)



# load a list of the identifiers I want
# load a list of the identifiers I want
filepath = join(FINAL, 'brenda_2017_1', 'ec_uid_org_from_fasta_2017_1.tsv')
uid_ec = pd.read_csv(filepath, sep='\t').drop_duplicates()

# only keep 1.1.3.15
data_subset = uid_ec[uid_ec['ec']=='1.1.3.15']

display(data_subset.head())

# NOTE(review): drop_duplicates() above compares whole rows, so the same uid
# can still occur more than once here -- confirm whether that is intended.
uids = data_subset['uid'].values
display(len(uids))

# folder that will hold the downloaded UniProt and UniParc pages
# (`filepath` is rebound here from the TSV path to this folder)
filepath = join(RAW_EXTERNAL, 'pfam')

# makedirs also creates missing parent folders and, with exist_ok=True,
# does nothing when the folder is already there
os.makedirs(filepath, exist_ok=True)
    

    

Unnamed: 0,uid,ec
34,A0A0U5F9V4,1.1.3.15
35,S6DC87,1.1.3.15
36,D4XA33,1.1.3.15
37,D4XA32,1.1.3.15
38,D4XIR1,1.1.3.15


1411

#### Now download the pages and look for Pfam identifiers in them using regex

In [6]:
import re


def get_pfam_from_file(filepath):
    '''
    Parse out pfam data from a file

    Parameters
    ----------
    filepath : str
        Path of a saved page to scan.

    Returns
    -------
    set of str
        Unique substrings of the form ``PF`` + five digits or
        ``CL`` + four digits found in the file; empty if there are none.

    Raises
    ------
    FileNotFoundError
        If ``filepath`` does not exist.
    '''
    # Decode explicitly instead of relying on the platform default encoding.
    # The identifiers are plain ASCII, so replacing undecodable bytes cannot
    # change the result, but it prevents a UnicodeDecodeError on stray bytes.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        document = f.read()

    # search through the document and keep unique identifiers
    return set(re.findall('(PF[0-9]{5}|CL[0-9]{4})', document))
        
    
    

data = {'uid':[], 'pfam':[]}

# the base address is the same for every identifier
uniprot_url = 'https://www.uniprot.org/uniprot/'


def _pfam_ids_if_present(path):
    '''Return the identifiers found in `path`, or an empty set if the file is missing.'''
    # dlfile only prints on download errors, so the file may not exist
    if not exists(path):
        print('missing file, no identifiers collected:', path)
        return set()
    return get_pfam_from_file(path)


for uid in uids:

    # download uniprot file
    dlfile(folder=filepath, filename='%s_uniprot.html' % uid, url=uniprot_url+uid)

    # query uniprot file
    pfam_ids = _pfam_ids_if_present(join(filepath, '%s_uniprot.html' % uid))

    # download uniparc overview file
    uniparc_url = 'https://www.uniprot.org/uniparc/?query=%s' % uid
    overview_filename = join(filepath, '%s_uniparc_overview.html' % uid)
    dlfile(folder=filepath, filename='%s_uniparc_overview.html' % uid, url=uniparc_url)

    # find the reference for the alternate identifier and download that file
    if exists(overview_filename):
        with open(overview_filename, 'r', encoding='utf-8', errors='replace') as f:
            document = f.read()

        m = re.search('class="entryID"><a href="/uniparc/([a-zA-Z0-9]+)">', document)
        if m is None:
            # the overview page does not link to any entry: nothing to fetch
            print('no UniParc entry link found for', uid)
        else:
            new_target_url = 'https://www.uniprot.org/uniparc/%s' % m.group(1)
            dlfile(folder=filepath, filename='%s_uniparc.html' % uid, url=new_target_url)
    else:
        print('missing file, UniParc entry not looked up:', overview_filename)

    # query uniparc file
    pfam_ids2 = _pfam_ids_if_present(join(filepath, '%s_uniparc.html' % uid))

    # one row per identifier: sorted, comma-separated union of both sources
    data['uid'].append(uid)
    data['pfam'].append(','.join(sorted(pfam_ids.union(pfam_ids2))))

data_frame = pd.DataFrame(data)


In [8]:
# Save the uid/pfam table as a tab-separated file, leaving out the row index.
pfam_tsv_path = join(FINAL, 'brenda_2017_1', 'pfam_info_2017_1.tsv')
data_frame.to_csv(pfam_tsv_path, sep='\t', index=False)