<p style="text-align: center;" ><font size="7" >Data download</font></p>
<hr />

# Preparation

## Dependancies

In [10]:
import pandas as pd
from collections import defaultdict
import os
import shutil as sh
import urllib
import tarfile

from ipywidgets import interact, interactive, fixed, interact_manual, IntProgress
import ipywidgets as widgets
from IPython.display import display
import gzip

#Pandarallel works only on linux and mac
try:
    from pandarallel import pandarallel
    pandarallel.initialize(nb_workers=8,progress_bar=True)
    PARRALEL = True
except:
    PARRALEL = False

from tqdm.notebook import tnrange, tqdm
tqdm.pandas() #activate tqdm progressbar for pandas apply

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


## import folder configuration

You can change folders setting in the "00-SETUP.ipynb" notebook

In [11]:
%run "./00-SETUP.ipynb"
UPDATE = False

# Donwload files 

## Downloading the CATH domain list

In [12]:
domfile = 'cath-domain-list.txt'

url = "ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/all-releases/v4_2_0/cath-classification-data/cath-domain-list.txt"
destination = CATHFOLDER+domfile
if not os.path.isfile(destination) or UPDATE: 
    urllib.request.urlretrieve(url, destination)
    

colomnDomFile = [
    'Domain','Class','Architecture','Topology','Homologous','S35','S60','S95','S100','S100Count','DomSize','resolution',
]

## Download Correspondance between Uniprot and PDB code

In [13]:
url="ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/pdb_chain_uniprot.csv.gz"
destination = CATHFOLDER+"pdb_chain_uniprot.csv.gz" 
if not os.path.isfile(destination) or UPDATE: 
    urllib.request.urlretrieve(url, destination)
    
with gzip.open(destination, 'rb') as f:
    file = f.read()
    
    with open(destination.split('.gz')[0], 'wb') as output:
        output.write(file)
    #.write(destination.split('.gz')[0])

os.remove(destination)

## Download PROSITE files


In [14]:
url="ftp://ftp.expasy.org/databases/prosite/prosite_alignments.tar.gz"
destination = PROSITEFOLDER+"prosite_alignments.tar.gz" 

if UPDATE:
    if os.path.exists(PROSITEFOLDER+"msa"):
        import shutil
        shutil.rmtree(PROSITEFOLDER+"msa/")
    else:
        os.makedirs(PROSITEFOLDER+"msa/")

if not os.path.exists(PROSITEFOLDER+"msa") or UPDATE: 
    urllib.request.urlretrieve(url, destination)
    tf = tarfile.open(destination)
    tf.extractall(PROSITEFOLDER)
    tf.close()
    os.rename(PROSITEFOLDER+"prosite_alignments",PROSITEFOLDER+"msa")
    os.remove(destination)


    



## Download CATH PDB files

Reading cath domain list

In [15]:
cathDomains = pd.read_csv(CATHFOLDER+domfile,comment='#', sep=r"\s+", header=None)
cathDomains.columns = colomnDomFile
if PARRALEL:
    cathDomains['Superfamily'] = cathDomains.parallel_apply(lambda x: f"{x.Class}.{x.Architecture}.{x.Topology}.{x.Homologous}", axis=1)
else:
    cathDomains['Superfamily'] = cathDomains.progress_apply(lambda x: f"{x.Class}.{x.Architecture}.{x.Topology}.{x.Homologous}", axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=54358), Label(value='0 / 54358')))…

Creating the superfamily

In [16]:
cathSuperFamily = pd.DataFrame()
cathSuperFamily['Superfamily'] = cathDomains.Superfamily
cathSuperFamily['Domain'] = cathDomains.Domain

Creating a dictionary with the superfamily as key and list of cathdomain (pdb format) as value

In [17]:
cathDomainsPerSuperFamily = defaultdict(list)
#do not parralel this one
_ = cathSuperFamily.progress_apply(lambda x: cathDomainsPerSuperFamily[x.Superfamily].append(x.Domain), axis=1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=434857.0), HTML(value='')))




## Gather data for ALL domains

In [18]:
def download_dom(dom, folder):
    url = "http://www.cathdb.info/version/"+CATHVERSION+"/api/rest/id/"+dom+".pdb"
    destination = folder+dom+'.pdb'
    if not os.path.isfile(destination): 
        urllib.request.urlretrieve(url, destination)
    #progressbar.value += 1

def fetch_dom_for_superfamily(superfamily, cathDomainsPerSuperFamily, domName):
    print(f">Working with {domName} domain")
    global CATHFOLDER
    folder = CATHFOLDER+'domains/'+domName+'/raw/'
    if not os.path.exists(folder):
        os.makedirs(folder)
    if not os.path.exists(CATHFOLDER+'domains/'+domName+'/cleaned/'):
        os.makedirs(CATHFOLDER+'domains/'+domName+'/cleaned/')

    domlist = cathDomainsPerSuperFamily[superfamily]
    
    if PARRALEL:
        pd.Series(domlist).parallel_apply(lambda x: download_dom(x, folder))
    else:
        print(domlist)
        pd.Series(domlist).progress_apply(lambda x: download_dom(x, folder))


        
        
for superfamily,domain in SUPERFAMILY.items():
    fetch_dom_for_superfamily(superfamily, cathDomainsPerSuperFamily, domain)
    

>Working with PH domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=103), Label(value='0 / 103'))), HB…

>Working with C2 domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=44), Label(value='0 / 44'))), HBox…

>Working with C1 domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=16), Label(value='0 / 16'))), HBox…

>Working with PX domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9), Label(value='0 / 9'))), HBox(c…

>Working with FYVE domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=76), Label(value='0 / 76'))), HBox…

>Working with BAR domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=17), Label(value='0 / 17'))), HBox…

>Working with ENTH domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=17), Label(value='0 / 17'))), HBox…

>Working with SH2 domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=83), Label(value='0 / 83'))), HBox…

>Working with SEC14 domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7), Label(value='0 / 7'))), HBox(c…

>Working with START domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=56), Label(value='0 / 56'))), HBox…

>Working with C2DIS domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=162), Label(value='0 / 162'))), HB…

>Working with GLA domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=22), Label(value='0 / 22'))), HBox…

>Working with PLD domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=19), Label(value='0 / 19'))), HBox…

>Working with PLA domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=64), Label(value='0 / 64'))), HBox…

>Working with ANEXIN domain


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=51), Label(value='0 / 51'))), HBox…

**Now the next notebook can be runed** 😀