In [1]:
import pandas as pd
import numpy as np
from os.path import join
import os
import urllib.parse
import urllib.request
from Bio.UniProt.GOA import _gpa11iterator
import gzip
import pickle

import warnings
warnings.filterwarnings('ignore')

## 1. Extracting UniProt IDs with transporter GO Terms:

In [2]:
# Read the pickled GO-term table and keep every distinct value of its "GO ID" column.
go_terms_path = join("..", "..", "data", "GOA", "go_terms", "df_GO_with_substrates.pkl")
df = pd.read_pickle(go_terms_path)
transporter_go_terms = list({*df["GO ID"]})

The file "goa_uniprot_all.gpa.gz" must be downloaded from http://current.geneontology.org/annotations/index.html and stored in the folder "data/GOA/".


In [3]:
# Path of the gzipped GPA annotation file, relative to this notebook.
goa_dir = join("..", "..", "data", "GOA")
filename = join(goa_dir, "goa_uniprot_all.gpa.gz")

In [None]:
# Index of the one-million-annotation window of the GPA file handled by this cell.
# The result is stored as part file number `run`.
run = 0

window_start = run * 10**6
window_end = (run + 1) * 10**6

# Set membership is O(1); testing against the list would scan it for every annotation.
transporter_go_term_set = set(transporter_go_terms)

# Matching annotations are collected in a plain list and converted to a DataFrame once.
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
records = []
overall_count = 0

with gzip.open(filename, 'rt') as fp:
    for annotation in _gpa11iterator(fp):
        overall_count += 1
        if overall_count >= window_end:
            # Nothing beyond this window is used, so stop reading the file here.
            break
        if overall_count < window_start:
            continue
        GO_ID = annotation['GO_ID']
        if GO_ID in transporter_go_term_set:
            records.append({"Uniprot ID": annotation['DB_Object_ID'],
                            "GO Term": GO_ID,
                            'ECO_Evidence_code': annotation["ECO_Evidence_code"]})

# Passing `columns` keeps the three expected columns even when nothing matched.
df_GO_UID = pd.DataFrame(records, columns=["Uniprot ID", "GO Term", 'ECO_Evidence_code'])

df_GO_UID.to_pickle(join("..", "..", "data", "GOA", "GO_UID_mapping", "df_GO_UID_part_" + str(run) +".pkl"))

In [None]:
# Combine the per-window part files into one table.
part_dir = join("..", "..", "data", "GOA", "GO_UID_mapping")

# Start with an empty frame so the result has the expected columns even if no part loads.
parts = [pd.DataFrame(columns = ["Uniprot ID", "GO Term", 'ECO_Evidence_code'])]

for run in range(923):
    part_path = join(part_dir, "df_GO_UID_part_" + str(run) +".pkl")
    try:
        # NOTE: unpickling can execute arbitrary code; only load part files written by this notebook.
        with open(part_path, "rb") as fh:
            parts.append(pickle.load(fh))
    except (OSError, pickle.UnpicklingError, EOFError):
        # Best effort: report the index of a missing or unreadable part and carry on.
        print(run)

# A single concat at the end avoids re-copying the growing frame for every part.
df_GO_UID = pd.concat(parts, ignore_index=True)

df_GO_UID.to_pickle(join("..", "..", "data", "GOA", "df_GO_UID.pkl"))

In [5]:
# Reload the combined table from disk and show its number of rows.
go_uid_path = join("..", "..", "data", "GOA", "df_GO_UID.pkl")
df_GO_UID = pd.read_pickle(go_uid_path)
df_GO_UID.shape[0]

44681373

## 2. Mapping the ECO codes to 16 different evidence categories:

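Every entry in the GOA database has an ECO evidence code. Below, each ECO code is mapped to its evidence-code label (e.g. EXP, IDA, IEA) using the ontology file "eco.obo" and the mapping table "ECO_to_GAF.tsv".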

In [6]:
filename_eco_obo = join("..", "..", "data", "GOA", 'eco.obo')


def _parse_eco_terms(lines):
    """Collect the ID and direct parents of every ``[Term]`` stanza.

    Parameters
    ----------
    lines : iterable of str
        Lines of an OBO file.

    Returns
    -------
    list of dict
        One ``{"ECO ID": str, "parents": list of str}`` record per ``[Term]``
        stanza, in file order. "parents" holds the IDs from the stanza's
        ``is_a:`` lines.

    Notes
    -----
    A stanza ends at the next ``[...]`` header or at the end of the input, so
    the last term is kept and lines of non-Term stanzas are ignored.
    """
    records = []
    in_term = False
    term_id = None
    parents = []
    for raw_line in lines:
        line = raw_line.strip()
        if line.startswith("["):
            # A new stanza header closes the stanza that was being read.
            if in_term and term_id is not None:
                records.append({"ECO ID": term_id, "parents": parents})
            in_term = line == "[Term]"
            term_id, parents = None, []
        elif in_term:
            if line.startswith("id:"):
                term_id = line[len("id:"):].strip()
            elif line.startswith("is_a:"):
                # Strip the trailing "! name" comment and any "{...}" modifier block.
                parent = line[len("is_a:"):].split("!")[0].split("{")[0].strip()
                parents.append(parent)
    if in_term and term_id is not None:
        records.append({"ECO ID": term_id, "parents": parents})
    return records


# The context manager closes the file even if parsing fails.
with open(filename_eco_obo, 'r') as obo_file:
    df = pd.DataFrame(_parse_eco_terms(obo_file), columns = ["ECO ID", "parents"])

# Placeholder column; it is filled for the ECO IDs that appear in the mapping table.
df["Evidence codes"] = ""

ECO_to_GAF = pd.read_csv(join("..", "..", "data", "GOA", 'ECO_to_GAF.tsv'), sep = "\t")
ECO_to_GAF

Unnamed: 0,ECO,Evidence,code default
0,ECO:0000278,EXP,
1,ECO:0000288,EXP,
2,ECO:0001043,EXP,
3,ECO:0001146,EXP,
4,ECO:0001147,EXP,
...,...,...,...
1182,ECO:0007659,RCA,
1183,ECO:0007666,RCA,
1184,ECO:0000245,RCA,Default
1185,ECO:0006017,TAS,


In [7]:
# Map every ECO ID to the first evidence code listed for it in ECO_to_GAF.
# One pass over the mapping table replaces a full table scan per ECO term.
first_code_by_eco = {}
for eco_id, code in zip(ECO_to_GAF["ECO"], ECO_to_GAF["Evidence"]):
    first_code_by_eco.setdefault(eco_id, code)

# Assign the whole column at once; chained `df[col][ind] = ...` assignment is
# unreliable in pandas. Matched terms get a one-element list, the rest stay "".
df["Evidence codes"] = [[first_code_by_eco[eco_id]] if eco_id in first_code_by_eco else ""
                        for eco_id in df["ECO ID"]]

# .copy() makes df_label an independent frame instead of a slice of df.
df_label = df.loc[df["Evidence codes"] != ""].copy()

df_label.to_pickle(join("..", "..", "data", "GOA", "df_ECO_label.pkl"))

In [8]:
# Turn the one-element lists into plain codes, drop the parent list and rename the key
# column so that it matches df_GO_UID. The isinstance guard and errors="ignore" make the
# cell safe to re-run: without them a second run would cut "IEA" down to "I".
df_label = (
    df_label
    .assign(**{"Evidence codes": [code[0] if isinstance(code, list) else code
                                  for code in df_label["Evidence codes"]]})
    .drop(columns = ["parents"], errors = "ignore")
    .rename(columns = {"ECO ID" : "ECO_Evidence_code"})
)

In [9]:
# Attach the evidence label to every annotation via its ECO code.
# A stale "evidence" column from an earlier run of this cell is dropped first, so
# re-running does not create a second column with the same name.
df_GO_UID = (
    df_GO_UID
    .drop(columns = ["evidence"], errors = "ignore")
    .merge(df_label, how = "left", on = "ECO_Evidence_code")
    .rename(columns = {"Evidence codes" : "evidence"})
)
df_GO_UID

Unnamed: 0,Uniprot ID,GO Term,ECO_Evidence_code,evidence
0,A0A001,GO:0042626,ECO:0000256,IEA
1,A0A001,GO:0055085,ECO:0000256,IEA
2,A0A002,GO:0042626,ECO:0000256,IEA
3,A0A002,GO:0055085,ECO:0000256,IEA
4,A0A009DWE1,GO:0022857,ECO:0000256,IEA
...,...,...,...,...
44681368,Z9JZ28,GO:0006811,ECO:0000323,IEA
44681369,Z9JZ28,GO:0015986,ECO:0000256,IEA
44681370,Z9JZ28,GO:0015986,ECO:0000256,IEA
44681371,Z9JZ28,GO:1902600,ECO:0000256,IEA


In [10]:
# Count the rows per evidence label in a single pass instead of one full boolean scan
# per label. dropna=False also reports annotations whose ECO code got no label.
evidence_counts = df_GO_UID["evidence"].value_counts(dropna = False)
evidence_codes = list(evidence_counts.index)
for ev, count in evidence_counts.items():
    print("%s : %s" % (ev, count))

ISS : 8548
TAS : 2305
ISO : 7611
IEP : 68
IMP : 5728
IEA : 44470784
ISM : 455
IGI : 929
HMP : 6
ISA : 152
IPI : 60
NAS : 498
IC : 294
EXP : 53
RCA : 52
IDA : 7962
IGC : 66
IBA : 175802


#### Extracting all data points with experimental evidence

In [11]:
exp_evidence = ["EXP","IDA","IPI","IMP","IGI","IEP", "HTP","HDA","HMP","HGI","HEP"]

# NOTE(review): only the six labels below are selected. The remaining labels in
# exp_evidence (HTP, HDA, HMP, HGI, HEP) are not used — confirm that this is intended.
selected_codes = ["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"]

# One filter per label, concatenated in the order of selected_codes, then de-duplicated.
df_exp = pd.concat(
    [df_GO_UID.loc[df_GO_UID["evidence"] == code] for code in selected_codes],
    ignore_index = True,
).drop_duplicates()
len(df_exp)

12913

In [12]:
# From here on df_GO_UID is an independent (deep) copy of the filtered table df_exp.
df_GO_UID = df_exp.copy(deep = True)

In [13]:
# Store the filtered table, then report how many distinct UniProt IDs it contains.
transporter_table_path = join("..", "..", "data", "GOA", "df_GO_UID_Transporter.pkl")
df_GO_UID.to_pickle(transporter_table_path)

Uniprot_IDs = list({*df_GO_UID["Uniprot ID"]})
print(len(Uniprot_IDs))

6385


## 3. Mapping GO Terms to metabolite IDs:

In [14]:
# Read the pickled table that links GO IDs to substrate identifiers and preview five rows.
go_metabolite_path = join("..", "..", "data", "GOA", "go_terms", "GO_terms_with_sub_IDs.pkl")
df_GO_metabolite = pd.read_pickle(go_metabolite_path)
df_GO_metabolite.head(5)

Unnamed: 0,GO ID,Definition,Name,Namespace,substrate,KEGG ID,PubChem CID,InChI,ChEBI
0,GO:0000006,"""Enables the transfer of zinc ions (Zn2+) from...",high-affinity zinc transmembrane transporter a...,molecular_function,zinc,,23994.0,InChI=1S/Zn,CHEBI:27363
1,GO:0000007,"""Enables the transfer of a solute or solutes f...",low-affinity zinc ion transmembrane transporte...,molecular_function,zinc ion,C00038,,,CHEBI:10113
5,GO:0000064,"""Enables the transfer of L-ornithine from one ...",L-ornithine transmembrane transporter activity,molecular_function,l-ornithine,C00077,,,CHEBI:6280
6,GO:0000095,"""Enables the transfer of S-adenosylmethionine ...",S-adenosyl-L-methionine transmembrane transpor...,molecular_function,s-adenosyl-l-methionine,C00019,,,CHEBI:22036
10,GO:0000102,"""Enables the transfer of L-methionine from one...",L-methionine secondary active transmembrane tr...,molecular_function,l-methionine,C00073,,,CHEBI:6271


In [15]:
# Group the ChEBI IDs by GO ID once, so each annotation needs only a dictionary lookup
# instead of a scan over the whole df_GO_metabolite table.
chebi_by_go = {}
for go_id, chebi_id in zip(df_GO_metabolite["GO ID"], df_GO_metabolite["ChEBI"]):
    chebi_by_go.setdefault(go_id, []).append(chebi_id)

# One record per (annotation, ChEBI ID) pair, in annotation order. The frame is built
# once because DataFrame.append was quadratic and no longer exists in pandas 2.0.
records = [{"Uniprot ID" : UID, "molecule ID" : met_ID}
           for UID, GO_ID in zip(df_GO_UID["Uniprot ID"], df_GO_UID["GO Term"])
           for met_ID in chebi_by_go.get(GO_ID, ())]

df_UID_MID = pd.DataFrame(records, columns =["Uniprot ID", "molecule ID"]).drop_duplicates()

Uniprot_IDs = list(set(df_UID_MID["Uniprot ID"]))
print(len(Uniprot_IDs), len(list(set(df_UID_MID["molecule ID"]))))

df_UID_MID

3420 281


Unnamed: 0,Uniprot ID,molecule ID
0,O14031,CHEBI:5437
1,O14329,CHEBI:10113
2,O42976,CHEBI:9266
3,O74969,CHEBI:5256
4,O76082,CHEBI:23038
...,...,...
6630,Q0GMA8,CHEBI:29035
6631,Q84W56,CHEBI:16411
6632,Q9CAT6,CHEBI:3288
6634,Q9VCI3,CHEBI:6486


## 4. Mapping UniProt IDs to amino acid sequences:

In [16]:
# Write one UniProt ID per line. Sorting makes the file identical between runs
# (set iteration order is not stable), and `with` closes the file even on error.
Uniprot_IDs = sorted(set(df_UID_MID["Uniprot ID"]), key = str)

with open(join("..", "..", "data", "GOA", "UNIPROT_IDs.txt"), "w") as f:
    for ID in Uniprot_IDs:
        f.write(str(ID) + "\n")

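Use the UniProt ID mapping service (https://www.uniprot.org/id-mapping) to map the UniProt IDs written to "UNIPROT_IDs.txt" to their sequences, and store the result as "Uniprot_results.csv" in the folder "data/GOA/":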

In [17]:
# Read the semicolon-separated mapping result and discard its "Entry" column.
uniprot_results_path = join("..", "..", "data", "GOA", "Uniprot_results.csv")
UNIPROT_df = pd.read_csv(uniprot_results_path, sep = ";").drop(columns = ["Entry"])

In [18]:
# Left join: every (UniProt ID, molecule ID) pair is kept and gains the UNIPROT_df columns.
df_UID_MID = pd.merge(df_UID_MID, UNIPROT_df, how = "left", on = "Uniprot ID")

In [19]:
# Persist the final table.
goa_transporter_path = join("..", "..", "data", "GOA", "GOA_Transporter.pkl")
df_UID_MID.to_pickle(goa_transporter_path)