In [1]:
import pandas as pd
import requests
import json
import gzip
import time

# Reading identifiers that are missing:
missingIDsFile = 'unmatch.txt.gz'

missinIds = []
with gzip.open(missingIDsFile, 'rt') as f:
    for row in f:
        missinIds.append(row.replace('"','').strip())
        
missingIds_df = pd.DataFrame({'missing_IDs': missinIds})
id_sample = (
    missingIds_df
    .loc[(~missingIds_df.missing_IDs.str.contains('-'))]
    .sample(500)
    .missing_IDs
)



## Fetching data from uniprot for the sample

In [3]:
def get_uniprot_data(uniprot_id):
    uniprotUrl = 'https://www.ebi.ac.uk/proteins/api/proteins'

    try:
        data = requests.get(f'{uniprotUrl}/{uniprot_id}').json()
    except:
        return None

    sequence = data['sequence']['sequence']
    
    db_ref = pd.DataFrame(data['dbReferences'])[['type','id']]
    db_ref.loc[len(db_ref)] = ['sequence', sequence]
    db_ref['trembl_id'] = uniprot_id
    db_ref['organism'] = data['organism']['taxonomy']
    
    # Adding sequence:
    
    return db_ref


target_df = pd.DataFrame(columns=['type','id','trembl_id'])
for x in id_sample:
    response = get_uniprot_data(x)
    if isinstance(response,pd.DataFrame):
        target_df = target_df.append(response)


target_df.reset_index(drop=True)
print(len(target_df))
print(target_df.head())
print(target_df.columns)

19964
   type        id trembl_id  organism
0  EMBL  AF212302    Q997F2  121791.0
1  EMBL  AY029768    Q997F2  121791.0
2  EMBL  AY029767    Q997F2  121791.0
3  EMBL  AF376747    Q997F2  121791.0
4  EMBL  AJ564621    Q997F2  121791.0
Index(['type', 'id', 'trembl_id', 'organism'], dtype='object')


In [22]:
# Subsetting human ids only:
human_ids = target_df.loc[target_df.organism==9606].trembl_id.unique().tolist()

# Saving human proteins:
with open('batch.fasta', 'wt') as o:
    target_df.loc[(target_df.type == 'sequence') & (target_df.trembl_id.isin(human_ids))].apply(lambda row: o.write(f">{row['trembl_id']}\n{row['id']}\n"), axis=1)




## Setting up blast

Downloading all human reference protein sequence as fasta:
```
https://www.uniprot.org/uniprot/?query=*&fil=organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22+AND+reviewed%3Ayes
```

Installing blast using conda:
```
conda install -c bioconda blast
```

Building local blast database:

```
makeblastdb -in ./uniprot_blast_seed.fasta -dbtype prot
```

output:

```
Adding sequences from FASTA; added 42373 sequences in 1.25296 seconds.
```



In [11]:
from Bio.Blast.Applications import NcbiblastpCommandline

In [23]:
inputFile = "batch.fasta"
uniprotDb = "uniprot_db"
eValueThreshold = 1e-10

from Bio.Blast.Applications import NcbiblastpCommandline
blast_output_file='blastout.tsv'

from Bio.Blast.Applications import NcbiblastpCommandline
cline = NcbiblastpCommandline(
    query=inputFile, 
    db=uniprotDb,outfmt=6, 
out=blast_output_file, evalue= eValueThreshold)
print(cline)
stdt, stdr= cline()

blastp -out blastout.tsv -outfmt 6 -query batch.fasta -db uniprot_db -evalue 1e-10


In [17]:
seq = '>test seq\n' + target_df.loc[target_df.type == 'sequence','id'].iloc[1]

In [24]:
blast_out_df = pd.read_csv('blastout.tsv', sep='\t', header=None, index_col=False, names=['qseqid','sseqid','pident','length','mismatch'])
len(blast_out_df.qseqid.unique())


17

In [29]:
len(blast_out_df.loc[blast_out_df.pident > 90].qseqid.unique())

16

In [94]:
target_df.loc[target_df.type == 'sequence']

Unnamed: 0,type,id,trembl_id
30,sequence,MSLPHLSLADARNLHLAAQGLLNKPRRRASLEDIPATISRMSLLQI...,P75843
100,sequence,MSKKPGGPGKSRAVNMLKRGMPRVLSLIGLKRAMLSLIDGKGPIRF...,Q8JU44
51,sequence,MLSQIAICIWVESTAILQDCQRALSADRYQLQVCESGEMLLEYAQT...,Q79PF6
37,sequence,MQNAVIYQPVQIEYLKKTSDLFSEQQLADSFVLIFHLKGNGYISIG...,P40408
19,sequence,MESSAKRKMDPDNPDEGPSSKVPRPETPVTKATTFLQTMLRKEVNS...,P03169
...,...,...,...
19,sequence,MVYSDKRIRKPAIRRSWLKRREKSDRTLRGQEDFVEVDWDVALDLV...,Q8CZY0
46,sequence,MTAQPQTLKIRRPDDWHIHLRDDEMLSTVLPYTSEVFARAIVMPNL...,Q8ZFU4
6,sequence,MHFRVTGEWNGEPFNRVIEAENINDCYDHWMIWAQIAHADVTNIRI...,C6ZCU7
3,sequence,MHSLFFEEPEPSGFGSSGQSSSLLAPDSPRPSTSQVQGPLHVHTPT...,P88948


In [122]:
len(blast_out_df.loc[blast_out_df.pident > 90].qseqid.unique())


36

In [8]:
uniref_df = pd.read_csv('/Users/dsuveges/Downloads/uniref-identity_0.9+taxonomy__Homo+sapiens+(Human)+[9606]_.tab.gz', sep='\t')
uniref_df.head()

Unnamed: 0,Cluster ID,Unnamed: 1,Cluster name,Size,Cluster members,Organisms,Length,Identity
0,UniRef90_Q6MZZ7-2,partially reviewed and UniParc,Cluster: Isoform 2 of Calpain-13,4,Q6MZZ7-2; A0A2J8K0E9; A0A2J8VNG3; UPI000DC1A1AE,Homo sapiens (Human); Pan troglodytes (Chimpan...,423,90%
1,UniRef90_P15907,partially reviewed and UniParc,"Cluster: Beta-galactoside alpha-2,6-sialyltran...",51,P15907; C9JH16; C9JVK7; P15907-2; A0A6D2WUU6; ...,Homo sapiens (Human); Pan troglodytes (Chimpan...,406,90%
2,UniRef90_UPI0007DC7430,UniParc,Cluster: inactive heparanase-2 isoform X4,1,UPI0007DC7430,Homo sapiens (Human),441,90%
3,UniRef90_Q4G148,partially reviewed and UniParc,Cluster: Glucoside xylosyltransferase 1,32,Q4G148; H2RCX7; A0A6D2YBP6; A0A2R8ZJB2; G3RQC4...,Homo sapiens (Human); Pan troglodytes (Chimpan...,440,90%
4,UniRef90_A0A1D5QV24,partially reviewed and UniParc,Cluster: Uncharacterized protein,7,A0A1D5QV24; A0A0A6YYK5; Q9BT81-2; G7N0I4; L8HQ...,Macaca mulatta (Rhesus macaque); Homo sapiens ...,440,90%


In [9]:
uniref_df = (
    uniref_df
    .assign(member_array = uniref_df['Cluster members'].apply(lambda x: x.split(';')))
    .explode('member_array')
)

0         [Q6MZZ7-2,  A0A2J8K0E9,  A0A2J8VNG3,  UPI000DC...
1         [P15907,  C9JH16,  C9JVK7,  P15907-2,  A0A6D2W...
2                                           [UPI0007DC7430]
3         [Q4G148,  H2RCX7,  A0A6D2YBP6,  A0A2R8ZJB2,  G...
4         [A0A1D5QV24,  A0A0A6YYK5,  Q9BT81-2,  G7N0I4, ...
                                ...                        
114154          [F8WEG4,  F8WEU0,  A0A087WZQ3,  A0A087WTC0]
114155                                         [A0A494C0W8]
114156                                             [Q9UJV7]
114157    [Q5HYL7,  Q5EB63,  A0A024RA01,  Q5HYL7-2,  A0A...
114158    [F5H0R6,  A0A2J8V1K1,  A0A2J8PZH7,  UPI0008F49...
Name: Cluster members, Length: 114159, dtype: object

In [114]:
uniref_df.head()

Unnamed: 0,Cluster ID,Unnamed: 1,Cluster name,Size,Cluster members,Organisms,Length,Identity,member_array
0,UniRef90_Q6MZZ7-2,partially reviewed and UniParc,Cluster: Isoform 2 of Calpain-13,4,Q6MZZ7-2; A0A2J8K0E9; A0A2J8VNG3; UPI000DC1A1AE,Homo sapiens (Human); Pan troglodytes (Chimpan...,423,90%,Q6MZZ7-2
0,UniRef90_Q6MZZ7-2,partially reviewed and UniParc,Cluster: Isoform 2 of Calpain-13,4,Q6MZZ7-2; A0A2J8K0E9; A0A2J8VNG3; UPI000DC1A1AE,Homo sapiens (Human); Pan troglodytes (Chimpan...,423,90%,A0A2J8K0E9
0,UniRef90_Q6MZZ7-2,partially reviewed and UniParc,Cluster: Isoform 2 of Calpain-13,4,Q6MZZ7-2; A0A2J8K0E9; A0A2J8VNG3; UPI000DC1A1AE,Homo sapiens (Human); Pan troglodytes (Chimpan...,423,90%,A0A2J8VNG3
0,UniRef90_Q6MZZ7-2,partially reviewed and UniParc,Cluster: Isoform 2 of Calpain-13,4,Q6MZZ7-2; A0A2J8K0E9; A0A2J8VNG3; UPI000DC1A1AE,Homo sapiens (Human); Pan troglodytes (Chimpan...,423,90%,UPI000DC1A1AE
1,UniRef90_P15907,partially reviewed and UniParc,"Cluster: Beta-galactoside alpha-2,6-sialyltran...",51,P15907; C9JH16; C9JVK7; P15907-2; A0A6D2WUU6; ...,Homo sapiens (Human); Pan troglodytes (Chimpan...,406,90%,P15907


In [10]:
uniref_df.loc[uniref_df.member_array.isin(human_ids.tolist())]

Unnamed: 0,Cluster ID,Unnamed: 1,Cluster name,Size,Cluster members,Organisms,Length,Identity,member_array
837,UniRef90_O95681,unreviewed,Cluster: Tenascin-X (Fragment),1,O95681,Homo sapiens (Human),423,90%,O95681
1436,UniRef90_A0A140VK41,partially reviewed and UniParc,Cluster: Ribose-phosphate diphosphokinase,14,A0A140VK41; F6U0X8; K7D407; I0FJ99; A0A2K6P908...,Homo sapiens (Human); Macaca mulatta (Rhesus m...,321,90%,A0A140VK41
5277,UniRef90_Q96CW7,unreviewed,Cluster: C4orf42 protein,4,Q96CW7; Q4W5N5; A0A2I3SZI4; A0A2R8ZJ19,Homo sapiens (Human); Pan troglodytes (Chimpan...,91,90%,Q96CW7
67805,UniRef90_P79483,partially reviewed and UniParc,Cluster: HLA class II histocompatibility antig...,428,P79483; A0A4D6G1L1; A0A223G198; A0A223G1D2; A0...,Homo sapiens (Human); Macaca mulatta (Rhesus m...,266,90%,P79483


In [120]:
missingIds_df.missing_IDs


0         100008588
1         100147744
2        A0A024A2C9
3        A0A024B7W1
4        A0A024QZR3
            ...    
19750        X5D7A4
19751        X5D7U3
19752        X5D8V5
19753        X5DNE0
19754        X5HMX4
Name: missing_IDs, Length: 19755, dtype: object