### Why we used Diamond instead of Biopython:
#### We first tried to run BLAST through the Biopython package, but it could not process such a large number of queries. To overcome this, we used DIAMOND, which can handle a large number of queries.


In [1]:
# Download the pinned DIAMOND v2.1.6 Linux binary, unpack it into the working
# directory, and remove the archive afterwards.
# The URL uses https:// directly instead of relying on an HSTS/redirect upgrade
# from plain http, and -q keeps wget's progress log out of the saved notebook.
!wget -q https://github.com/bbuchfink/diamond/releases/download/v2.1.6/diamond-linux64.tar.gz
!tar xzf diamond-linux64.tar.gz
!rm diamond-linux64.tar.gz

URL transformed to HTTPS due to an HSTS policy
--2023-05-24 18:55:00--  https://github.com/bbuchfink/diamond/releases/download/v2.1.6/diamond-linux64.tar.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/31987083/6b15e096-093f-4e8e-b9ff-5e7bb45db282?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230524%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230524T185501Z&X-Amz-Expires=300&X-Amz-Signature=91efbca0b0263acfc745213fe47b482589620464b1e126d47529b75ae60be402&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=31987083&response-content-disposition=attachment%3B%20filename%3Ddiamond-linux64.tar.gz&response-content-type=application%2Foctet-stream [following]
--2023-05-24 18:55:01--  https://objects.githubusercontent.com/github-production-release-as

In [2]:
from subprocess import Popen, PIPE
import time
import numpy as np
import pandas as pd

In [3]:
# Name passed to diamond's -d option (database to build and then search).
db_name = 'train_db'
# File that diamond blastp writes its hits to (passed via -o).
outfile_name = 'matches.tsv'
# Value passed to --max-target-seqs in the blastp call.
k = 16

##### Creating a database from the train sequence fasta file

In [4]:
# Build the DIAMOND database `db_name` from the training sequences.
p = Popen(['./diamond', 'makedb',
           '--in', '/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta',
           '-d', db_name], stdin=PIPE, stdout=PIPE)
# Only stdout is piped, so `stderr` is always None here; anything the tool
# writes to stderr goes straight to the notebook output.
stdout, stderr = p.communicate()
# Fail loudly here rather than letting a later cell trip over a missing database.
if p.returncode != 0:
    raise RuntimeError(f'diamond makedb failed with exit code {p.returncode}')

diamond v2.1.6.160 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org
Please cite: http://dx.doi.org/10.1038/s41592-021-01101-x Nature Methods (2021)

#CPU threads: 4
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta
Opening the database file...  [0.194s]
Loading sequences...  [2.762s]
Masking sequences...  [2.684s]
Writing sequences...  [0.118s]
Hashing sequences...  [0.033s]
Loading sequences...  [0s]
Writing trailer...  [0.004s]
Closing the input file...  [0s]
Closing the database file...  [0.001s]

Database sequences  142246
  Database letters  78752603
     Database hash  e491561cb14a4f3b4cbeb2d58ede2339
        Total time  5.799000s


##### Running DIAMOND blastp to align the protein sequences in testsuperset.fasta (queries) against the training database

In [5]:
# Align every test sequence against the training database and write the hits
# to `outfile_name`; `k` is passed as --max-target-seqs.
time0 = time.time()
p = Popen(['./diamond', 'blastp', '-d', db_name,
           '-q', '/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta',
           '-o', outfile_name, '--max-target-seqs', str(k), '--quiet'], stdin=PIPE, stdout=PIPE)
# Only stdout is piped, so `stderr` is always None here.
stdout, stderr = p.communicate()
# Stop on failure so later cells never read a missing or partial hits file.
if p.returncode != 0:
    raise RuntimeError(f'diamond blastp failed with exit code {p.returncode}')
print(f'Execution time: {time.time()-time0}s')

Execution time: 242.81781554222107s


#### Loading the matches into a DataFrame, including the bitscore of each match to keep track of match quality

In [6]:
# Column labels applied to the headerless, tab-separated hits file.
blast_columns = [
    'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
    'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore',
]
matches = pd.read_csv(outfile_name, sep='\t', names=blast_columns, header=None)
# Keep only the part of each query id that precedes a literal backslash + "t"
# (the two-character sequence, not a tab character).
# NOTE(review): presumably the tool escapes a tab in the FASTA header this way - confirm.
matches['qseqid'] = matches['qseqid'].map(lambda query_id: query_id.split('\\t')[0])
matches.head(20)

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,Q9CQV8,Q9CQV8,100.0,246,0,0,1,246,1,246,1.1e-167,464.0
1,Q9CQV8,P35213,98.8,246,3,0,1,246,1,246,1.0500000000000001e-165,459.0
2,Q9CQV8,P31946,98.8,246,3,0,1,246,1,246,2.12e-165,458.0
3,Q9CQV8,V9HWD6,98.8,246,3,0,1,246,1,246,2.12e-165,458.0
4,Q9CQV8,Q5PRD0,91.0,244,22,0,3,246,1,244,1.2e-150,421.0
5,Q9CQV8,P63104,87.2,242,31,0,3,244,1,242,1.04e-142,401.0
6,Q9CQV8,Q5ZKC9,86.8,242,32,0,3,244,1,242,2.97e-142,400.0
7,Q9CQV8,P63101,86.8,242,32,0,3,244,1,242,5.99e-142,399.0
8,Q9CQV8,P63102,86.8,242,32,0,3,244,1,242,5.99e-142,399.0
9,Q9CQV8,P68254,81.0,242,46,0,3,244,1,242,4.08e-133,377.0


### Getting GO terms from similar sequences


In [7]:
train_terms = pd.read_csv(
    '/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv',
    sep='\t',
)
# Series indexed by EntryID; each value is a NumPy array of that entry's terms.
arrayofgos = train_terms.groupby('EntryID').term.apply(lambda entry_terms: np.array(entry_terms))
# Attach to every hit the term array of its subject sequence (label lookup by sseqid).
subject_ids = matches.sseqid.values
matches['terms'] = arrayofgos.loc[subject_ids].to_numpy()
# Number of hit rows per query, broadcast back onto every row of that query.
hits_per_query = matches.groupby(['qseqid']).qseqid.count()
matches['ntargets'] = hits_per_query.loc[matches['qseqid'].values].to_numpy()

#### We consider all GO terms of the matched sequences and assign each term a probability (the share of a query's matches that carry it), since multiple GO terms can be associated with the same function

In [8]:
# One row per (query, term): each hit contributes 1/ntargets to every term it
# carries, and the contributions are summed per (qseqid, terms) and rounded to
# three decimals.
# NOTE(review): the first reset_index() keeps the old row labels as an 'index'
# column, which is then summed along with 'ntargets'; that summed column looks
# like an artifact - consider reset_index(drop=True) if nothing relies on it.
test_df = (
    matches[['qseqid', 'terms', 'ntargets']]
    .explode('terms')
    .reset_index()
    .assign(ntargets=lambda frame: 1 / frame['ntargets'])
    .groupby(['qseqid', 'terms'])
    .sum()
    .round(3)
    .reset_index()
)
test_df.head()

Unnamed: 0,qseqid,terms,index,ntargets
0,A0A023PXF5,GO:0000722,5019000,0.7
1,A0A023PXF5,GO:0000723,5019000,0.7
2,A0A023PXF5,GO:0003674,3584991,0.5
3,A0A023PXF5,GO:0003678,3584991,0.5
4,A0A023PXF5,GO:0003824,3584991,0.5
