## Homology analysis

Generate BLAST alignments of the new DisProt against the old DisProt and against PDB seqres

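Install BLAST+ in your home directory. Check the version and paths before running: the `LATEST` URL tracks the current release, so the version pinned in the file name below (2.13.0) may need updating, and the `PATH` line assumes the archive was extracted in `/home/$USER`.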
```
wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.13.0+-x64-linux.tar.gz
tar -xf ncbi-blast-2.13.0+-x64-linux.tar.gz
export PATH="/home/$USER/ncbi-blast-2.13.0+/bin:$PATH" 
```
Download PDB seqres
```
mkdir -p ../data/output/blastdb
wget https://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt.gz -O ../data/output/blastdb/pdb_seqres.txt.gz
gunzip ../data/output/blastdb/pdb_seqres.txt.gz
```

Build the BLAST protein databases (old DisProt and PDB seqres)
```
makeblastdb -in ../data/output/disprot_old.fasta -dbtype prot -out ../data/output/blastdb/disprot_old.fasta
makeblastdb -in ../data/output/blastdb/pdb_seqres.txt -dbtype prot
```

Run BLAST
```
blastp -db ../data/output/blastdb/disprot_old.fasta -query ../data/output/disprot_new.fasta -out ../data/output/disprot_new_old.blast -outfmt 6 -num_threads 12
blastp -db ../data/output/blastdb/pdb_seqres.txt -query ../data/output/disprot_new.fasta -out ../data/output/disprot_new_pdb.blast -outfmt 6 -num_threads 12
```

In [2]:
# Imports: Biopython pairwise aligner and substitution matrices
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo
import sys

In [3]:
# --- Output -----------------------------------------------------------------
# Tab-separated summary table written by the alignment cell.
out_file = "../data/output/homology.tsv"

# --- BLAST tabular results (produced with `-outfmt 6`) -----------------------
# New DisProt queried against PDB seqres and against the old DisProt release.
disprot_new_pdb_blast_file = "../data/output/disprot_new_pdb.blast"
disprot_new_old_blast_file = "../data/output/disprot_new_old.blast"

# --- FASTA sequence files ----------------------------------------------------
# Subject databases (PDB seqres, old DisProt) and the query set (new DisProt).
seqres_fasta_file = "../data/output/blastdb/pdb_seqres.txt"
disprot_old_fasta_file = "../data/output/disprot_old.fasta"
disprot_new_fasta_file = "../data/output/disprot_new.fasta"

In [4]:
# Collect all sequences
def _read_fasta(path):
    """Read a FASTA file into a dict mapping record name to sequence.

    The record name is the first whitespace-delimited token of the header
    line (the text after ">"). Sequence lines belonging to the same record
    are concatenated, so wrapped (multi-line) FASTA files are read in full.
    Blank lines are ignored. Records with a header but no sequence lines are
    not stored.

    Raises:
        ValueError: if a header has no name, or sequence data appears before
            the first header of the file.
    """
    records = {}
    name = None
    chunks = []
    with open(path) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                # A blank line must not wipe the record being read.
                continue
            if line.startswith(">"):
                # A new header closes the previous record.
                if name is not None and chunks:
                    records[name] = "".join(chunks)
                tokens = line[1:].split()
                if not tokens:
                    raise ValueError("FASTA header without a name in {}".format(path))
                name = tokens[0]
                chunks = []
            elif name is None:
                raise ValueError("Sequence data before the first header in {}".format(path))
            else:
                chunks.append(line)
    # Flush the last record of the file.
    if name is not None and chunks:
        records[name] = "".join(chunks)
    return records


# Files are read in order; a name that appears in several files keeps the
# sequence from the last file that defines it.
sequences = {}
for file_name in [disprot_new_fasta_file, disprot_old_fasta_file, seqres_fasta_file]:
    sequences.update(_read_fasta(file_name))

print(list(sequences.keys())[:10])
print(len(sequences))

['DP02342|P06837', 'DP02348|Q8N5F7', 'DP02361|O95429', 'DP02364|Q9H6Z4-3', 'DP02376|Q4ACU6', 'DP02377|Q9GRZ3', 'DP02393|Q8IZD2', 'DP02401|O61380', 'DP02405|Q8IYW5', 'DP02411|Q08050']
753808


In [5]:
# Parse blast output
# Recalculate identity percentage normalizing over the query length
#
# Input is BLAST tabular output (-outfmt 6), 12 whitespace-separated columns:
#   qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
#
# Example rows:
#   DP02150 DP02150 100.000 462     0       0       1       462     1       462     0.0     939
#   DP02150 DP01437 37.895  95      58      1       288     382     199     292     1.58e-12        64.7
#   DP02150 DP02426 43.878  98      49      4       262     356     115     209     1.16e-10        60.5
#   DP02849 DP00563 45.833  24      9       1       144     163     139     162     1.7     25.8
#
# Result: blast[qseqid][db][sseqid] = best identity (% of the query length)
# over all HSPs reported for that query/subject pair.

blast = {}

for blast_file, db in zip([disprot_new_old_blast_file, disprot_new_pdb_blast_file],
                          ["disprot", "pdb"]):
    with open(blast_file) as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields or fields[0].startswith("#"):
                # Skip blank lines and comment lines (e.g. from -outfmt 7).
                continue
            if len(fields) != 12:
                raise ValueError("{}:{}: expected 12 columns, found {}".format(
                    blast_file, line_number, len(fields)))
            qseqid, sseqid, pident, length = fields[:4]

            # Number of identical columns in the HSP. BLAST reports pident as
            # identities / alignment length (rounded to 3 decimals), so the
            # count is recovered by rounding. Using "query span - mismatches"
            # instead would also count query residues aligned to gaps in the
            # subject as identical (second example row: 95 - 58 = 37, while
            # 37.895% of 95 columns is 36).
            n_identical = round(float(pident) * int(length) / 100)
            query_identity = n_identical * 100 / len(sequences[qseqid])

            # Keep the best HSP per query/subject pair.
            hits = blast.setdefault(qseqid, {}).setdefault(db, {})
            hits[sseqid] = max(hits.get(sseqid, 0.0), query_identity)
    # Cumulative number of queries with at least one hit so far.
    print(blast_file, len(blast))

../data/output/disprot_new_old.blast 494
../data/output/disprot_new_pdb.blast 494


In [None]:
def _aligned_identity(align_func, query_seq, subject_seq):
    """Return the identity (% of the query length) of the best pairwise alignment.

    Args:
        align_func: a pairwise2 alignment function taking a substitution
            matrix plus gap-open and gap-extend penalties
            (pairwise2.align.globalds or pairwise2.align.localds).
        query_seq: query sequence; its length is the normalisation factor.
        subject_seq: subject sequence.

    Returns:
        The percentage as a float, or None when no alignment is returned.
    """
    alignments = align_func(query_seq, subject_seq, MatrixInfo.blosum62, -10, -0.5,
                            one_alignment_only=True)
    if not alignments:
        return None
    # Unpack positionally: (seqA, seqB, score, start, end).
    aligned_query, aligned_subject, _score, begin, end = alignments[0]
    # Count identities only inside the aligned region. For global alignments
    # this is the whole alignment; for local alignments the returned strings
    # also contain the unaligned flanks, which must not contribute matches.
    identical = sum(a == b for a, b in zip(aligned_query[begin:end], aligned_subject[begin:end]))
    return identical * 100 / len(query_seq)


with open(out_file, "w") as fout:

    fout.write("disprot_id\tdb\tblast_acc\tblast_id\tlocal_acc\tlocal_id\tglobal_acc\tglobal_id\n")

    for disprot_id in blast:
        # Progress indicator: one line per query.
        print(disprot_id)

        query_seq = sequences[disprot_id]

        for db, blast_hits in blast[disprot_id].items():

            # Re-align the query against every BLAST hit, globally and locally.
            matches = {"local": [], "global": []}
            for sseqid in blast_hits:
                for mode, align_func in (("global", pairwise2.align.globalds),
                                         ("local", pairwise2.align.localds)):
                    try:
                        identity = _aligned_identity(align_func, query_seq, sequences[sseqid])
                    except Exception as e:
                        # Best effort: a failed alignment (missing sequence,
                        # out-of-memory on very long pairs, ...) must not
                        # abort the run, but it should not go unnoticed.
                        print("WARNING: {} alignment failed for {} vs {}: {!r}".format(
                            mode, disprot_id, sseqid, e), file=sys.stderr)
                        continue
                    if identity is not None:
                        matches[mode].append((sseqid, identity))

            # Best hit per method; ties keep the first hit encountered.
            def by_identity(item):
                return item[1]

            best_blast = max(blast_hits.items(), key=by_identity)
            best_local = max(matches["local"], key=by_identity, default=(None, None))
            best_global = max(matches["global"], key=by_identity, default=(None, None))

            # Write to file
            fout.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(disprot_id, db, *best_blast, *best_local, *best_global))


DP02342|P06837
DP02348|Q8N5F7
DP02361|O95429
DP02364|Q9H6Z4-3
DP02376|Q4ACU6
DP02377|Q9GRZ3
DP02393|Q8IZD2
DP02401|O61380
DP02405|Q8IYW5
DP02411|Q08050
DP02418|P35568
DP02421|O94687
DP02445|Q96RS0
DP02448|Q03707
DP02449|Q15583
DP02457|O95400
DP02465|P07199
DP02468|P17480
DP02470|Q12495
DP02472|Q3U182
