# Identificação de Homólogos

In [1]:
import os
from Bio import SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
from scripts_python import BLASTer

# Notebook configuration.
# `a` is the current working directory; os.getcwd() replaces the IPython-only
# `%pwd` magic so the cell is also valid plain Python (e.g. after exporting the
# notebook to a script).
a = os.getcwd()
# Project root: two directory levels above the working directory.
# os.path.dirname is used instead of splitting on a hard-coded "/" so the
# computation also works with the platform's native path separator.
wd = os.path.dirname(os.path.dirname(a))
# `seq_id` and `gene` are used below to build the output FASTA file name;
# `gene` also names the sub-directory of data/homologue holding the inputs.
seq_id = "NM_004335"
prot_seq_id = "Q10589"
gene = "BST2"
# Saved BLAST results (file names inside data/homologue/<gene>).
file_blast_ncbi = "00D0DT8J016-Alignment.xml"
file_blast_uniprot = "B20210131216DA2B77BFBD2E6699CA9B6D1C41EB200F810O.fasta"


## Resultados do BLAST no NCBI

In [2]:
# Parse the saved NCBI BLAST XML report into a single BLAST record.
# The context manager guarantees the file is closed even if parsing raises.
with open(os.path.join(wd, "data/homologue", gene, file_blast_ncbi)) as result_handle:
    blast_ncbi = NCBIXML.read(result_handle)

# Report how many alignments (hits) the record contains.
print('Foram carregadas {0} seq.'.format(len(blast_ncbi.alignments)))

Foram carregadas 100 seq.


Apesar da diversidade de organismos, a maioria das sequências identificadas é humana, o que confirma mais uma vez a redundância de sequências desta base de dados.
Por outro lado, mesmo existindo filtragem baseada em scores, surgiram matches não específicos, como sequências de bactérias.

### Resumo

In [3]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import re

query_len = blast_ncbi.query_length
homologue = []   # SeqRecords built from the HSPs that pass the filters
sources = {}     # organism name -> number of accepted HSPs

# Acceptance thresholds for an HSP.
MIN_SCORE = 80
MAX_EVALUE = 1.0e-10
MIN_IDENTITY = 0.80

# First "[...]" tag of the hit definition, taken as the source organism.
# Raw string: the pattern contains backslashes that are not valid Python escapes.
species_pattern = re.compile(r'\[[^\]]*\]')

print("Accession", "Identity%")
for aln in blast_ncbi.alignments:
    for HSP in aln.hsps:
        if not (HSP.score > MIN_SCORE and HSP.expect < MAX_EVALUE):
            continue
        # Fraction of identical positions over the aligned length (0-1, not a percentage).
        identity = HSP.identities / HSP.align_length
        if not identity > MIN_IDENTITY:
            continue

        print(aln.accession, identity)

        # Hits whose definition carries no bracketed organism are counted as
        # "Unknown" instead of aborting the whole loop with an IndexError.
        tag = species_pattern.search(aln.hit_def)
        if tag:
            species = tag.group(0).replace("[", "").replace("]", "")
        else:
            species = "Unknown"
        sources[species] = sources.get(species, 0) + 1

        # The record holds the aligned subject string of the HSP, identified by
        # the part of the alignment title before the first ">".
        homologue.append(SeqRecord(Seq(HSP.sbjct), aln.title.split(">", 1)[0], "", ""))


print("Total:", len(homologue))

Accession Identity%
NP_004326 1.0
BAD96844 0.9944444444444445
NP_001266147 0.9567567567567568
XP_018869871 0.9513513513513514
NP_001266560 0.9459459459459459
NP_001177409 0.9459459459459459
ADI58596 0.907608695652174
NP_001295602 0.9017341040462428
BAG64608 0.9629629629629629
NP_001166058 0.9127906976744186
ADI58598 0.9190751445086706
XP_030675274 0.9190751445086706
3MQB_A 1.0
3MQC_A 0.9739130434782609
3MQ7_A 0.9478260869565217
3NWH_A 0.963302752293578
2XG7_A 1.0
XP_024206821 0.92
3MQ9_A 0.9318181818181818
2X7A_A 0.9672131147540983
Total: 20


In [4]:
# Show how many accepted hits came from each organism.
for organism, hits in sources.items():
    print(organism, ":", hits)

# Save the accepted sequences as FASTA under data/homologue/<gene>; the call is
# the last expression of the cell, so its return value is shown as the output.
fasta_path = os.path.join(wd, "data/homologue", gene, f"{seq_id}_{gene}.fasta")
SeqIO.write(homologue, fasta_path, "fasta")

Homo sapiens : 9
Pan paniscus : 1
Gorilla gorilla gorilla : 1
Gorilla gorilla : 1
Pan troglodytes : 2
Pongo pygmaeus : 1
Nomascus leucogenys : 2
Pongo abelii : 1
Hylobates agilis : 1
Escherichia coli K-12 : 1


20

### Resultados UNIPROT

#### Comentários

In [5]:
# Read the FASTA file exported from the UniProt BLAST and collect, for every
# record, the record itself and the identifier found between the first pair of
# "|" characters of its id.
path = os.path.join(wd, "data/homologue", gene, file_blast_uniprot)
blast_uniprot_raw = SeqIO.parse(path, format="fasta")
blast_uniprot = []
blast_uniprot_id = []

for protein in blast_uniprot_raw:
    blast_uniprot.append(protein)
    # A dedicated name is used here: assigning to `seq_id` would overwrite the
    # notebook-level `seq_id` setting that other cells use to name output files.
    # Raw string: "\|" is not a valid Python escape sequence.
    uniprot_accession = re.findall(r'\|[^|]*\|', protein.id)[0].replace("|", "")
    blast_uniprot_id.append(uniprot_accession)

In [6]:
from wget import download

# Fetch the UniProt XML entry of every BLAST hit into data/homologue/<gene>/uniprot.
uniprot_dir = os.path.join(wd, "data/homologue", gene, "uniprot")
# The target directory has to exist before anything can be saved into it.
os.makedirs(uniprot_dir, exist_ok=True)

for accession in blast_uniprot_id:
    url = "https://www.uniprot.org/uniprot/{0}.xml".format(accession)
    path = os.path.join(uniprot_dir, "{0}.xml".format(accession))
    # Entries already on disk are reused, so re-running the notebook does not
    # download every file again.
    if not os.path.exists(path):
        download(url, path)

In [7]:
import re

# Load the UniProt XML record stored on disk for each BLAST hit.
blast_matches = []
for accession in blast_uniprot_id:
    path = os.path.join(wd, "data/homologue", gene, "uniprot", "{0}.xml".format(accession))
    record = SeqIO.read(path, format="uniprot-xml")
    blast_matches.append(record)

# keywords: keyword -> comma-separated ids of the entries annotated with it.
# comments: comment type -> {first comment text -> comma-separated entry ids}.
keywords = {}
an_domains = {}
comments = {}
for entry in blast_matches:
    for kw in entry.annotations.get("keywords", []):
        if kw in keywords:
            keywords[kw] = ", ".join([keywords[kw], entry.id])
        else:
            keywords[kw] = entry.id

    for key, values in entry.annotations.items():
        # Only annotation keys of the form "comment_<type>" are grouped.
        if not re.match("(comment_)[a-z]*", key):
            continue
        kind = key.split("_", 1)[1]
        by_text = comments.setdefault(kind, {})
        # Only the first comment of each type is considered.
        text = values[0]
        if text in by_text:
            by_text[text] = ", ".join([by_text[text], entry.id])
        else:
            by_text[text] = entry.id

# Report: one section per comment type, then the keyword table.
for kind, by_text in comments.items():
    print(kind)
    for text, ids in by_text.items():
        print(text, ":", ids)
    print("\n")

print("Keywords")
for kw, ids in keywords.items():
    print(kw, ":", ids)
print("\n")

function
IFN-induced antiviral host restriction factor which efficiently blocks the release of diverse mammalian enveloped viruses by directly tethering nascent virions to the membranes of infected cells. Acts as a direct physical tether, holding virions to the cell membrane and linking virions to each other. The tethered virions can be internalized by endocytosis and subsequently degraded or they can remain on the cell surface. In either case, their spread as cell-free virions is restricted (PubMed:22520941, PubMed:21529378, PubMed:20940320, PubMed:20419159, PubMed:20399176, PubMed:19879838, PubMed:19036818, PubMed:18342597, PubMed:18200009). Its target viruses belong to diverse families, including retroviridae: human immunodeficiency virus type 1 (HIV-1), human immunodeficiency virus type 2 (HIV-2), simian immunodeficiency viruses (SIVs), equine infectious anemia virus (EIAV), feline immunodeficiency virus (FIV), prototype foamy virus (PFV), Mason-Pfizer monkey virus (MPMV), human T-

As anotações confirmam a relação entre as proteínas homólogas e o suporte ao sistema imunitário, indicando que esta função é conservada entre as espécies de mamíferos portadoras deste gene.

### Features

In [8]:
# features: feature type -> {"<location> [description]" -> comma-separated entry ids}.
features = {}
feature_filter = re.compile("(domain|motif|bond|region)")
for entry in blast_matches:
    for feat in entry.features:
        # Keep only feature types mentioning a domain, motif, bond or region.
        if not feature_filter.search(feat.type):
            continue
        by_desc = features.setdefault(feat.type, {})

        # Label = location, followed by the description when one is present.
        label = str(feat.location)
        if "description" in feat.qualifiers:
            label = " ".join([label, feat.qualifiers["description"]])

        if label in by_desc:
            by_desc[label] = ", ".join([by_desc[label], entry.id])
        else:
            by_desc[label] = entry.id

# Report: one section per feature type.
for kind, by_desc in features.items():
    print(kind)
    for label, ids in by_desc.items():
        print(label, ":", ids)
    print("\n")

topological domain
[0:20] Cytoplasmic : Q10589
[48:161] Extracellular : Q10589
[0:26] Cytoplasmic : Q6WRU0, Q811A2
[47:183] Extracellular : Q6WRU0
[0:30] Cytoplasmic : Q8R2Q8
[51:152] Extracellular : Q8R2Q8
[47:152] Extracellular : Q811A2
[0:42] Cytoplasmic : P10716
[69:550] Extracellular : P10716


transmembrane region
[20:48] Helical; Signal-anchor for type II membrane protein : Q10589
[26:47] Helical; Signal-anchor for type II membrane protein : Q6WRU0, Q811A2
[30:51] Helical; Signal-anchor for type II membrane protein : Q8R2Q8
[42:69] Helical; Signal-anchor for type II membrane protein : P10716


coiled-coil region
[67:152] : Q10589
[65:178] : Q6WRU0
[73:147] : Q8R2Q8
[102:149] : Q811A2


lipid moiety-binding region
[160:161] GPI-anchor amidated serine : Q10589
[182:183] GPI-anchor amidated serine : Q6WRU0
[151:152] GPI-anchor amidated serine : Q8R2Q8, Q811A2


disulfide bond
[52:53] Interchain : Q10589
[62:63] Interchain : Q10589
[90:91] Interchain : Q10589
[53:54] Interchain : Q6

Apesar da variabilidade da localização das features, normal dado que estas não se encontram alinhadas, podemos perceber que estas são comuns entre os homólogos, identificando locais de interesse para estudo.