In [38]:
from Bio import SeqIO
import re
from Bio.Blast import NCBIXML
import numpy as np
import copy



file_blast = "old/Q9Y6Q6_blast_nr.xml"
file_fasta = "data/Q9Y6Q6_blast_msa.fasta"


In [3]:
"""
Build a MSA from the BLAST alignment of the Q9Y6Q6 "TRAF6 binding domain" against SwissProt (SP) and
Non-Redundant (NR) databases. Use the NCBI webtool. Parse the output
"""

# with open("data/Q9Y6Q6_blast_sp.xml") as f:
with open(file_fasta, "w") as fout:
    with open(file_blast) as f:
        psiblast_rounds = NCBIXML.parse(f)
        # Iterate over Psiblast rounds
        for psiblast_round in psiblast_rounds:
            for alignment in psiblast_round.alignments:
                name = alignment.title.split()[0].split("|")[1].split(".")[0]

                for hsp in alignment.hsps:
                    if hsp.expect <= 10**-5:

                        # Initialize the aligned hit sequence by adding the starting gaps
                        seq = ['-'] * (hsp.query_start - 1)
                        # Add aligned amino acids. Skip positions with gaps in the query
                        for i, aa in enumerate(hsp.query):
                            if aa != '-':
                                seq.append(hsp.sbjct[i])
                        # Pad the sequence by adding end gaps
                        seq += ['-'] * (psiblast_round.query_length - len(seq))

                        # Append and convert to string
                        fout.write(">{}\n{}\n".format(name, "".join(seq)))


In [8]:
"""
Extract the hit fragments which align with in the "TRAF6 binding domain" interacting region.
The position of the interaction region is derived directly from the PDB (601:608) 
and mapped to UniProt (342:349) "QMPTEDEY"
"""
# For each position of the fragment get the aminoacid distribution
seqs = []
with open(file_fasta) as f:
    for line in f:
        if line[0] != ">":
            fragment = line[342:349]  # Extract the fragment corresponding to the interaction region in the query
            if set(fragment) != set("-"):
                # print(fragment, name)
                seqs.append(list(fragment))

In [9]:
"""
Calculate the pattern by looking at the amino acid distribution for each position
"""
seqs = np.array(seqs).T
for row in seqs:
    unique, counts = np.unique(row, return_counts=True)
    print(sorted(tuple(zip(unique, counts)), key=lambda k: k[1], reverse=True))


[('V', 178), ('I', 128), ('M', 66), ('G', 17), ('S', 7), ('-', 4), ('Q', 4), ('L', 2), ('D', 1), ('T', 1)]
[('P', 395), ('D', 4), ('V', 3), ('-', 2), ('G', 2), ('S', 2)]
[('T', 253), ('M', 136), ('A', 4), ('I', 4), ('-', 3), ('R', 3), ('S', 3), ('G', 1), ('Q', 1)]
[('E', 399), ('A', 3), ('S', 3), ('-', 1), ('D', 1), ('G', 1)]
[('D', 391), ('N', 9), ('E', 4), ('R', 3), ('S', 1)]
[('E', 397), ('T', 3), ('G', 2), ('V', 2), ('A', 1), ('P', 1), ('Q', 1), ('S', 1)]
[('Y', 395), ('S', 6), ('R', 3), ('E', 2), ('P', 2)]


In [15]:
"""
Find the new pattern against the Human proteome
"""
matches = []
seq_records = list(SeqIO.parse("data/human_up000005640.fasta", "fasta"))
for record in seq_records:
    # res = re.findall(".P.EDEY", str(record.seq))  # The new pattern, created using Blast
    res = re.findall(".P.EDEY", str(record.seq))  # The new pattern, created using Blast
    if res:
        matches.append((record.name, res))
        
print("Pattern matches against Human proteins", len(matches))
print("Pattern matches against Human proteins", matches)


Pattern matches against Human proteins 3
Pattern matches against Human proteins [('sp|O00750|P3C2B_HUMAN', ['NPGEDEY']), ('sp|Q01954|BNC1_HUMAN', ['VPGEDEY']), ('sp|Q9Y6Q6|TNR11_HUMAN', ['MPTEDEY'])]


In [39]:
"""
Calculate all possible patterns in human of a given lenght, no gaps, no ambiguous chars
"""
pattern_length = 7

counts = {}
seq_records = list(SeqIO.parse("data/human_up000005640.fasta", "fasta"))
for record in seq_records:
    seq = str(record.seq)
    for i in range(len(seq) - pattern_length):
        pattern = seq[i:i+pattern_length]
        counts.setdefault(pattern, 0)
        counts[pattern] += 1
   
counts = sorted([(pattern, c) for pattern, c in counts.items()], key=lambda x:x[1], reverse=True)
print("Count", len(counts))
print("Top pattern", counts[:10])


Count 9923526
Top pattern [('HTGEKPY', 2174), ('IHTGEKP', 1359), ('EEEEEEE', 1104), ('AAAAAAA', 1072), ('QQQQQQQ', 1051), ('GEKPYKC', 993), ('RIHTGEK', 972), ('TGEKPYK', 878), ('PPPPPPP', 871), ('SSSSSSS', 827)]


In [58]:
"""
Add gaps
"""
pattern_length = 7
max_gaps = 2


def add_gaps(pattern, patterns, iteration, max_iterations):
    if iteration < max_iterations:
        for i in range(len(pattern)):
            new_pattern = pattern[:i] + "-" + pattern[i+1:]
            if new_pattern not in patterns:
                patterns.add(new_pattern)
                add_gaps(new_pattern, patterns, iteration + 1, max_iterations)
    return patterns

counts = {}
seq_records = list(SeqIO.parse("data/human_up000005640.fasta", "fasta"))
for record in seq_records:
    seq = str(record.seq)
    for i in range(len(seq) - pattern_length):
        pattern = seq[i:i+pattern_length]
        patterns = add_gaps(pattern, set(), 0, max_gaps)
        for p in patterns:
#             print(p)
            counts.setdefault(p, 0)
            counts[p] += 1
       
   
counts = sorted([(pattern, c) for pattern, c in counts.items()], key=lambda x:x[1], reverse=True)
print("Count", len(counts))
print("Top pattern", counts[:10])


Count 104835606
Top pattern [('-HTGE-P', 3153), ('HTGE-P-', 3153), ('GEKP--C', 3034), ('-H-GEKP', 3032), ('H-GEKP-', 3032), ('GE-PY-C', 2989), ('H-GE-PY', 2985), ('-HTG-KP', 2914), ('HTG-KP-', 2914), ('HTG--PY', 2893)]


In [None]:
"""
Find the new pattern against IntAct TRAF6 interactors

# IntAct interactors of Q9Y4K3 (TRAF6) in MI-TAB format, "intact_TRAF6.tab". Downloaded manually from the website.

# IntAct interactors identifiers of Q9Y4K3 (TRAF6)
awk -F\\t '{print $1; print $2}' intact_TRAF6.tab | grep uniprotkb | cut -d ":" -f2 | sort -u > intact_TRAF6_list.txt

# Download the sequences from UniProt using the "Retrieve/ID mapping" tab. Saved as "intact_TRAF6.fasta"
"""
matches = []
seq_records = list(SeqIO.parse("data/intact_TRAF6.fasta", "fasta"))
for record in seq_records:
    res = re.findall("..P.EDEY", str(record.seq))  # The new pattern, created using Blast
    if res:
        matches.append((record.name, res))
print("The new pattern against IntAct TRAF6 interactors: ", len(matches))


In [None]:
"""
Find the new pattern against STRING TRAF6 interactors

Search Q9Y4K3 in STRING. Filter high interaction score 0.900. Remove 2nd shell interactors. Increase first shell limit
to a high number, e.g. 5,000. Exports, download interactors sequences (fasta).
"""
matches = []
seq_records = list(SeqIO.parse("data/string_protein_sequences_TRAF6.fa", "fasta"))
for record in seq_records:
    res = re.findall("..P.EDEY", str(record.seq))  # The new pattern, created using Blast
    if res:
        matches.append((record.name, res))
print("The new pattern against STRING TRAF6 interactors: ", len(matches))
print()


In [None]:
# ELM ###############

"""
# ELM instances of LIG_TRAF6
http://elm.eu.org/instances.tsv?q=LIG_TRAF6

Find the ELM pattern against the Human genome in SwissProt
"""
matches = []
seq_records = list(SeqIO.parse("data/human_proteins_sp.fasta", "fasta"))
for record in seq_records:
    res = re.findall("..P.E..[FYWHDE].", str(record.seq))  # The ELM pattern
    if res:
        matches.append((record.name, res))
print("ELM pattern against Human proteins", len(matches))

In [None]:
"""
Find the new pattern against IntAct TRAF6 interactors
"""
matches = []
seq_records = list(SeqIO.parse("data/intact_TRAF6.fasta", "fasta"))
for record in seq_records:
    res = re.findall("..P.E..[FYWHDE].", str(record.seq))  # The new pattern, created using Blast
    if res:
        matches.append((record.name, res))
print("ELM pattern against IntAct TRAF6 interactors: ", len(matches))

In [None]:
"""
Find the ELM pattern against STRING TRAF6 interactors
"""
matches = []
seq_records = list(SeqIO.parse("data/string_protein_sequences_TRAF6.fa", "fasta"))
for record in seq_records:
    res = re.findall("..P.E..[FYWHDE]", str(record.seq))  # The new pattern, created using Blast
    if res:
        matches.append((record.name, res))
print("ELM pattern against STRING TRAF6 interactors: ", len(matches))

In [None]:
# Parse MobiDB-lite output
disorder = {}
with open("data/intact_TRAF6.mobidblite") as f:
    for line in f:
        line = line.strip().split()
        if len(line) == 3:  # Exclude sub-regions (polar, ...)
            acc, start, end = line
            disorder.setdefault(acc, []).append((int(start), int(end)))

In [None]:
# Parse Pfam annotations
order = {}
with open("data/intact_TRAF6.pfam") as f:
    for line in f:
        name, start, end = line.strip().split()
        order.setdefault(name, []).append((int(start), int(end)))

In [None]:
"""
Find the ELM pattern against IntAct TRAF6 interactors limiting the search inside disordered and unstructured regions
"""
matches = []
seq_records = list(SeqIO.parse("data/intact_TRAF6.fasta", "fasta"))
for record in seq_records:

    # Exclude non-disordered regions (consider only those predicted by MobiDB-lite)
    if record.name in disorder:
        for start, end in disorder[record.name]:  # Exclude non disordered regions

            # Test if it overlaps with a structured (Pfam) region
            is_structured = False
            for start1, end1 in order.get(record.name, []):

                if min(end - start1, end1 - start) > 0:
                    is_structures = True
                    # print(record.name, "conflict", start, end, start1, end1)

            # Exclude structured regions
            if not is_structured:
                res = re.findall("..P.E..[FYWHDE].", str(record.seq)[start-1:end])  # The ELM pattern
                if res:
                    matches.append((record.name, res))

print("ELM pattern against IntAct TRAF6 interactors inside disordered and not structured regions: ", len(matches))


