In [2]:
import pandas as pd
import numpy as np
import re
import random
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
from Bio import SeqIO, Seq, SeqRecord
import pyhmmer
from collections import defaultdict
import xml.etree.ElementTree as ET

In [2]:
field_names = [
    "target_name",
    "accession",
    "query_name",
    "query_accession",
    "E_value_full",
    "score_full",
    "bias_full",
    "E_value_best",
    "score_best",
    "bias_best",
    "exp",
    "reg",
    "clu",
    "ov",
    "env",
    "dom",
    "rep",
    "inc",
    "description",
]

lines = list(open('../data/utp-data/utp-hits.txt').readlines())
lines = lines[3:]

rows = []
for line in lines:
    line = line.strip()
    if line.startswith('#'):
        continue
    fields = re.split(r'\s+', line.strip())
    if len(fields) == 0:
        continue
    row = dict(zip(field_names, fields))
    rows.append(row)
    
df = pd.DataFrame(rows)

In [3]:
our_hits = set(df["target_name"].unique())
their_hits = set((rec.id for rec in SeqIO.parse('../data/uTP_HMM_hits.fasta', 'fasta')))

venn2([our_hits, their_hits], ('Ours', 'Theirs'))
plt.show()

In [3]:
with pyhmmer.plan7.HMMFile("../data/utp-data/utp.hmm") as hmm_file:
    hmm = hmm_file.read()

In [14]:
with pyhmmer.easel.SequenceFile("../data/adk1075-data/ADK1075_proteomics_DB_2.fasta", digital=True, alphabet=hmm.alphabet) as seq_file:
    res = list(pyhmmer.hmmsearch(hmm, seq_file))

In [4]:
pipeline = pyhmmer.plan7.Pipeline(hmm.alphabet)

In [30]:
# Positive control
# ids = set(rec.id for rec in db)
good_c_term_ids = set((rec.id for rec in SeqIO.parse("../data/utp-data/good-c-term-gb.fasta", 'fasta')))
db = SeqIO.parse('../data/ADK1075_proteomics_DB_2.fasta', 'fasta')
positive_control = [rec for rec in db if rec.id in good_c_term_ids]
positive_control_hmmer = [pyhmmer.easel.TextSequence(name=rec.id.encode("utf-8"), sequence=str(rec.seq)).digitize(hmm.alphabet) for rec in positive_control]
hits = pipeline.search_hmm(hmm, pyhmmer.easel.DigitalSequenceBlock(hmm.alphabet, positive_control_hmmer))
hit_ids = set((hit.name.decode("utf-8") for hit in hits))

In [31]:
true_positive_rate = (len(hit_ids & good_c_term_ids) / len(good_c_term_ids)) * 100
false_positive_rate = (len(hit_ids - good_c_term_ids) / len(good_c_term_ids)) * 100
print(f"Positive control: {true_positive_rate=:.02f}%, {false_positive_rate=:.02f}%")

Positive control: true_positive_rate=93.20%, false_positive_rate=0.00%


In [34]:
# negative control - include very random sequences, ucyn-a encoded proteins

db = SeqIO.parse("../data/adk1075-data/ADK1075_proteomics_DB_2.fasta", "fasta")
db = [rec for rec in db if rec.id not in good_c_term_ids]
db_lens = defaultdict(list)
for record in db:
    db_lens[len(record.seq)].append(record)
db_lens = dict(db_lens)

positive_lens = np.array([len(record.seq) for record in positive_control])
mu = np.mean(positive_lens)
sigma = np.std(positive_lens)
positive_no_outliers = [record for record in positive_control if mu - 3 * sigma <= len(record.seq) <= mu + 3 * sigma]
positive_lens = np.array([len(record.seq) for record in positive_no_outliers])

negative_control = []
for _ in range(len(positive_lens)):
    l = int(np.random.normal(mu, sigma))
    closest = min(db_lens.keys(), key=lambda x: abs(x - l))
    chosen_rec_idx = random.randint(0, len(db_lens[closest]) - 1)
    chosen_rec = db_lens[closest].pop(chosen_rec_idx)
    if len(db_lens[closest]) == 0:
        del db_lens[closest]
    negative_control.append(chosen_rec)

negative_control_lens = np.array([len(record.seq) for record in negative_control])
print(f"Positive control:")
print(f"\tMu: {mu:.02f}")
print(f"\tSigma: {sigma:.02f}")
print(f"Negative control:")
print(f"\tMu: {np.mean(negative_control_lens):.02f}")
print(f"\tSigma: {np.std(negative_control_lens):.02f}")

Positive control:
	Mu: 682.33
	Sigma: 227.12
Negative control:
	Mu: 675.90
	Sigma: 229.92


In [35]:
plt.hist(positive_lens, bins=20, alpha=0.5, label='Positive control')
plt.hist(negative_control_lens, bins=20, alpha=0.5, label='Negative control')
plt.legend()
plt.show()

In [36]:
negative_control_hmmer = [pyhmmer.easel.TextSequence(name=rec.id.encode("utf-8"), sequence=str(rec.seq)).digitize(hmm.alphabet) for rec in negative_control]
hits = pipeline.search_hmm(hmm, pyhmmer.easel.DigitalSequenceBlock(hmm.alphabet, negative_control_hmmer))
hit_ids = set((hit.name.decode("utf-8") for hit in hits))

In [37]:
true_negative_rate = (len(hit_ids & good_c_term_ids) / len(good_c_term_ids)) * 100
false_negative_rate = (len(hit_ids - good_c_term_ids) / len(good_c_term_ids)) * 100
print(f"Negative control: {true_negative_rate=:.02f}%, {false_negative_rate=:.02f}%")

Negative control: true_negative_rate=0.00%, false_negative_rate=3.40%


In [38]:
# Precision and recall
# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)
precision = true_positive_rate / (true_positive_rate + false_positive_rate)
recall = true_positive_rate / (true_positive_rate + false_negative_rate)
print(f"Precision: {precision=:.02f}")
print(f"Recall: {recall=:.02f}")

Precision: precision=1.00
Recall: recall=0.96


In [28]:
candidates = {
    "m_venus": {
        (
            "motif_2",
            "motif_1",
            "motif_3",
            "motif_4",
        ): "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKLICTTGKLPVPWPTLVTTLGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSYQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYKWLEEWRERLECWWDHSGTQTQLGACMGALGLHLGSRLDLRGHFGASLIEDAEPGCEWVEEPVEMLPDFPEPFSLPPIPRL",
        (
            "motif_2",
            "motif_1",
            "motif_6",
            "motif_3",
            "motif_4",
        ): "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKLICTTGKLPVPWPTLVTTLGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSYQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYKWLEEWRERLECWWMPTETQTQLGACMGALGLHLGSRLDGARASAWRAALLGRAPPPHAPAEPGCEWVEELDRELPDFPEPFSLPPIPRL",
        (
            "motif_2",
            "motif_1",
            "motif_3",
            "motif_5",
        ): "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKLICTTGKLPVPWPTLVTTLGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSYQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYKWLEEWRERLECWWASRSTQTQLGACMGALGLHLGSRLDDLATFGDGHFYGAEPGCEWVEELRAVVTLZLPDFPD",
        (
            "motif_2",
            "motif_1",
            "motif_3",
            "motif_5",
            "motif_7",
        ): "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKLICTTGKLPVPWPTLVTTLGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSYQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYKWLEEWRERLECWWEHALTQTQLGACMGALGLHLGSRLDQGELGPQVVYASAEPGCEWVEEYVAEAALZLPDFPDAPPJPRLLP",
        (
            "motif_2",
            "motif_1",
            "motif_3",
            "motif_7",
        ): "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKLICTTGKLPVPWPTLVTTLGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSYQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYKWLEEWRERLECWWLEERTQTQLGACMGALGLHLGSRLDEAGSMRSKIPVHAEPGCEWVEEWGSVPLAKPPPJPRLLP",
    },
    "m_neon_green": {
        (
            "motif_2",
            "motif_1",
            "motif_3",
            "motif_4",
        ): "MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEELNLKSTKGDLQFSPWILVPHIGYGFHQYLPYPDGMSPFQAAMVDGSGYQVHRTMQFEDGASLTVNYRYTYEGSHIKGEAQVKGTGFPADGPVMTNSLTAADWCRSKKTYPNDKTIISTFKWSYTTGNGKRYRSTARTTYTFAKPMAANYLKNQPMYVFRKTELKHSKTELNFKEWQKAFTDVMGMDELYKWLEEWRERLECWWGPVGTQTQLGACMGALGLHLGSRLDNEQETQTISAIVAEPGCEWVEEAAPGLPDFPEPFSLPPIPRL",
        (
            "motif_2",
            "motif_1",
            "motif_6",
            "motif_3",
            "motif_4",
        ): "MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEELNLKSTKGDLQFSPWILVPHIGYGFHQYLPYPDGMSPFQAAMVDGSGYQVHRTMQFEDGASLTVNYRYTYEGSHIKGEAQVKGTGFPADGPVMTNSLTAADWCRSKKTYPNDKTIISTFKWSYTTGNGKRYRSTARTTYTFAKPMAANYLKNQPMYVFRKTELKHSKTELNFKEWQKAFTDVMGMDELYKWLEEWRERLECWWLDPKTQTQLGACMGALGLHLGSRLDIAPYFAWRAALLGRAPPPSARAEPGCEWVEEALDDLPDFPEPFSLPPIPRL",
        (
            "motif_2",
            "motif_1",
            "motif_3",
            "motif_5",
        ): "MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEELNLKSTKGDLQFSPWILVPHIGYGFHQYLPYPDGMSPFQAAMVDGSGYQVHRTMQFEDGASLTVNYRYTYEGSHIKGEAQVKGTGFPADGPVMTNSLTAADWCRSKKTYPNDKTIISTFKWSYTTGNGKRYRSTARTTYTFAKPMAANYLKNQPMYVFRKTELKHSKTELNFKEWQKAFTDVMGMDELYKWLEEWRERLECWWIVGDTQTQLGACMGALGLHLGSRLDKPYDFGLWVLAFAEPGCEWVEELDTGLLLZLPDFPD",
        (
            "motif_2",
            "motif_1",
            "motif_3",
            "motif_5",
            "motif_7",
        ): "MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEELNLKSTKGDLQFSPWILVPHIGYGFHQYLPYPDGMSPFQAAMVDGSGYQVHRTMQFEDGASLTVNYRYTYEGSHIKGEAQVKGTGFPADGPVMTNSLTAADWCRSKKTYPNDKTIISTFKWSYTTGNGKRYRSTARTTYTFAKPMAANYLKNQPMYVFRKTELKHSKTELNFKEWQKAFTDVMGMDELYKWLEEWRERLECWWFNAATQTQLGACMGALGLHLGSRLDSRSVVAAIALGAAEPGCEWVEEQGRAMQLZLPDFPDEPPJPRLLP",
        (
            "motif_2",
            "motif_1",
            "motif_3",
            "motif_7",
        ): "MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEELNLKSTKGDLQFSPWILVPHIGYGFHQYLPYPDGMSPFQAAMVDGSGYQVHRTMQFEDGASLTVNYRYTYEGSHIKGEAQVKGTGFPADGPVMTNSLTAADWCRSKKTYPNDKTIISTFKWSYTTGNGKRYRSTARTTYTFAKPMAANYLKNQPMYVFRKTELKHSKTELNFKEWQKAFTDVMGMDELYKWLEEWRERLECWWSIIETQTQLGACMGALGLHLGSRLDASPLPSCRSDDLAEPGCEWVEELYDGPRAAFPPJPRLLP",
    },
}

candidates = {
    k: {
        "+".join((m.split("_")[-1] for m in motifs)): pyhmmer.easel.TextSequence(
            name=(k + "_" + "+".join((m.split("_")[-1] for m in motifs))).encode(
                "utf-8"
            ),
            sequence=seq,
        ).digitize(hmm.alphabet)
        for motifs, seq in seqs.items()
    }
    for k, seqs in candidates.items()
}

candidate_scores = {}

for k, seqs in candidates.items():
    candidate_scores[k] = {}
    for m, seq in seqs.items():
        hits = pipeline.search_hmm(hmm, pyhmmer.easel.DigitalSequenceBlock(hmm.alphabet, [seq]))
        if len(hits) == 0:
            candidate_scores[k][m] = 0
        else:
            candidate_scores[k][m] = hits[0].pvalue
    candidate_scores[k] = dict(sorted(candidate_scores[k].items(), key=lambda x: x[1]))

# Print results
for k, seqs in candidate_scores.items():
    print(f"{k}:")
    for m, score in seqs.items():
        print(f"\t{m}:\t{score:.2e}")
        
# Save the results as fasta
with open("../data/utp-data/reporter-candidates.fasta", "w") as f:
    for k, seqs in candidates.items():
        for m, seq in seqs.items():
            SeqIO.write(
                SeqRecord.SeqRecord(
                    Seq.Seq(seq.textize().sequence),
                    id=k + "__" + m.replace("+", "_"),
                    description="",
                ),
                f,
                "fasta",
            )

m_venus:
	2+1+6+3+4:	4.97e-26
	2+1+3+4:	1.16e-22
	2+1+3+5+7:	1.77e-22
	2+1+3+7:	5.41e-20
	2+1+3+5:	7.82e-19
m_neon_green:
	2+1+6+3+4:	7.31e-26
	2+1+3+4:	1.00e-23
	2+1+3+5+7:	8.84e-23
	2+1+3+7:	2.54e-20
	2+1+3+5:	1.25e-17


In [5]:


"MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEELNLKSTKGDLQFSPWILVPHIGYGFHQYLPYPDGMSPFQAAMVDGSGYQVHRTMQFEDGASLTVNYRYTYEGSHIKGEAQVKGTGFPADGPVMTNSLTAADWCRSKKTYPNDKTIISTFKWSYTTGNGKRYRSTARTTYTFAKPMAANYLKNQPMYVFRKTELKHSKTELNFKEWQKAFTDVMGMDELYKWLEEWRERLECWWLDPKTQTQLGACMGALGLHLGSRLDIAPYFAWRAALLGRAPPPSARAEPGCEWVEEALDDLPDFPEPFSLPPIPRL"[len("MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEELNLKSTKGDLQFSPWILVPHIGYGFHQYLPYPDGMSPFQAAMVDGSGYQVHRTMQFEDGASLTVNYRYTYEGSHIKGEAQVKGTGFPADGPVMTNSLTAADWCRSKKTYPNDKTIISTFKWSYTTGNGKRYRSTARTTYTFAKPMAANYLKNQPMYVFRKTELKHSKTELNFKEWQKAFTDVMGMDELYK"):]

In [6]:
m_neon_green = "MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEELNLKSTKGDLQFSPWILVPHIGYGFHQYLPYPDGMSPFQAAMVDGSGYQVHRTMQFEDGASLTVNYRYTYEGSHIKGEAQVKGTGFPADGPVMTNSLTAADWCRSKKTYPNDKTIISTFKWSYTTGNGKRYRSTARTTYTFAKPMAANYLKNQPMYVFRKTELKHSKTELNFKEWQKAFTDVMGMDELYK"

m_venus = "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKLICTTGKLPVPWPTLVTTLGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSYQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYK"

meme_xml = ET.parse("./utp-data/meme/meme_gb.xml")

motif_seqs = {
    tag.attrib["id"]: tag.attrib["name"] for tag in meme_xml.findall(".//motif")
}
utp = (
    motif_seqs["motif_2"]
    + motif_seqs["motif_1"]
    + motif_seqs["motif_3"]
    + motif_seqs["motif_4"]
)
print(utp)

print(f"{m_neon_green + utp=}\n{m_venus + utp=}")

test = [
    pyhmmer.easel.TextSequence(
        name=b"m_neon_green_utp", sequence=m_neon_green + utp
    ).digitize(hmm.alphabet),
    pyhmmer.easel.TextSequence(
        name=b"m_venus_utp", sequence=m_venus + utp
    ).digitize(hmm.alphabet),
]
hits = pipeline.search_hmm(
    hmm, pyhmmer.easel.DigitalSequenceBlock(hmm.alphabet, test)
)

WLEEWRERLECWWTQTQLGACMGALGLHLGSRLDAEPGCEWVEELPDFPEPFSLPPIPRL
m_neon_green + utp='MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEELNLKSTKGDLQFSPWILVPHIGYGFHQYLPYPDGMSPFQAAMVDGSGYQVHRTMQFEDGASLTVNYRYTYEGSHIKGEAQVKGTGFPADGPVMTNSLTAADWCRSKKTYPNDKTIISTFKWSYTTGNGKRYRSTARTTYTFAKPMAANYLKNQPMYVFRKTELKHSKTELNFKEWQKAFTDVMGMDELYKWLEEWRERLECWWTQTQLGACMGALGLHLGSRLDAEPGCEWVEELPDFPEPFSLPPIPRL'
m_venus + utp='MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKLICTTGKLPVPWPTLVTTLGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSYQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYKWLEEWRERLECWWTQTQLGACMGALGLHLGSRLDAEPGCEWVEELPDFPEPFSLPPIPRL'


In [3]:
len(set("MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKLICTTGKLPVPWPTLVTTLGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSYQSKLSKDPNEKRDHMVLLEFVTAAGITLGMDELYKWLEEWRERLECWWTQTQLGACMGALGLHLGSRLDAEPGCEWVEELPDFPEPFSLPPIPRL"))

In [9]:
hits[0].score

In [20]:
print(f"{hits[0].name.decode('utf-8')}: {hits[0].evalue:.2e}\n{hits[1].name.decode('utf-8')}: {hits[1].evalue:.2e}")

m_neon_green_utp: 3.63e-15
m_venus_utp: 3.89e-15
