In [1]:
import pickle
from typing import Optional

import yaml
from Bio import SeqIO

In [2]:
# PacBio Tat sequences, keyed by the first 9 characters of each FASTA record ID.
# Sequences are stored with a single space between residues.
with open('../Data/CohortTats/PacBio/PacBio_Trimmed_Normal_Tats.fasta') as handle:
    pacbio = dict(
        (record.id[:9], " ".join(record.seq))
        for record in SeqIO.parse(handle, "fasta")
    )
    print(f"PacBio sequence count: {len(pacbio)}")

# Sanger Tat sequences, keyed by the full record ID. PacBio entries are merged
# in afterwards, so a PacBio sequence replaces a Sanger one sharing the same key.
with open('../Data/CohortTats/Sanger/Sanger_Unambiguous_Trimmed_Normal_Tats.fasta') as handle:
    cares_tats = dict(
        (record.id, " ".join(record.seq))
        for record in SeqIO.parse(handle, "fasta")
    )
    cares_tats.update(pacbio)
    # Number of keys contributed only by the Sanger file
    print(f"Supplemented Sanger sequences: {len(cares_tats) - len(pacbio)}")

# miRNA names; the 'Interaction' and 'NonInteraction' keys are read further down.
# NOTE(review): FullLoader can construct more than plain data; if this file only
# holds plain lists/strings, yaml.safe_load would be the safer choice - confirm.
with open("../Data/Klase_Tat_interaction_miRNA_names.yaml") as handle:
    interactions = yaml.load(handle, Loader=yaml.FullLoader)

# miRBase name -> sequence, restricted to records whose name contains 'hsa'.
# NOTE(review): this is a substring test, not a prefix test - confirm no
# other record name can contain 'hsa'.
with open("/data/Chapter_3/Data/RawData/mirbase.fa") as handle:
    name2seq = dict(
        (record.name, str(record.seq))
        for record in SeqIO.parse(handle, 'fasta')
        if 'hsa' in record.name
    )

PacBio sequence count: 133
Supplemented Sanger sequences: 163


In [10]:
import pandas as pd
# Summary statistics of Tat sequence lengths, counted without the separating spaces
pd.Series([len(seq.replace(" ", "")) for seq in cares_tats.values()]).describe()

count    296.000000
mean      96.712838
std       15.618176
min       10.000000
25%      101.000000
50%      101.000000
75%      101.000000
max      106.000000
dtype: float64

In [4]:
#cares_tats

In [5]:
import re

def process_id(mirna_id: str, name2seq: dict) -> Optional[str]:
    """
    Convert a listed miRNA name into a name that exists in ``name2seq``.

    Two names are remapped unconditionally (without consulting ``name2seq``).
    Every other name is tried in a fixed order of spelling variants, and the
    first variant found in ``name2seq`` is returned.

    Parameters
    ----------
    mirna_id : str
        miRNA name as written in the interaction list.
    name2seq : dict
        Known miRNA names (name -> sequence). Only membership is tested, so
        any container supporting ``in`` works.

    Returns
    -------
    Optional[str]
        The matching name, or ``None`` when no variant is present.
    """
    # Hard-coded renames that are returned as-is
    manual_renames = {
        # Just an odd ID typo
        "hsa-miR-103-a": "hsa-mir-103a-1",
        # http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=MI0001727
        # hsa-miR-453 has been merged with hsa-mir-323b and should be used instead
        "hsa-miR-453": "hsa-mir-323b",
    }
    if mirna_id in manual_renames:
        return manual_renames[mirna_id]

    # Each variant is tried on the original spelling first, then on the
    # lower-case "mir" spelling
    spellings = (mirna_id, mirna_id.replace("miR", "mir"))

    # Variants in priority order; earlier entries win
    variants = (
        lambda name: name,                                      # ID works as is
        lambda name: f"{name}a",                                # "a" at the end
        lambda name: f"{name}-1",                               # "-1" at the end
        lambda name: f"{name}a-1",                              # "a-1" at the end
        lambda name: re.sub(r"-(\d+)-", r"-\1a-", name),        # "a" after the number
        lambda name: re.sub(r"-(\d+)-", r"-\1-1-", name),       # "-1" after the number
    )

    for variant in variants:
        for spelling in spellings:
            candidate = variant(spelling)
            # Membership test instead of try/except: nothing is swallowed
            # and no entry is created in defaultdict-like containers
            if candidate in name2seq:
                return candidate

    return None

In [6]:
# Keep True if experimenting with ID conversions
# If False, removes all None name : seq values. This is to remove miRNAs that no longer exist in miRBase.
# If True, keeps all None values and displays which IDs need to be further modified to return a sequence
testing = False

# Longest miRNA sequence kept by the filter below
MAX_MIRNA_LENGTH = 100

# Resolve each listed miRNA name to its sequence; names that cannot be resolved map to None
tat_interactions = {name : name2seq.get(process_id(name, name2seq)) for name in interactions['Interaction']}
tat_non_interactions = {name : name2seq.get(process_id(name, name2seq)) for name in interactions['NonInteraction']}


print(f"Original Interacting Tat-miRNA count: {len(tat_interactions)}")


if not testing:
    # Filter out long miRNAs and non-existent miRNAs
    tat_interactions = {
        key : value for key, value in tat_interactions.items()
        if value and len(value) <= MAX_MIRNA_LENGTH
    }
    tat_non_interactions = {
        key : value for key, value in tat_non_interactions.items()
        if value and len(value) <= MAX_MIRNA_LENGTH
    }

print(f"Filtered for existing and <= {MAX_MIRNA_LENGTH} Interacting Tat-miRNA count: {len(tat_interactions)}")
# This line reports the non-interacting set and is labelled accordingly
print(f"Filtered for existing and <= {MAX_MIRNA_LENGTH} Non-Interacting Tat-miRNA count: {len(tat_non_interactions)}")


if testing:
    # List the interacting names that still fail to resolve
    for key, value in tat_interactions.items():
        if not value:
            print(f"{key}: {value}")

Original Interacting Tat-miRNA count: 68
Filtered for existing and <= 100 Interacting Tat-miRNA count: 57
Filtered for existing and <= 100 Interacting Tat-miRNA count: 284


In [7]:
# Inspect the non-interacting miRNA name -> sequence mapping.
# NOTE(review): this displays the whole dict; consider previewing only a few entries.
tat_non_interactions

{'hsa-let-7a': 'UGGGAUGAGGUAGUAGGUUGUAUAGUUUUAGGGUCACACCCACCACUGGGAGAUAACUAUACAAUCUACUGUCUUUCCUA',
 'hsa-miR-326': 'CCUCUGGGCCCUUCCUCCAG',
 'hsa-let-7b': 'CGGGGUGAGGUAGUAGGUUGUGUGGUUUCAGGGCAGUGAUGUUGCCCCUCGGAAGAUAACUAUACAACCUACUGCCUUCCCUG',
 'hsa-let-7c': 'GCAUCCGGGUUGAGGUAGUAGGUUGUAUGGUUUAGAGUUACACCCUGGGAGUUAACUGUACAACCUUCUAGCUUUCCUUGGAGC',
 'hsa-let-7d': 'CCUAGGAAGAGGUAGUAGGUUGCAUAGUUUUAGGGCAGGGAUUUUGCCCACAAGGAGGUAACUAUACGACCUGCUGCCUUUCUUAGG',
 'hsa-let-7e': 'CCCGGGCUGAGGUAGGAGGUUGUAUAGUUGAGGAGGACACCCAAGGAGAUCACUAUACGGCCUCCUAGCUUUCCCCAGG',
 'hsa-let-7f': 'UCAGAGUGAGGUAGUAGAUUGUAUAGUUGUGGGGUAGUGAUUUUACCCUGUUCAGGAGAUAACUAUACAAUCUAUUGCCUUCCCUGA',
 'hsa-miR-1': 'UGGGAAACAUACUUCUUUAUAUGCCCAUAUGGACCUGCUAAGCUAUGGAAUGUAAAGAAGUAUGUAUCUCA',
 'hsa-miR-9': 'CGGGGUUGGUUGUUAUCUUUGGUUAUCUAGCUGUAUGAGUGGUGUGGAGUCUUCAUAAAGCUAGAUAACCGAAAGUAAAAAUAACCCCA',
 'hsa-miR-15a': 'CCUUGGAGUAAAGUAGCAGCACAUAAUGGUUUGUGGAUUUUGAAAAGGUGCAGGCCAUAUUGUGCUGCCUCAAAAAUACAAGG',
 'hsa-miR-15b': 'UUGAGGCCUUAAAGUACUGUAGCAGCACAU

In [8]:
# Inspect the Tat ID -> space-separated sequence mapping.
# NOTE(review): this displays the whole dict; consider previewing only a few entries.
cares_tats

{'A0382-R00': 'M E P V D P R L E P W K H P G S Q P K T P C T N C Y C K K C C F H C Q V C F I T K G L G I S Y G R K K R R Q R R R P P Q G S Q T H Q V S L S K Q P A S Q A R K D P T G P K E S K K K V K R E T E A D P V D',
 'A0008-R01': 'M E P V D P S L E P W K H P G S Q P K T A C N T C Y C K R C C L H C Q Y C F T T K G L G I S Y G R K K R R Q R R R P A Q G S E T G Q V P L S K Q P T P Q R R G D P T G P K E S K K K V E R E T E T D P V H',
 'A0008-R00': 'M E P V D P R L E P W N H P G S Q P K T P C T P C Y C K R C C F H C Q V C F I T K G L G I S Y G R K K R R Q R R R S P P D S E T H Q A S L S K Q P A S Q R R E D P T G P K E S K K K V E R E T E T D P V H',
 'A0095-R01': 'M E P V D P R L E P W N H P G S Q P K T P C T P C Y C K R C C L H C Q V C F I T K G L G I S Y G R K K R R K R R R P P Q D S E A H Q E P L S K Q P T S Q P R G D P T G P E E S K K T V E R E T E T H P R D',
 'A0095-R00': 'M E P V D P R L E P W N H P G S Q P K T P C T P C Y C K R C C L H C Q S C F T R K G L G I S Y G R K K R R K R

In [9]:
def generate_all_combos(list1, list2, list_limit = None):
    """
    Build every (item1, item2) pairing of two lists.

    Pairs are ordered with ``list1`` varying slowest. When ``list_limit`` is
    given, only the first ``list_limit`` items of each list take part.

    ex)
    ---
    >>> x = [1,2,3]
    >>> y = ['x','y']
    >>> generate_all_combos(x,y)
    [(1, 'x'), (1, 'y'), (2, 'x'), (2, 'y'), (3, 'x'), (3, 'y')]
    """
    pairs = []
    for first in list1[:list_limit]:
        for second in list2[:list_limit]:
            pairs.append((first, second))
    return pairs

In [10]:
# Pair every Tat with every miRNA. The ID pairs and the sequence pairs are built
# from the same dicts in the same order, so position i in one list describes
# position i in the other.
cares_mirna_ids = generate_all_combos(list(cares_tats), list(tat_interactions))
cares_mirna_combos = generate_all_combos([*cares_tats.values()], [*tat_interactions.values()])

cares_mirna_non_interact_ids = generate_all_combos(list(cares_tats), list(tat_non_interactions))
cares_mirna_non_interact_combos = generate_all_combos([*cares_tats.values()], [*tat_non_interactions.values()])

In [11]:
# Split the (tat, mirna) pairs into two parallel tuples.
# Unpacking into two names raises ValueError if a combo list is empty.
interacting_tats, interacting_mirnas = zip(*cares_mirna_combos)
non_interacting_tats, non_interact_mirnas = zip(*cares_mirna_non_interact_combos)

In [12]:
import sys
sys.path.append("../../Chapter_3")

from transformers import AutoTokenizer, AutoModel
from Utils.negative_sampling import IterableProteinEmbedding
from Utils.encoders import SkipGramEmbedder
import torch

# Pretrained protein language model and its tokenizer (case is preserved)
model     = AutoModel.from_pretrained("Rostlab/prot_bert_bfd")
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert_bfd", do_lower_case = False)

# Use the GPU when one is present instead of requiring it
use_cuda = torch.cuda.is_available()

# NOTE(review): each Tat sequence occurs once per miRNA in these inputs, so the
# same sequence is embedded many times; embedding unique Tats once and
# repeating the result would be much cheaper - confirm before changing.
interacting_tat_embedder = IterableProteinEmbedding(interacting_tats, tokenizer, model, chunksize = 3, max_len = 110, cuda = use_cuda)
noninteracting_tat_embedder = IterableProteinEmbedding(non_interacting_tats, tokenizer, model, chunksize = 3, max_len = 110, cuda = use_cuda)

# Inference only: no gradients are needed while collecting the embeddings
with torch.no_grad():
    interacting_encoded_tat = torch.cat(list(interacting_tat_embedder))
    noninteracting_encoded_tat = torch.cat(list(noninteracting_tat_embedder))

  0%|          | 0/5624 [00:00<?, ?it/s]

  0%|          | 0/28022 [00:00<?, ?it/s]

In [13]:
# Embed the miRNA sequences with the project's skip-gram embedder.
# NOTE(review): the model path is absolute and machine-specific.
# NOTE(review): `reduce = True` presumably collapses per-k-mer vectors into one
# vector per sequence - confirm in Utils.encoders.
rna_embedder = SkipGramEmbedder("/data/Chapter_3/SeqEmbedders/GensimWord2Vec/RNA2Vec_1024_hidden.model", reduce = True)

# The miRNA tuples are parallel to the Tat tuples embedded earlier
interacting_encoded_mirnas = rna_embedder(interacting_mirnas)
noninteracting_encoded_mirnas = rna_embedder(non_interact_mirnas)

interacting_encoded_mirnas

  kmer_matrix = torch.stack([torch.from_numpy(self.bio2vec.wv[kmer.lower()]) for kmer in kmers]).T


tensor([[-0.0798, -0.0949, -0.0973,  ..., -0.1659,  0.0539, -0.3381],
        [ 0.6910,  0.2231,  1.8914,  ..., -1.2562,  0.1066, -0.2320],
        [-0.1549, -0.1900, -0.3548,  ...,  0.1658,  0.1127, -0.0458],
        ...,
        [-0.0426, -0.1723,  0.0303,  ..., -0.2267, -0.3077, -0.2778],
        [ 0.0508, -0.3626,  0.1157,  ..., -0.0997, -0.0164,  0.2104],
        [-0.0652, -0.1539,  0.1250,  ..., -0.1297, -0.0473, -0.1937]])

In [14]:
# Bundle each dataset as [Tat embeddings, miRNA embeddings]; the Tat tensors
# are moved to the CPU before being stored.
cares_interacting_data = [interacting_encoded_tat.cpu(), interacting_encoded_mirnas]
cares_noninteracting_data = [noninteracting_encoded_tat.cpu(), noninteracting_encoded_mirnas]

cares_interacting_data

[tensor([[-0.0230, -0.0018, -0.0016,  ..., -0.0163,  0.0186, -0.0563],
         [-0.0230, -0.0018, -0.0016,  ..., -0.0163,  0.0186, -0.0563],
         [-0.0230, -0.0018, -0.0016,  ..., -0.0163,  0.0186, -0.0563],
         ...,
         [-0.0352,  0.0016,  0.0126,  ..., -0.0227,  0.0290, -0.0581],
         [-0.0352,  0.0016,  0.0126,  ..., -0.0227,  0.0290, -0.0581],
         [-0.0352,  0.0016,  0.0126,  ..., -0.0227,  0.0290, -0.0581]]),
 tensor([[-0.0798, -0.0949, -0.0973,  ..., -0.1659,  0.0539, -0.3381],
         [ 0.6910,  0.2231,  1.8914,  ..., -1.2562,  0.1066, -0.2320],
         [-0.1549, -0.1900, -0.3548,  ...,  0.1658,  0.1127, -0.0458],
         ...,
         [-0.0426, -0.1723,  0.0303,  ..., -0.2267, -0.3077, -0.2778],
         [ 0.0508, -0.3626,  0.1157,  ..., -0.0997, -0.0164,  0.2104],
         [-0.0652, -0.1539,  0.1250,  ..., -0.1297, -0.0473, -0.1937]])]

In [15]:
# NOTE(review): repeats the display at the end of the previous cell; this cell looks redundant.
cares_interacting_data

[tensor([[-0.0230, -0.0018, -0.0016,  ..., -0.0163,  0.0186, -0.0563],
         [-0.0230, -0.0018, -0.0016,  ..., -0.0163,  0.0186, -0.0563],
         [-0.0230, -0.0018, -0.0016,  ..., -0.0163,  0.0186, -0.0563],
         ...,
         [-0.0352,  0.0016,  0.0126,  ..., -0.0227,  0.0290, -0.0581],
         [-0.0352,  0.0016,  0.0126,  ..., -0.0227,  0.0290, -0.0581],
         [-0.0352,  0.0016,  0.0126,  ..., -0.0227,  0.0290, -0.0581]]),
 tensor([[-0.0798, -0.0949, -0.0973,  ..., -0.1659,  0.0539, -0.3381],
         [ 0.6910,  0.2231,  1.8914,  ..., -1.2562,  0.1066, -0.2320],
         [-0.1549, -0.1900, -0.3548,  ...,  0.1658,  0.1127, -0.0458],
         ...,
         [-0.0426, -0.1723,  0.0303,  ..., -0.2267, -0.3077, -0.2778],
         [ 0.0508, -0.3626,  0.1157,  ..., -0.0997, -0.0164,  0.2104],
         [-0.0652, -0.1539,  0.1250,  ..., -0.1297, -0.0473, -0.1937]])]

In [16]:
# Persist the encoded tensors and their matching ID pairs, one pickle per object
pickle_targets = (
    ("../Data/EncodedData/EncodedCARESPositivemiRNACombos.pickle", cares_interacting_data),
    ("../Data/EncodedData/CARESPositivemiRNAComboIDs.pickle", cares_mirna_ids),
    ("../Data/EncodedData/EncodedCARESNegativemiRNACombos.pickle", cares_noninteracting_data),
    ("../Data/EncodedData/CARESNegativemiRNAComboIDs.pickle", cares_mirna_non_interact_ids),
)

for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as outfile:
        pickle.dump(payload, outfile)