In [61]:
from Bio.Align import PairwiseAligner
import pandas as pd
import numpy as np
import tensorflow as tf

In [82]:
class KinsfolkProfileSampler():
    """Fetch encoded peptide profiles for the targets whose markers best match a query.

    The sampler scores a query sequence against the ``Marker`` column of the
    targets table, keeps the best-scoring ``Target`` values, looks up their
    ``PepID`` entries in the key table and returns the corresponding rows of
    the peptide array, one-hot encoded and flattened.

    Attributes:
        aligner: Biopython ``PairwiseAligner`` used with its default settings.
        targets: DataFrame read from ``path_to_targets``; must provide the
            columns ``Marker`` and ``Target``.
        keys: DataFrame read from ``path_to_keys``; must provide the columns
            ``Target`` and ``PepID``.
        peptides: array stored under the key ``data`` in the ``.npz`` archive.
    """

    # Number of classes used when one-hot encoding the peptide codes.
    ONE_HOT_DEPTH = 43
    # Per-peptide length assumed by the flattening step in ``findpep``.
    # NOTE(review): presumably the second dimension of ``peptides`` — confirm.
    PEPTIDE_LENGTH = 194

    def __init__(self, path_to_targets, path_to_keys, path_to_pep):
        """Load the lookup tables and the peptide array.

        Args:
            path_to_targets: CSV file with the ``Marker`` and ``Target`` columns.
            path_to_keys: CSV file with the ``Target`` and ``PepID`` columns.
            path_to_pep: ``.npz`` archive containing an array named ``data``.
        """
        self.aligner = PairwiseAligner()
        self.targets = pd.read_csv(path_to_targets)
        self.keys = pd.read_csv(path_to_keys)
        # NpzFile keeps a file handle open; the context manager closes it once
        # the array has been read into memory.
        with np.load(path_to_pep) as archive:
            self.peptides = archive['data']

    def align(self, query):
        """Score ``query`` against every marker in the targets table.

        Args:
            query: sequence to compare with each ``Marker`` entry.

        Returns:
            A Series of alignment scores sharing the index of ``self.targets``.
        """
        # ``score`` yields the optimal alignment score directly, so no
        # alignment object has to be constructed for each marker.
        return self.targets['Marker'].apply(
            lambda marker: self.aligner.score(marker, query)
        )

    def findkins(self, query, top_k=10):
        """Return the targets whose markers score highest against ``query``.

        Args:
            query: sequence to compare with each ``Marker`` entry.
            top_k: how many best-scoring rows to keep (default 10).

        Returns:
            A Series with the ``Target`` values of the selected rows, ordered
            from highest to lowest score.
        """
        scores = self.align(query)
        best_rows = scores.nlargest(top_k).index
        # The index labels come from ``self.targets``, so select by label.
        return self.targets["Target"].loc[best_rows]

    def findpep(self, query):
        """Return flattened one-hot peptide encodings for the kin of ``query``.

        Args:
            query: sequence to compare with each ``Marker`` entry.

        Returns:
            A 2-D NumPy array with ``ONE_HOT_DEPTH * PEPTIDE_LENGTH`` columns.
        """
        # ``kins`` must keep this name: the query string below refers to it.
        kins = self.findkins(query)
        pepids = self.keys.query("Target in @kins")['PepID']
        # A PepID looks like '<prefix>pep.<n>'; <n> is the row in ``peptides``.
        # The integer dtype is forced so that an empty match still produces a
        # valid (empty) index array.
        pepindices = (
            pepids.map(lambda pep_id: int(pep_id.split('pep.')[1]))
            .drop_duplicates()
            .to_numpy(dtype=int)
        )
        pep = self.peptides[pepindices]
        pep = tf.one_hot(pep, depth=self.ONE_HOT_DEPTH).numpy()
        return pep.reshape(-1, self.ONE_HOT_DEPTH * self.PEPTIDE_LENGTH)
        

In [3]:
# Input files for KinsfolkProfileSampler (relative paths).
# Targets table (CSV): the sampler reads its 'Marker' and 'Target' columns.
path_to_targets = 'data/processed_data/Baseline/targets.csv'
# Key table (CSV): the sampler reads its 'Target' and 'PepID' columns.
path_to_keys = 'data/processed_data/Baseline/keys.csv'
# NumPy .npz archive: the sampler reads the array stored under the key 'data'.
path_to_pep = "data/processed_data/Baseline/enc_aln_pep.npz"

In [83]:
# Build the sampler; this reads both CSV tables and the peptide archive from disk.
kps = KinsfolkProfileSampler(
    path_to_targets=path_to_targets,
    path_to_keys=path_to_keys,
    path_to_pep=path_to_pep,
)

In [97]:
# Query marker sequence (A/C/G/U alphabet) to score against every target marker.
query_sequence = 'AGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCGGCGUGCCUAAUACAUGCAAGUCGAACGAAGCAUCUUCGGAUGCUUAGUGGCGAACGGGUGAGUAACACGUAGAUAACCUACCUUUAACUCGAGGAUAACUCCGGGAAACUGGAGCUAAUACUGGAUAGGAUGUGUGCAUGAAAAAAACACAUUUAAAGAUUUAUCGGUUUAAGAGGGGUCUGCGGCGCAUUAGUUAGUUGGUGGGGUAAAAGCCUACCAAGACGAUGAUGCGUAGCCGGACUGAGAGGUCUACCGGCCACAUUGGGACUGAGAACGGCCCAAACUCCUACGGGAGGCAGCAGUAGGGAAUUUUCGGCAAUGGGGGAAACCCUGACCGAGCAACGCCGCGUGAACGACGAAGUACUUCGGUAUGUAAAGUUCUUUUAUAUGGGAAGAAAAAUUAAAAAUUGACGGUACCAUAUGAAUAAGCCCCGGCUAACUAUGUGCCAGCAGCCGCGGUAAUACAUAGGGGGCGAGCGUUAUCCGGAUUUACUGGGCGUAAAGGGUGCGUAGGUGGUUAUAAAAGUUUGUGGUGUAAGUGCAGUGCUUAACGCUGUGAGGCUAUGAAAACUAUAUAACUAGAGUGAGACAGAGGCAAGUGGAAUUCCAUGUGUAGCGGUAAAAUGCGUAAAUAUAUGGAGGAACACCAGUGGCGAAGGCGGCUUGCUGGGUCUAUACUGACACUGAUGCACGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCACGCCGUAAACGAUGAGAACUAAGUGUUGGCCAAAAGGUCAGUGCUGCAGUUAACGCAUUAAGUUCUCCGCCUGAGUAGUACGUACGCAAGUAUGAAACUCAAAGGAAUUGACGGGACCCCGCACAAGCGGUGGAUCAUGUUGUUUAAUUCGAAGAUACACGAAAAACCUUACCAGGUCUUGACAUACUCUGCAAAGGCUUAGAAAUAAGUUCGGAGGCUAACAGAUGUACAGGUGGUGCACGGUUGUCGUCAGCUCGUGUCGUGAGAUGUUGGGUUAAGUCCCGCAACGAGCGCAACCCUUAUUGCUAGUUACCAUCAUUAAGUUGGGGACUCUAGCGAGACUGCCAGUGAUAAAUUGGAGGAAGGUGGGGAUGACGUCAAAUCAUCAUGCCCCUUAUGACCUGGGCUACAAACGUGAUACAAUGGCUGGAACAAAGAGAAGCGAUAGGGUGACCUGGAGCGAAACUCACAAAAACAGUCUCAGUUCGGAUUGGAGUCUGCAACUCGACUCCAUGAAGUCGGAAUCGCUAGUAAUCGCAAAUCAGCAUGUUGCGGUGAAUACGUUCUCGGGGUUUGUACACACCGCCCGUCAAACCACGAAAGUGGGCAAUACCCAACGCCGGUGGCCUAACCCGAAAGGGAGGGAGCCGUCUAAGGUAGGGUCCAUGAUUGGGGUUAAGUCGUAACAAGGUAUCCCUACGGGAACGUGGGGAUGGAUCACCU'
# NOTE: despite its name, `pep` holds the per-marker alignment scores returned by `align`.
pep = kps.align(query_sequence)


In [96]:
# NOTE(review): the output recorded below is a float32 array, but in file order `pep` is the
# Series returned by `kps.align(...)`, whose sum would be a single number. The execution counts
# (this cell is In [96], the assignment above is In [97]) suggest the output dates from an
# earlier run in which `pep` presumably came from `kps.findpep(...)` — confirm and re-run.
pep.sum(axis=0)

array([ 0., 65.,  0., ...,  0.,  0.,  0.], dtype=float32)

In [108]:
# The ten largest values in `pep`, each divided by their total, so the result sums to 1.
pep.nlargest(10).pipe(lambda top_scores: top_scores / top_scores.sum())

0      0.114946
244    0.103621
255    0.098382
3      0.097997
118    0.097997
2      0.097766
109    0.097535
108    0.097304
186    0.097304
78     0.097149
Name: Marker, dtype: float64