In [18]:
from collections import Counter
from src.dataset import load_benchmark_dataset, Species, Modification

In [19]:
dataset = load_benchmark_dataset(Species.human, Modification.psi)

In [20]:
positive_samples = dataset.samples[dataset.targets == 1]['sequence'].values
negative_samples = dataset.samples[dataset.targets == 0]['sequence'].values

In [29]:
def process_one_side(start, end, samples, proba, reverse=False):
    uniques = set()
    for sample in samples:
        uniques.add(sample[start: end])

    index = start - 1 if reverse else end

    if end == start:
        nucleotides = []
        for sample in samples:
            nucleotides.append(sample[index])
        proba[''] = Counter(nucleotides)
        return

    for item in uniques:
        nucleotides = []
        for sample in samples:
            if sample[start: end] == item:
                nucleotides.append(sample[index])
        proba[item] = Counter(nucleotides)


def prepare_probabilities(samples):
    if len(samples) == 0:
        return None

    r_prob = dict()
    l_prob = dict()
    mid = len(samples[0]) // 2

    for i in range(mid):
        l_limit = mid - i
        r_limit = mid + i + 1

        process_one_side(mid + 1, r_limit, samples, r_prob)
        process_one_side(l_limit, mid, samples, l_prob, True)

    return r_prob, l_prob

In [30]:
pos_prob = prepare_probabilities(positive_samples)
neg_prob = prepare_probabilities(negative_samples)

In [68]:
def _resolve_value(motif, nucleotide, proba):
    return proba[motif][nucleotide] if motif in proba else 0


def encode(sequence: str):
    mid = len(sequence) // 2
    pos_result = [0] * mid * 2
    neg_result = [0] * mid * 2

    for i in range(mid):
        l_limit = mid - i - 1
        r_limit = mid + i + 2

        l_seq = sequence[l_limit: mid][1:]
        r_seq = sequence[mid + 1: r_limit][:-1]

        pos_result[l_limit] = _resolve_value(l_seq, sequence[l_limit], pos_prob[0])
        pos_result[r_limit - 2] = _resolve_value(r_seq, sequence[r_limit -1], pos_prob[1])
        
        neg_result[l_limit] = _resolve_value(l_seq, sequence[l_limit], neg_prob[0])
        neg_result[r_limit - 2] = _resolve_value(r_seq, sequence[r_limit -1], neg_prob[1])

    return pos_result + neg_result


encode('CAUGGAGAGAUGUUCUUUACU')

[0,
 0,
 0,
 0,
 0,
 2,
 6,
 5,
 32,
 176,
 112,
 51,
 11,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 2,
 2,
 12,
 26,
 126,
 91,
 29,
 9,
 0,
 1,
 0,
 0,
 0,
 0,
 0]