In [52]:
import json
import numpy as np
import plotly.express as px
from pandas import DataFrame
from collections import Counter
from sklearn.manifold import TSNE

from src.dataset import load_benchmark_dataset, Species, Modification, SeqBunch

In [70]:
def remove_central(sample):
    sample = sample[0]
    mid = len(sample) // 2

    return sample[:mid] + sample[mid + 1:]


def extract_sequences_from_df(df: DataFrame):
    return list(map(remove_central, df.values.tolist()))


def extract_data(bunch: SeqBunch):
    return extract_sequences_from_df(bunch.samples), bunch.targets.values.tolist()

In [71]:
h_train_samples, h_train_targets = extract_data(
    load_benchmark_dataset(Species.human, Modification.psi)
)

h_test_samples, h_test_targets = extract_data(
    load_benchmark_dataset(Species.human, Modification.psi, True)
)

In [72]:
def generate_probabilities(samples: list[str], targets):
    pos_samples = []
    neg_samples = []

    for i in range(len(samples)):
        for j in range(len(samples[i]) - 1):
            if targets[i] == 1:
                pos_samples.append(samples[i][:j + 1])
            else:
                neg_samples.append(samples[i][:j + 1])

    return Counter(pos_samples), Counter(neg_samples)


result = generate_probabilities(h_train_samples, h_train_targets)

In [73]:
pos_result = result[0]
neg_result = result[1]

In [74]:
results = dict()
only_pos_results = dict()
only_neg_results = dict()

pos_keys = list(pos_result.keys())
neg_keys = list(neg_result.keys())

for key in neg_keys:
    if key not in results:
        if pos_result[key] == 0 and neg_result[key] > 1:
            only_neg_results[key] = neg_result[key]

        if neg_result[key] == 0:
            only_pos_results[key] = pos_result[key]

        results[key] = (pos_result[key], neg_result[key])

print(only_pos_results)
print(only_neg_results)

{}
{'CAUA': 4, 'CAUAU': 2, 'AUUAU': 2, 'CCAAG': 2, 'CCAAGG': 2, 'UUCUC': 2, 'UCCCC': 2, 'CCCCCCU': 2, 'CCCCCCUU': 2, 'AAGU': 3, 'CACC': 3, 'GGGUC': 2, 'UUUCC': 2, 'CAACA': 3, 'AUCCU': 2, 'AUCCUA': 2, 'GAUG': 3, 'GAUGA': 2, 'GGGCAC': 2, 'CACUC': 3, 'GAUCA': 2, 'GAUCAA': 2, 'GAUCAAA': 2, 'GAUCAAAA': 2, 'CAUUC': 2, 'ACAGGA': 2, 'UCUA': 4, 'UCUAU': 2, 'UCUAUU': 2, 'AAAAGGG': 2, 'AAAAGGGA': 2, 'CCCCU': 2, 'CCCCUG': 2, 'CCCCUGG': 2, 'UUGAA': 2, 'UUCCG': 2, 'UUCCGG': 2, 'AAUGAA': 2, 'ACCAG': 2, 'ACCAGA': 2, 'UACA': 3, 'UGAAA': 2, 'GCUCA': 3, 'GCUCAG': 2, 'UUGGU': 2, 'UUACA': 2, 'UUACAG': 2, 'UUACAGG': 2, 'UCCCA': 2, 'UAAAA': 4, 'CACUCU': 2, 'UGGC': 2, 'UGGCA': 2, 'UAUU': 3, 'UAUUA': 2, 'AUGGU': 2, 'UUCCC': 2, 'AAUAC': 2, 'UUUUUUUA': 2, 'CAUAA': 2, 'AAAGA': 2, 'GUAUA': 2, 'ACUU': 2, 'ACUUG': 2, 'UGAG': 4, 'UGAGC': 2, 'UGUAAA': 2, 'CAGGC': 2, 'CAGGCA': 2, 'CAGGCAG': 2, 'CAGGCAGA': 2, 'CAGGCAGAC': 2, 'UCAA': 3, 'UCAAG': 2, 'UCUGU': 2, 'GACAG': 2, 'GACAGU': 2, 'GACAGUG': 2, 'AGAGG': 2, 'CACUG': 2

In [95]:
def positional_probabilities(samples: list[str], targets: list[int]):
    probabilities = []
    pos_nucleotides = []
    neg_nucleotides = []

    for i in range(len(samples)):
        for j in range(len(samples[i])):
            if targets[i] == 1:
                if len(pos_nucleotides) > j:
                    pos_nucleotides[j].append(samples[i][j])
                else:
                    pos_nucleotides.append([samples[i][j]])
            else:
                if len(neg_nucleotides) > j:
                    neg_nucleotides[j].append(samples[i][j])
                else:
                    neg_nucleotides.append([samples[i][j]])

    pos_nucleotides = list(map(lambda x: Counter(x), pos_nucleotides))
    neg_nucleotides = list(map(lambda x: Counter(x), neg_nucleotides))

    for i in range(len(pos_nucleotides)):
        pos_counter = pos_nucleotides[i]
        neg_counter = neg_nucleotides[i]

        pos_dict = {}
        neg_dict = {}
        for nuc in ['A', 'C', 'G', 'U']:
            count = pos_counter[nuc] + neg_counter[nuc]
            pos_dict[nuc] = pos_counter[nuc] / count
            neg_dict[nuc] = neg_counter[nuc] / count

        probabilities.append({'pos': pos_dict, 'neg': neg_dict})

    return probabilities

In [97]:
with open('1_positional_probabilities.json', 'w') as file:
    file.write(json.dumps(positional_probabilities(h_train_samples, h_train_targets)))

In [100]:
with open('1_ue_positional_probabilities.json', 'w') as file:
    file.write(json.dumps(positional_probabilities(h_train_samples + h_test_samples, h_train_targets + h_test_targets)))