In [None]:
from Bio import SeqIO
import numpy as np
from scipy.stats import entropy
import matplotlib.pylab as plt
from sklearn import metrics
from tqdm.notebook import trange
from joblib import Parallel, delayed

import json

In [None]:
# Read the alignment record by record, collecting for each sequence its id,
# its residues as a plain string, its distinct characters and its length.
names, seqs, lengths, uniques = [], [], [], []
for record in SeqIO.parse("data/combined-MSA.fasta", "fasta"):
    residues = str(record.seq)
    names.append(record.id)
    seqs.append(residues)
    uniques.append(list(set(residues)))
    lengths.append(len(record))

In [None]:
# Every sequence must have the same number of characters, otherwise NumPy
# cannot build a rectangular character matrix (depending on the NumPy version
# a ragged input either raises an opaque error or yields an object array that
# silently breaks the per-column statistics computed later).
seq_lengths = {len(word) for word in seqs}
if len(seq_lengths) > 1:
    raise ValueError(
        "Aligned sequences must all have the same length, got lengths {}".format(
            sorted(seq_lengths)
        )
    )

# One character per cell; after the transpose, `array[i]` is alignment column
# i across all sequences.
array = np.array([list(word) for word in seqs]).T

In [None]:
# For every alignment column, the relative frequency of each distinct
# character, ignoring the gap symbol "-".  Frequencies follow the sorted order
# of the characters as returned by `np.unique`.  A column made only of gaps
# produces an empty list.
probabilities = []
for column in array:
    # Local names are kept distinct from `uniques` (the per-sequence character
    # lists built while parsing) so that list is not overwritten here.
    residues, residue_counts = np.unique(column, return_counts=True)
    non_gap_counts = residue_counts[residues != "-"]
    total = non_gap_counts.sum()
    probabilities.append([count / total for count in non_gap_counts])

In [None]:
# Shannon entropy (base 2) of every alignment column.  A column with an empty
# distribution (nothing left once gaps are removed) is given entropy 0
# explicitly instead of relying on how `entropy` treats empty input.
entropies = np.array([entropy(prob, base=2) if len(prob) else 0.0 for prob in probabilities])

# Keep at most the 500 highest-entropy columns.  The count is clamped to the
# number of columns because `np.argpartition` raises when asked for a kth
# position outside the array.
n = min(500, entropies.size)

if n:
    idx = np.argpartition(entropies, -n)[-n:]
else:
    idx = np.arange(0)

# Selected column indices, ordered from highest to lowest entropy.
msk = idx[np.argsort((-entropies)[idx])]

# From here on `entropies` holds only the selected values, aligned with `msk`.
entropies = entropies[msk]

In [None]:
# Show each selected column index next to its entropy.
[(column, value) for column, value in zip(msk, entropies)]

In [None]:
# Bar chart of entropy against column index, written to barplot.pdf and then
# displayed.
ax = plt.gca()
ax.bar(msk, entropies)
ax.figure.savefig('barplot.pdf')
plt.show()

In [None]:
def mutualinfo_vect(array, length, i):
    """Return one row of the pairwise mutual-information matrix.

    Parameters
    ----------
    array : indexable
        `array[k]` is the label vector for position k.
    length : int
        Size of the returned vector and exclusive upper bound for the
        positions compared against position `i`.
    i : int
        Position whose row is computed.

    Returns
    -------
    numpy.ndarray
        Vector of `length` floats.  Entry j holds
        `metrics.mutual_info_score(array[i], array[j])` for every j in
        `range(i, length)`; all other entries stay 0.
    """
    row = np.zeros(length)
    # Only positions from i onwards are scored.
    for other in range(i, length):
        row[other] = metrics.mutual_info_score(array[i], array[other])
    return row


def check_symmetric(a, rtol=1e-05, atol=1e-08):
    """Tell whether `a` equals its own transpose within the given tolerances.

    `rtol` and `atol` are forwarded unchanged to `np.allclose`.
    """
    transposed = a.T
    return np.allclose(a, transposed, rtol=rtol, atol=atol)

In [None]:
length = array.shape[0]

# One task per alignment column, spread over 16 workers.  Each task returns a
# row that `mutualinfo_vect` fills only from its own index onwards.
row_jobs = (delayed(mutualinfo_vect)(array, length, i) for i in trange(length))
mutual_info = Parallel(n_jobs=16)(row_jobs)

# Stack the per-column rows into one 2-D matrix and persist it to disk.
mutual_info = np.stack(mutual_info)

np.save('data/mutual_info.npy', mutual_info)

In [None]:
# Reload the mutual-information matrix from the file it was saved to.
mutual_info = np.load(file='data/mutual_info.npy')

In [None]:
# Build the node ("checkpoints") and edge ("hops") lists of the jump graph.
# A row of `mutual_info` becomes a checkpoint when any of its entries exceeds
# `threshold`; every above-threshold entry (i, j) with i != j becomes a hop
# from the checkpoint of row i to the checkpoint of row j.
threshold = 0.6

# Thresholded matrix, computed once and shared by both passes so checkpoint
# selection and hop construction always use the same criterion.
above_threshold = mutual_info > threshold

# Pass 1: number the qualifying rows consecutively, starting at 1.  All
# numbers must exist before any hop is built, because a hop's target can be a
# row that appears later in the matrix.
checkpoints = []
checkpoints_rel = {}
for k, i in enumerate(np.flatnonzero(above_threshold.any(axis=1)), start=1):
    i = int(i)  # plain int keeps the record serialisable with `json`
    checkpoints.append({'checkpoint' : i, 'sequence' : k})
    checkpoints_rel[i] = k

# Pass 2: one hop per above-threshold off-diagonal entry, rows and columns in
# ascending order.
hops = []
for node in checkpoints:
    i = node['checkpoint']
    for j in np.flatnonzero(above_threshold[i]):
        j = int(j)
        if j == i:
            continue
        hops.append({"source" : node['sequence'],
                     # KeyError here means column j is not itself a checkpoint.
                     "target" : checkpoints_rel[j],
                     "value" : 10000,
                     "seriescount" : 1})
len(checkpoints)

In [None]:
# Bundle the checkpoint and hop lists and write them as JSON to
# d3jump/jp.json.
d = dict(checkpoints=checkpoints, hops=hops)

with open("d3jump/jp.json", "w") as outfile:
    json.dump(obj=d, fp=outfile)