In [3]:
import glob
import re

import pandas as pd
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
columns=["mode", "lr", "bs", "p", "n_iter", "d_mean", "d_q1", "d_median", "d_q3", "pearson", "p_mut"]
stats = []

for f in glob.glob("./out/*.csv"):
    random_pattern = r"\.\/out\/inference.random.(?P<n_iter>\d+).(?P<mode>(final)|(validation)).LR(?P<lr>.*)_BS(?P<bs>.*)_P(?P<p>.*).csv"
    base_pattern = r"\.\/out\/inference.(?P<mode>(final)|(validation)).LR(?P<lr>.*)_BS(?P<bs>.*)_P(?P<p>.*).csv"
    if attributes := re.match(random_pattern, f):
        pass
    elif attributes := re.match(base_pattern, f):
        attributes = attributes.groupdict()
        attributes["n_iter"] = 0
    else:
        break
    
    df = pd.read_csv(f)
    pearson =  sp.stats.pearsonr(df["perplexity_src"], df["perplexity_mut"]).statistic
    dq1, dq3 = (df["perplexity_src"] - df["perplexity_mut"]).quantile(0.25), (df["perplexity_src"] - df["perplexity_mut"]).quantile(0.75)
    dmean, dmedian = (df["perplexity_src"] - df["perplexity_mut"]).mean(), (df["perplexity_src"] - df["perplexity_mut"]).median()
    
    base_seq = pd.read_csv("./data/_nt_test.csv").loc[:, "seq"].to_list()
    mutated_seq = pd.read_csv(f)["sequences_mut"].to_list()
    p_mut = pd.Series([sum([base[i] != mut[i] for i in range(len(base))]) / len(base) for base, mut in zip(base_seq, mutated_seq) if len(base) == len(mut)]).mean()


    entry = dict(zip(columns, [attributes['mode'], attributes['lr'], attributes['bs'], attributes['p'], attributes['n_iter'], dmean, dq1, dmedian, dq3, pearson, p_mut]))
    stats.append(entry)



stats = {key: [i[key] for i in stats] for key in stats[0]}
stats = pd.DataFrame(stats)
stats = stats.astype({'p': 'float32', 'n_iter': 'int32'})
stats = stats.sort_values(by=["p", "mode", "lr", "bs", "n_iter"])
print(stats.to_latex(float_format="{:.2f}".format))

\begin{tabular}{llllrrrrrrrr}
\toprule
 & mode & lr & bs & p & n_iter & d_mean & d_q1 & d_median & d_q3 & pearson & p_mut \\
\midrule
75 & final & 0.0004 & 256 & 0.05 & 0 & 0.00 & 0.00 & 0.00 & 0.00 & 1.00 & 0.00 \\
68 & final & 0.0004 & 256 & 0.05 & 3 & 0.04 & 0.01 & 0.02 & 0.06 & 0.98 & 0.04 \\
6 & final & 0.0004 & 256 & 0.05 & 5 & 0.08 & 0.02 & 0.05 & 0.10 & 0.93 & 0.06 \\
16 & final & 0.0004 & 256 & 0.05 & 10 & 0.12 & 0.04 & 0.10 & 0.19 & 0.88 & 0.12 \\
32 & final & 0.0004 & 256 & 0.05 & 20 & 0.20 & 0.08 & 0.14 & 0.28 & 0.50 & 0.20 \\
23 & final & 0.0004 & 256 & 0.05 & 40 & 0.07 & 0.12 & 0.23 & 0.35 & 0.23 & 0.30 \\
8 & final & 0.0004 & 256 & 0.05 & 80 & 0.38 & 0.25 & 0.36 & 0.47 & 0.26 & 0.39 \\
4 & final & 0.0004 & 512 & 0.05 & 0 & 0.00 & 0.00 & 0.00 & 0.00 & 1.00 & 0.00 \\
78 & final & 0.0004 & 512 & 0.05 & 3 & 0.03 & 0.01 & 0.02 & 0.06 & 0.92 & 0.04 \\
37 & final & 0.0004 & 512 & 0.05 & 5 & 0.07 & 0.01 & 0.04 & 0.10 & 0.93 & 0.06 \\
70 & final & 0.0004 & 512 & 0.05 & 10 & 0.12 

In [130]:
# Count residue substitutions between the reference sequences and the mutated
# sequences of one inference output, then list the five residues most often
# replaced (top_base) and the five most often introduced (top_mut).
# NOTE(review): `f` is not defined in this cell; it is the loop variable left over
# from the statistics cell, i.e. whichever file glob returned last. Pin an explicit
# path here once the intended run is known.
base_seq = pd.read_csv("./data/_nt_test.csv").loc[:, "seq"].to_list()
mutated_seq = pd.read_csv(f)["sequences_mut"].to_list()

# One (original residue, new residue) pair per differing position; sequence pairs
# of unequal length are skipped.
edges = [
    (src, dst)
    for base, mut in zip(base_seq, mutated_seq)
    if len(base) == len(mut)
    for src, dst in zip(base, mut)
    if src != dst
]
_from, _to = {src for src, _ in edges}, {dst for _, dst in edges}
# Sorted so that row/column positions are the same on every run; iterating a set
# of strings gives an order that can change between kernel sessions.
nodes = sorted(_from | _to)
node_index = {node: i for i, node in enumerate(nodes)}

# adjacency_matrix[i, j] = number of times nodes[i] was replaced by nodes[j].
adjacency_matrix = np.zeros((len(nodes), len(nodes)), dtype=np.int64)
for src, dst in edges:
    adjacency_matrix[node_index[src], node_index[dst]] += 1

# Row sums count how often a residue was replaced, column sums how often it was
# introduced; take the five largest of each, largest first.
top_base = [nodes[i] for i in np.argsort(adjacency_matrix.sum(axis=1))[-5:]][::-1]
top_mut = [nodes[i] for i in np.argsort(adjacency_matrix.sum(axis=0))[-5:]][::-1]
top_base, top_mut

(['S', 'D', 'R', 'T', 'G'], ['L', 'A', 'R', 'E', 'G'])

In [136]:
# (row, column) positions of the 20 largest entries of adjacency_matrix, largest
# first. Flat indices are converted back with the number of rows, which matches
# the row length as long as the matrix is square.
flat_top = adjacency_matrix.flatten().argsort()[-20:]
n_rows = len(adjacency_matrix)
top_mut = [(flat_idx // n_rows, flat_idx % n_rows) for flat_idx in flat_top][::-1]

In [137]:
# Show each top-ranked index pair as (row label, column label, count).
[
    (nodes[row_idx], nodes[col_idx], adjacency_matrix[row_idx, col_idx])
    for row_idx, col_idx in top_mut
]

[('R', 'L', 12),
 ('G', 'L', 11),
 ('V', 'L', 10),
 ('D', 'L', 9),
 ('Q', 'L', 9),
 ('T', 'L', 9),
 ('F', 'L', 8),
 ('A', 'L', 8),
 ('V', 'A', 7),
 ('D', 'A', 7),
 ('P', 'L', 6),
 ('L', 'A', 6),
 ('S', 'E', 6),
 ('S', 'L', 6),
 ('E', 'L', 5),
 ('S', 'A', 5),
 ('E', 'G', 5),
 ('N', 'K', 5),
 ('T', 'E', 5),
 ('S', 'R', 5)]

In [92]:
# Look the count up by residue label instead of by hard-coded position: positions
# in `nodes` depend on how that list was ordered when the matrix was built.
# NOTE(review): the previous indices [14, 18] returned 12, which equals the R -> L
# count in the ranking output, so they presumably addressed that pair -- confirm.
adjacency_matrix[nodes.index('R'), nodes.index('L')]

12

In [9]:
# Emit the summary table as LaTeX without the index column, three decimals per float.
latex_table = stats.to_latex(index=False, float_format="{:.3f}".format)
print(latex_table)

\begin{tabular}{lllrrrrrrr}
\toprule
mode & lr & bs & p & n_iter & d_mean & d_q1 & d_median & d_q3 & pearson \\
\midrule
final & 0.0004 & 256 & 0.050 & 0 & 0.001 & 0.000 & 0.000 & 0.000 & 1.000 \\
final & 0.0004 & 256 & 0.050 & 3 & 0.039 & 0.008 & 0.023 & 0.062 & 0.979 \\
final & 0.0004 & 256 & 0.050 & 5 & 0.078 & 0.016 & 0.055 & 0.102 & 0.935 \\
final & 0.0004 & 256 & 0.050 & 10 & 0.125 & 0.039 & 0.102 & 0.188 & 0.876 \\
final & 0.0004 & 512 & 0.050 & 0 & 0.000 & 0.000 & 0.000 & 0.000 & 1.000 \\
final & 0.0004 & 512 & 0.050 & 3 & 0.030 & 0.006 & 0.023 & 0.057 & 0.916 \\
final & 0.0004 & 512 & 0.050 & 5 & 0.066 & 0.008 & 0.039 & 0.102 & 0.934 \\
final & 0.0004 & 512 & 0.050 & 10 & 0.123 & 0.039 & 0.082 & 0.188 & 0.873 \\
validation & 0.0004 & 128 & 0.050 & 0 & 0.003 & 0.000 & 0.000 & 0.000 & 0.996 \\
validation & 0.0004 & 128 & 0.050 & 3 & 0.039 & 0.008 & 0.023 & 0.062 & 0.955 \\
validation & 0.0004 & 128 & 0.050 & 5 & 0.064 & 0.016 & 0.043 & 0.102 & 0.895 \\
validation & 0.0004 & 128 