## Reconstruction with Felsenstein

In [None]:
using Revise, PhyloTools, JLD2, PyPlot, Statistics

# Inputs: a leaf-sequence FASTA file and the tree file it is paired with.
FileSequences = "../data_Anc/3_seq_DBD_collapsed_noonlychild_prunedsubtree301/seq3/seq3_mu14.96.fa";
FileTree = "../data_Anc/DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301.nwk";

# Fourth argument of both reconstruction calls (presumably the mutation rate — confirm).
# A previously used value was m = 3.3255125659900124.
m = 1000.0;

# Run both implementations on identical inputs so their outputs can be compared.
res, p = Felsenstein(FileSequences, FileTree, FileSequences, m);
res2, p2, like2 = Felsenstein2(FileSequences, FileTree, FileSequences, m);

# Normalise every row of `p` to sum to one before comparing it with `p2`.
new_p = p ./ sum(p; dims = 2);
cor(new_p[:], p2[:])
extrema(new_p .- p2)


# Empirical site frequencies of the alignment stored in FileSequences.
_, NatMSA, _, _, q = read_fasta(FileSequences, 1.0, 0.2, false);
StationaryProb = compute_empirical_freqs(Int.(NatMSA), q; eps = 10^-5);

L, M = size(NatMSA);

# Re-run Felsenstein2, this time keeping its third output (`like`).
res, p, like = Felsenstein2(FileSequences, FileTree, FileSequences, m);

# Per-site quantities compared below:
#   exact_like[i] = -M * sum_a f_i(a) * log(f_i(a))   (from the empirical frequencies)
#   inf_like[i]   = -sum(log(like[i, :]))             (from the Felsenstein2 output)
exact_like = zeros(L);
inf_like = zeros(L);
for i in 1:L
    site_freqs = StationaryProb[i, :]
    exact_like[i] -= M * sum(site_freqs .* log.(site_freqs))
    inf_like[i] -= sum(log.(like[i, :]))
end
cor(exact_like, inf_like)

# Scatter the two quantities against each other and write the figure to disk.
close("all")
plt.scatter(exact_like, inf_like)
plt.xlabel("Exact neg-loglike")
plt.ylabel("Inferred neg-loglike")
savefig("../ciao.png")



# Total squared difference between the arrays currently bound to `p` and `p2`.
sum(abs2, p .- p2)


# Hard-coded reference vectors, compared below against rows 19 and 20 of `p2`.
# NOTE(review): the origin of these numbers is not visible in this notebook — confirm
# which run/tool produced them before relying on the comparison.
p_19 = [6.635051012487593e-8, 2.2109709355598605e-5, 4.4153068201072376e-5, 6.635051012487593e-8, 6.635051012487593e-8, 6.635051012487593e-8, 6.635051012487593e-8, 6.635051012487593e-8, 2.271068139300597e-5, 6.6350510124876e-8, 6.6350510124876e-8, 0.9997775377037664, 6.635051012487593e-8, 6.635051012487593e-8, 6.635051012487593e-8, 0.0001103838702765608, 6.635051012487593e-8, 6.635051012487593e-8, 6.635051012487586e-8, 2.2109709355598666e-5, 6.635051012487586e-8];
p_20 = [0.99858468853303, 4.34007837480259e-8, 4.34007837480259e-8, 4.34007837480259e-8, 4.34007837480259e-8, 0.0002185170595525506, 4.34007837480259e-8, 4.34007837480259e-8, 2.8904698285800303e-5, 4.34007837480259e-8, 4.34007837480259e-8, 4.34007837480259e-8, 7.586599564250211e-5, 4.34007837480259e-8, 4.34007837480259e-8, 0.001047368187868486, 4.400451386466602e-5, 4.34007837480259e-8, 4.3400783748025813e-8, 4.3400783748025813e-8, 4.3400783748025813e-8];

# Squared error of each reference vector against the matching row of `p2`.
sum(abs2, p_19 .- p2[19,:])
sum(abs2, p_20 .- p2[20,:])

# Hard-coded reference log-likelihood values.
# NOTE(review): the origin of these numbers is not visible in this notebook — confirm.
true_log_like = [-0.013929200131569713, -248.04716950711918, -102.98192258838772, -0.013929200131569713, -202.80255601721754, -359.27418520844395, -308.72272092312306, -41.313623094215345, -260.60977207365954, -244.5433433151459, -198.93342921518519, -7.498002960671273, -18.891533154419157, -226.21859356411932, -92.24917442601917, -203.7495169264531, -121.84171606822503, -0.013929200131569713, -43.79321578381407, -110.11317579674657, -0.013929200131569713, -7.57173018812454, -13.325761627149102, -7.6336274748259, -18.255457479494062, -44.23203015347086, -14.222837537638553, -150.56118355071274, -258.37976424719676, -267.0060991073859, -371.75336741693707, -377.3314214602087, -284.5101557613972, -394.8900492929836, -94.40294715312757, -375.1135073494219, -0.013929200131569713, -214.06252065794203, -242.1684093819262, -463.7529659652144, -256.9823512340833, -322.1915339757689, -6.398688357544629, -249.7848098071, -366.37671650982566, -333.9427692343579, -377.4153472381578, -405.5005798127121, -365.8194858136205, -488.97103064141356, -314.6875273781831, -108.57334395871233, -0.013929200131569713, -106.03648322071103, -207.5723551251978, -0.013929200131569713, -37.90905384054155, -120.55602380376301, -275.21999974455616, -153.3637805173926, -22.505457994436902, -203.1792611673148, -241.35751463950163, -311.98512636802803, -29.418482806278174, -0.013929200131551476, -359.83348979876075, -28.96397299753315, -6.748935789447794, -0.013929200131521529, -0.013929200131521529, -0.013929200131521529, -0.013929200131521529, -0.013929200131521529, -0.013929200131521529, -0.013929200131521529];

# Squared error between the log of the Felsenstein2 likelihood output and the reference.
# The original line referenced `like_2` and `true_like`, neither of which is defined in
# this notebook; the bound names are `like2` and `true_log_like`. The reference holds
# log-likelihoods, so `like2` is log-transformed first (the same transform the
# "test likelihood" cell applies to `like`).
# NOTE(review): assumes `like2` has one entry per reference value — confirm.
sum(abs2, log.(like2) .- true_log_like)


using BenchmarkTools

# Time both implementations on the same inputs; `$` interpolation keeps the cost of
# global-variable lookup out of the measurement.
@btime Felsenstein($FileSequences, $FileTree, $FileSequences, $m)
@btime Felsenstein2($FileSequences, $FileTree, $FileSequences, $m)



In [None]:
# test likelihood

using Revise, PhyloTools, JLD2, PyPlot, Statistics


# Inputs for the spot check.
FileTree = "../DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301.nwk";
FileSequences = "../seq1_mu42.88.fa";
m = 10.0;

# Reference values noted for the inspected sites (presumably expected log-likelihoods
# from an independent computation — confirm):
#   site  3:  -14.76
#   site 19: -322.91
#   site 48: -625.02
#   site 72: -533.13

res, p, like = Felsenstein2(FileSequences, FileTree, FileSequences, m);

# Sites inspected by hand.
check_sites = [3, 19, 48, 72];

log.(like[check_sites])

p[check_sites]

In [None]:
# Natural alignment, tree, and the rate argument used for sampling and reconstruction.
FileNat = "../data_Anc/DBD_cleaned_20241118_f";
FileTree = "../data_Anc/DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301.nwk";
m = 100.0;

# Loads the variables stored in the file (at least `start_msa`, used right below).
@load "../data_Anc/3_start_seq_and_sweeps4DBD_ASR.jld2"
start_seq = Int.(start_msa[:, 1]);

# Sample along the tree starting from `start_seq`, then dump the leaves to a FASTA file.
T = FelsensteinSampler(start_seq, FileNat, FileTree, m);

FileSequences = "../trial.fa"
leavestofasta(FileSequences, T)

# Run both implementations on the freshly written leaves.
res, p = Felsenstein(FileSequences, FileTree, FileSequences, m);
res2, p2, like2 = Felsenstein2(FileSequences, FileTree, FileSequences, m);

# Row-normalise `p` and compare it with `p2`.
new_p = p ./ sum(p; dims = 2);
cor(new_p[:], p2[:])
extrema(new_p .- p2)


# Empirical frequencies of the sampled leaves (f) and of the natural alignment (W).
_, LeafMSA, _, _, q = read_fasta(FileSequences, 1.0, 0.2, false);
f = compute_empirical_freqs(Int.(LeafMSA), q; eps = 10^-5);

_, NatMSA, _, _, q = read_fasta(FileNat, 1.0, 0.2, false);
W = compute_empirical_freqs(Int.(NatMSA), q; eps = 10^-5);

L, M = size(NatMSA);

# Per-site quantities compared below:
#   exact_like[i] = -M * sum_a f_i(a) * log(W_i(a))
#   inf_like[i]   = -sum(log(p[i, :]))
# NOTE(review): M is the column count of the natural alignment while f comes from the
# leaf alignment — confirm this is the intended count.
# NOTE(review): inf_like reads `p` (the Felsenstein output), whereas the analogous cell
# earlier in the notebook reads `like` from Felsenstein2 — confirm which is intended.
exact_like = zeros(L);
inf_like = zeros(L);
for i in 1:L
    exact_like[i] -= M * sum(f[i, :] .* log.(W[i, :]))
    inf_like[i] -= sum(log.(p[i, :]))
end
cor(exact_like, inf_like)

# Scatter the two quantities against each other and write the figure to disk.
close("all")
plt.scatter(exact_like, inf_like)
plt.xlabel("Exact neg-loglike")
plt.ylabel("Inferred neg-loglike")
savefig("../ciao.png")




In [None]:
# Labels of the two leaves being compared.
l1 = "UniRef90_A0A0L7QLK7/9-87";
l2 = "UniRef90_A0A1A9UXE3/268-342";

# Keep the positions where at least one of the two sequences differs from symbol 21
# (presumably the gap state — confirm).
mask = (data(T[l1]).seq .!= 21) .| (data(T[l2]).seq .!= 21)

# Hamming distance restricted to the masked positions, then over the full sequences.
ham_dist(data(T[l1]).seq[mask], data(T[l2]).seq[mask])
ham_dist(data(T[l1]).seq, data(T[l2]).seq)

## Reconstructing ancestor DBD

In [None]:
using Revise, PhyloTools, TreeTools, Statistics, JLD2, PyPlot

# Three leaf alignments (file names carry different mu values) and the shared tree.
FileSequences = [
    "../data_Anc/3_seq_DBD_collapsed_noonlychild_prunedsubtree301/seq3/seq3_mu2.99.fa",
    "../data_Anc/3_seq_DBD_collapsed_noonlychild_prunedsubtree301/seq3/seq3_mu14.96.fa",
    "../data_Anc/3_seq_DBD_collapsed_noonlychild_prunedsubtree301/seq3/seq3_mu2992.57.fa",
];
FileTree = "../data_Anc/DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301.nwk";

# Load the stored variables (`start_msa` and `step_msa` are used below; which file
# provides which is not visible here).
@load "../data_Anc/3_start_seq_and_sweeps4DBD_ASR.jld2"
@load "../data_Anc/3_DBD_collapsed_noonlychild_prunedsubtree301.jld2"
start_msa = Int.(start_msa);

L = size(start_msa, 1);

res_all = [];
p_all = [];
@time for file in FileSequences
    # Attach each leaf's sequence from the FASTA file to the matching tree node.
    Z = PhyloTools.read_fasta_dict(file)
    T = read_tree(FileTree, node_data_type = PhyloTools.Seqontree)
    for leaf in leaves(T)
        data(T[label(leaf)]).seq = Z[label(leaf)]
    end

    # Infer mu with and without the gap option and print both values.
    mu = infer_mu(FileTree, file, gap_option = true)
    println("Gap option true $(mu)")
    mu2 = infer_mu(FileTree, file, gap_option = false)
    println("Gap option false $(mu2)")

    # Reconstruction step, currently disabled:
    #mu = 3.3255125659900124
    #=res, p = Felsenstein(file, 
    FileTree, 
    file, 
    mu);
    push!(res_all, res)
    push!(p_all, p)=#
end

# NOTE(review): with the reconstruction step disabled, `res_all` stays empty, so both
# vectors below are empty and the scatter calls receive three x values but no y values.
# Re-enable the push! calls before using the plot.
H_asr = [ham_dist(start_msa[:, 3], res_all[i]) for i in eachindex(res_all)] ./ L;
H_evol = [mean(ham_dist(start_msa[:, 3], step_msa[i])) for i in eachindex(res_all)] ./ L;

sweeps = [1.0, 5.0, 1000.0];

# Hamming distance from the start sequence versus the sweep values, on a log x-axis.
close("all")
plt.scatter(sweeps, H_asr, label = "H_asr")
plt.scatter(sweeps, H_evol, label = "H_evol")
plt.xscale("log")
plt.xlabel("Average root-leaf sweeps")
plt.ylabel("Hamming distance")
plt.legend()
savefig("../prova_asr.png")




## Testing the inference of mu

In [None]:
using Revise, PhyloTools, JLD2, PyPlot, TreeTools, Statistics

# Tree and natural alignment handed to the sampler.
FileTree = "../data_Anc/DBD_tree_clean_short_300_rooted_midpoint";
FileNat = "../data_Anc/DBD_cleaned_20241118_f";

@load "../data_Anc/3_start_seq_and_sweeps4DBD_ASR.jld2"
start_seq = Int.(start_msa[:, 1]);

# Values of mu used for sampling, and the values inferred back from each sampled tree.
mus = [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000];
inf_mus = [];
for mu in mus
    @time T = FelsensteinSampler(start_seq, FileNat, FileTree, mu);
    leaf_msa = seqs_from_leaves(T)
    push!(inf_mus, infer_mu(T))
end

cor(mus, inf_mus)

# Sampling mu versus inferred mu on log-log axes.
close("all")
plt.scatter(mus, inf_mus)
plt.xscale("log")
plt.yscale("log")
savefig("../trial.png")


### Repeat the test with the Potts model


using Revise, Genie, JLD2, PyPlot, TreeTools, Statistics

FileTree = "../data_Anc/DBD_tree_clean_short_300_rooted_midpoint";

# Parameters loaded from file, bound to the names passed to the evolution routine.
@load "../data_Genie/pars_dbd.jld2"
h = h_dbd;
J = J_dbd;

@load "../data_Anc/3_start_seq_and_sweeps4DBD_ASR.jld2"
start_seq = Int.(start_msa[:, 1]);

# Values of mu used for the evolution, and the values inferred back from each tree.
mus = Float64.([0.1, 0.2, 0.5, 1, 2, 5, 10, 20]);
inf_mus_p = [];
for mu in mus
    @time T = run_evolution_ontree(start_seq, FileTree, h, J, mu = mu, p = 0.5);
    leaf_msa = msa_from_leafs(T)
    push!(inf_mus_p, Genie.infer_felse_mu(T))
end

cor(mus, inf_mus_p)

# Evolution mu versus inferred mu on log-log axes.
close("all")
plt.scatter(mus, inf_mus_p)
plt.xscale("log")
plt.yscale("log")
savefig("../trial_p.png")


## Testing the inference of stationary probability

In [None]:
using Revise, PhyloTools, JLD2, PyPlot

FileNat = "../data_Anc/DBD_cleaned_20241118_f";
FileTree = "../data_Anc/DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301.nwk";
mu = 10.0;

@load "../data_Anc/3_start_seq_and_sweeps4DBD_ASR.jld2"
start_seq = Int.(start_msa[:, 1]);

# Disabled alternative: sample leaves with FelsensteinSampler and write them to
# "../trial.fa" instead of reading the pre-computed file below.
#=
T = FelsensteinSampler(start_seq, FileNat, FileTree, mu);

FileSequences = "../trial.fa" 
leavestofasta(FileSequences, T)
=#

FileSequences = "../seq3_mu14.96.fa"

# Reconstruction on the leaf file.
res2, p2, like2 = Felsenstein2(FileSequences, FileTree, FileSequences, mu);

# Timed inference run on the same inputs, with each_step = 1.
@time W = run_inference(FileSequences, FileTree, FileSequences, mu; each_step = 1);



In [None]:
# Infer W from one leaf alignment, then reconstruct the root with W passed as the
# stationary distribution.
FileNat =
"../data_Anc/DBD_cleaned_20241118_f"; 
FileTree = 
"../data_Anc/DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301.nwk";  mu = 6.879519513828341;

@load "../data_Anc/3_start_seq_and_sweeps4DBD_ASR.jld2"


FileSequences = "../data_Anc/OFF_3_seq_DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301/seq1/seq1_mu42.88.fa"


# Timed inference; the result is used as the stationary probabilities below.
@time W = run_inference(FileSequences, FileTree, FileSequences, mu);


q = 21;
StationaryProb = W;
LeavesSequences = PhyloTools.read_fasta_dict(FileSequences)
MyTree = read_tree(FileTree, node_data_type = PhyloTools.ProbabilityOnTree)
# Reconstruct at the root label. The rate passed here is `mu`, the value defined in this
# cell and used for the inference above; the original passed `m`, which this cell never
# defines, so it was either undefined or a stale value left over from another cell.
AncestorSequence, AncestorProbability, AncLike = Felsenstein2(
    LeavesSequences, StationaryProb, MyTree, label(root(MyTree)), mu);
AncestorSequence




In [None]:
FileNat = "../data_Anc/DBD_cleaned_20241118_f";
FileTree = "../data_Anc/DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301.nwk";

@load "../data_Anc/3_start_seq_and_sweeps4DBD_ASR.jld2"
start_seq = Int.(start_msa[:, 1]);

# Empirical frequencies of the natural alignment; used as the comparison target below.
_, NatMSA, _, _, q = PhyloTools.read_fasta(FileNat, 1.0, 0.2, false)
L, M = size(NatMSA)
StationaryProb = compute_empirical_freqs(Int.(NatMSA), q)

mus = [1.0, 2.0, 5.0, 10.0, 20.0, 50.0];

# For every mu: sample leaves, write them out, and correlate both the raw leaf
# frequencies and the inferred W with StationaryProb.
cor_sample_inf = [];
cor_sample_leaves = [];
for mu in mus
    T = FelsensteinSampler(start_seq, FileNat, FileTree, mu);
    FileSequences = "../trial_mu$(mu).fa"
    leavestofasta(FileSequences, T)
    _, LeavesMSA, _, _, q = PhyloTools.read_fasta(FileSequences, 1.0, 0.2, false)

    LeavesProb = compute_empirical_freqs(Int.(LeavesMSA), q)
    push!(cor_sample_leaves, cor(LeavesProb[:], StationaryProb[:]))

    # W is transposed before flattening so it lines up with StationaryProb.
    @time W = run_inference(FileNat, FileTree, FileSequences, mu);
    push!(cor_sample_inf, cor(W'[:], StationaryProb[:]))
end

# Both correlation curves as a function of mu.
close("all")
plt.plot(mus, cor_sample_inf, label = "Ground truth vs Inferred", color = "blue")
plt.plot(mus, cor_sample_leaves, label = "Ground truth vs Leaves", color = "orange")
plt.xlabel("Mu")
plt.ylabel("Pearson correlation")
plt.legend()
savefig("../probability_inference_results.png")





In [None]:
# `readdlm` lives in DelimitedFiles, which the notebook only loads in a later cell;
# import it here so this cell does not depend on execution order.
using DelimitedFiles

#FileNat = "../data_Anc/DBD_cleaned_20241118_f"; 

FileNat = "../Gen.jl/data/alignments/natural/DBD_alignment.uniref90.cov80.a2m"; 

FileTree = 
"../data_Anc/DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301.nwk"; 

# Directory holding mus.txt and the per-mu leaf alignments. The original cell used
# `folder` without defining it (it is only assigned in a later cell); it is defined here
# with the same directory that the mus.txt path below already pointed to.
folder = "../data_Anc/OFF_3_seq_DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301"

@load "../data_Anc/3_start_seq_and_sweeps4DBD_ASR.jld2"
start_seq = Int.(start_msa[:,1]);

# Empirical frequencies of the natural alignment; used as the comparison target below.
_, NatMSA, _, _, q = PhyloTools.read_fasta(FileNat,1.0,0.2,false)
L,M = size(NatMSA)  
StationaryProb = compute_empirical_freqs(Int.(NatMSA), q)

# Values of mu, flattened to a vector.
mus = readdlm(joinpath(folder, "mus.txt"))[:];


# For every mu: read the matching leaf alignment and correlate both the raw leaf
# frequencies and the inferred W with StationaryProb.
cor_sample_inf = [];cor_sample_leaves = [];for mu in mus
    
    # File names embed mu rounded to two digits.
    FileSequences = joinpath(folder, "seq1/seq1_mu$(round(mu, 
        digits = 2)).fa")
    _, LeavesMSA, _, _, q = PhyloTools.read_fasta(FileSequences,1.0,0.2,false)
    
    LeavesProb = compute_empirical_freqs(Int.(LeavesMSA), q)
    push!(cor_sample_leaves, cor(LeavesProb[:], StationaryProb[:]))
    
    # W is transposed before flattening so it lines up with StationaryProb.
    @time W = run_inference(FileNat, FileTree, FileSequences, mu);
    push!(cor_sample_inf, cor(W'[:], StationaryProb[:]))
end


# Both correlation series versus mu, on a log x-axis.
close("all"); plt.scatter(mus, cor_sample_inf, label = "Ground truth vs Inferred", color = "blue"
    ); plt.scatter(mus, cor_sample_leaves, label = "Ground truth vs Leaves", color = "orange"); plt.xlabel("Mu"
); plt.ylabel("Pearson correlation"); plt.xscale("log"); plt.legend(); savefig("../probability_inference_results_potts_nucleo.png")



In [None]:
# Scratch cell: two ways of writing a one-hot vector for site `s` into every leaf of T.
# NOTE(review): `@tasks` is not imported in any visible cell (presumably
# OhMyThreads.@tasks — confirm).
# NOTE(review): every iteration over `s` writes to the same leaves of the same tree, so
# if the iterations run concurrently they overwrite each other's data; after the loop a
# leaf holds the vector of whichever site was written last. Confirm this is only meant
# as a timing experiment.
T = read_tree(FileTree, node_data_type = PhyloTools.ProbabilityOnTree);
Z = PhyloTools.read_fasta_dict(FileSequences);
L = 76; q = 21;

# Variant 1: allocate a fresh length-q vector per leaf and replace the leaf's data.
@tasks for s in 1:L
        for leaf in leaves(T)
            ρ = zeros(Float64,q)
            ρ[Z[label(leaf)][s]] = 1.
            data!(leaf, PhyloTools.ProbabilityOnTree(prob = ρ))
        end
    end

# Variant 2: reuse the existing `prob` vector, zeroing it in place before setting the
# observed state to one.
@tasks for s in 1:L
            for leaf in leaves(T)
                data(T[label(leaf)]).prob .= 0.
                data(T[label(leaf)]).prob[Z[label(leaf)][s]] = 1.
            end
        end

# Reshuffling 

In [None]:
using Revise, PhyloTools, JLD2, PyPlot, DCAUtils, StatsBase

# Parameters (copied so later mutation cannot touch the loaded arrays) and the natural
# alignment.
@load "../data_Genie/pars_dbd.jld2"; h = copy(h_dbd); J = copy(J_dbd);
dbd_msa = read_fasta_alignment("../Gen.jl/data/alignments/natural/DBD_alignment.uniref90.cov80.a2m",0.9);

FileSequences = "../data_Anc/OFF_3_seq_DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301/seq1/seq1_mu42.88.fa"
;FileTree = "../data_Anc/DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301.nwk"
; m = 42.88;

res, p = Felsenstein(FileSequences, 
    FileTree, 
    FileSequences, 
    m);

# Draw 100 sequences from `p`. This line used to come after the frequency computation
# below, which read `msa` before it was assigned; sampling first makes the cell runnable
# from a fresh session.
msa = sampling_ANC(p, n_seq = 100);

# Site frequencies of the sampled alignment and 2^(entropy in bits) per site.
f = compute_empirical_freqs(msa, 21, eps = 10^-5);
eff_numb =2 .^ sum(-f .* log.(f) ./ log(2), dims = 2)[:];

# Two reshuffling variants on the same sampled alignment.
new_msa, en_evol, x = reshuffle(msa, h, J, shuffle_units=10, info = true);
@time new_msa2, en_evol2, x2 = PhyloTools.reshuffle_entr(msa, h, J, shuffle_units=5, info = true);

# Energy trace of the first variant on a log x-axis.
close("all"); plt.plot(x, en_evol, label = "reshuffling energy"); plt.xscale("log"); plt.xlabel("Shuffle units"
    ); plt.ylabel("Mean energy"); plt.legend(); savefig("../prova_reshuffling.png")


# Smoothed trace with a window of 50; the x-axis drops its last 49 points to match.
mean_en_evol = moving_average(en_evol, 50);

close("all"); plt.plot(x[1:end-49], mean_en_evol, label = "reshuffling energy"); plt.xscale("log"); plt.xlabel("Shuffle units"
    ); plt.ylabel("Mean energy"); plt.legend(); savefig("../prova_reshuffling_mean.png")

In [None]:
using Revise, PhyloTools, TreeTools, DCAUtils, JLD2, PyPlot, Statistics, DelimitedFiles


# Natural alignment and its sequence weights.
nat_msa = read_fasta_alignment(
    "../Gen.jl/data/alignments/natural/DBD_alignment.uniref90.cov80.a2m", 0.9
);
w = compute_weights(nat_msa, 22, 0.2)[1];

# Model parameters and the stored start sequences / sweeps.
@load "../data_Genie/pars_dbd.jld2"
h = h_dbd;
J = J_dbd;
@load "../data_Anc/3_start_seq_and_sweeps4DBD_ASR.jld2"

q = 21;
L = 76;

# Alternative trees tried previously:
#tree_file = "../data_Anc/DBD_tree_midpoint_rooted.nwk"; 
#tree_file = "../data_Anc/DBD_tree_clean_short_300_rooted_midpoint"
tree_file = "../data_Anc/DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301.nwk"
tree = read_tree(tree_file, node_data_type = Seq);

# Root-to-leaf distances and their mean.
dd = [distance(tree.root, leaf) for leaf in leaves(tree)];
branch_d = mean(dd);

# Selected sweep values divided by the mean root-to-leaf distance.
mus = sweeps[[1, 2, 3, 4, 6]] ./ branch_d;

folder = "../data_Anc/OFF_3_seq_DBDtree_collapsed_noonlychild_midpointrooted_prunedsubtree301"


# Result containers, indexed by (sequence set, mu index).
res_ASR = zeros(Int, 3, L, length(mus));
pp = Matrix{Matrix{Float64}}(undef, 3, length(mus));
rec_msas1 = Matrix{Matrix{Int}}(undef, 3, length(mus));
rec_msas2 = Matrix{Matrix{Int}}(undef, 3, length(mus));
rec_msas3 = Matrix{Matrix{Int}}(undef, 3, length(mus));

# One panel per (sequence set, mu).
close("all")
fig, axs = plt.subplots(3, length(mus), figsize = (20, 12))

for i in 1:3
    # Inferred mu values stored alongside sequence set i.
    inf_mus = readdlm(joinpath(folder, "seq$(i)/inferred_mus$(i).txt"))

    println(i)
    for n in eachindex(mus)
        # Leaf alignment for this mu (file names embed mu rounded to two digits).
        file_seqs = joinpath(folder, "seq$(i)/seq$(i)_mu$(round(mus[n], 
        digits = 2)).fa")

        # Reconstruct with the inferred mu and keep the sequence and the probabilities.
        res, p = Felsenstein2(file_seqs, tree_file, file_seqs, inf_mus[n])
        res_ASR[i, :, n] .= Int.(res)
        pp[i, n] = p

        # Draw samples of three different sizes from the probabilities.
        rec_msas1[i, n] = sampling_ANC(pp[i, n], n_seq = 100)
        rec_msas2[i, n] = sampling_ANC(pp[i, n], n_seq = 200)
        rec_msas3[i, n] = sampling_ANC(pp[i, n], n_seq = 500)

        # Reshuffle each sample with temp = 0.4, recording the energy traces.
        @time new_msa1, en_evol1, x1 = PhyloTools.reshuffle_entr(rec_msas1[i, n], h, J, info = true, temp = 0.4);
        @time new_msa2, en_evol2, x2 = PhyloTools.reshuffle_entr(rec_msas2[i, n], h, J, info = true, temp = 0.4);
        @time new_msa3, en_evol3, x3 = PhyloTools.reshuffle_entr(rec_msas3[i, n], h, J, info = true, temp = 0.4);

        # Blue / red / green = 100 / 200 / 500 sampled sequences.
        panel = axs[i, n]
        panel.plot(x1, en_evol1, color = "blue")
        panel.plot(x2, en_evol2, color = "red")
        panel.plot(x3, en_evol3, color = "green")
        panel.set_xscale("log")
    end
end

# Row labels on the first column, column titles on the first row.
for i in 1:3
    axs[i, 1].set_ylabel("Seq $(i)", fontsize = 20)
end

for n in eachindex(mus)
    axs[1, n].set_title("Mu =  $(round(mus[n], digits = 3))", fontsize = 20)
end

fig.supxlabel("Shuffle units", fontsize = 30)
fig.supylabel("Mean Energy", fontsize = 30)
savefig("../shuffling_analysis_low_temp04.png")



# Result containers, indexed by (sequence set, mu index); reset for this analysis.
res_ASR = zeros(Int, 3, L, length(mus));
pp = Matrix{Matrix{Float64}}(undef, 3, length(mus));
rec_msas1 = Matrix{Matrix{Int}}(undef, 3, length(mus));
rec_msas2 = Matrix{Matrix{Int}}(undef, 3, length(mus));
rec_msas3 = Matrix{Matrix{Int}}(undef, 3, length(mus));

# One panel per (sequence set, mu).
close("all")
fig, axs = plt.subplots(3, length(mus), figsize = (20, 12))

for i in 1:3
    # Inferred mu values stored alongside sequence set i.
    inf_mus = readdlm(joinpath(folder, "seq$(i)/inferred_mus$(i).txt"))

    println(i)
    for n in eachindex(mus)
        # Leaf alignment for this mu (file names embed mu rounded to two digits).
        file_seqs = joinpath(folder, "seq$(i)/seq$(i)_mu$(round(mus[n], 
        digits = 2)).fa")

        # Reconstruct with the inferred mu and keep the sequence and the probabilities.
        res, p = Felsenstein2(file_seqs, tree_file, file_seqs, inf_mus[n])
        res_ASR[i, :, n] .= Int.(res)
        pp[i, n] = p

        # Draw samples of three different sizes from the probabilities.
        rec_msas1[i, n] = sampling_ANC(pp[i, n], n_seq = 100)
        rec_msas2[i, n] = sampling_ANC(pp[i, n], n_seq = 200)
        rec_msas3[i, n] = sampling_ANC(pp[i, n], n_seq = 500)

        # Reshuffle each sample with the `reshuffle_entr_min` variant, recording the
        # energy traces.
        @time new_msa1, en_evol1, x1 = PhyloTools.reshuffle_entr_min(rec_msas1[i, n], h, J, info = true);
        @time new_msa2, en_evol2, x2 = PhyloTools.reshuffle_entr_min(rec_msas2[i, n], h, J, info = true);
        @time new_msa3, en_evol3, x3 = PhyloTools.reshuffle_entr_min(rec_msas3[i, n], h, J, info = true);

        # Blue / red / green = 100 / 200 / 500 sampled sequences.
        panel = axs[i, n]
        panel.plot(x1, en_evol1, color = "blue")
        panel.plot(x2, en_evol2, color = "red")
        panel.plot(x3, en_evol3, color = "green")
        panel.set_xscale("log")
    end
end

# Row labels on the first column, column titles on the first row.
for i in 1:3
    axs[i, 1].set_ylabel("Seq $(i)", fontsize = 20)
end

for n in eachindex(mus)
    axs[1, n].set_title("Mu =  $(round(mus[n], digits = 3))", fontsize = 20)
end

fig.supxlabel("Shuffle units", fontsize = 30)
fig.supylabel("Mean Energy", fontsize = 30)
savefig("../shuffling_analysis_min.png")

        
        

## Testing the sampler

In [None]:
using Revise, PhyloTools, JLD2, PyPlot

# Leaf alignment, tree, natural alignment, and the rate used for the single run below.
FileSequences = "../data_Anc/3_seq_alignments300_clean/seq1/seq1_mu0.59.fa";
FileTree = "../data_Anc/DBD_tree_clean_short_300_rooted_midpoint";
FileNat = "../data_Anc/DBD_cleaned_20241118_f";
m = 0.5;

# Single verbose reconstruction: natural alignment as first argument, leaves as third.
res, p = Felsenstein(FileNat, FileTree, FileSequences, m; verbose = true);

@load "../data_Anc/3_start_seq_and_sweeps4DBD_ASR.jld2"
start_seq = Int.(start_msa[:, 1]);

# Empirical frequencies of the natural alignment; q is forced to 21 after reading.
_, NatMSA, _, _, q = read_fasta(FileNat, 1.0, 0.2, false);
q = 21;
StationaryProb = compute_empirical_freqs(Int.(NatMSA), q);

# The stored start sequence is replaced by a uniformly random one of length 76.
#start_seq = Int.(start_msa[:,1]);
start_seq = rand(1:21, 76);

# 21 values of mu spanning 10^-3 to 10^17, one decade apart.
mus = [1.0 * (10^(i-3.)) for i in 0:20];
cors = [];

# For every rate, sample along the tree and correlate the leaf frequencies with the
# natural-alignment frequencies.
for rate in mus
    T = FelsensteinSampler(start_seq, FileNat, FileTree, rate);

    leaf_msa = seqs_from_leaves(T);
    leaf_prob = compute_empirical_freqs(Int.(leaf_msa), q);
    push!(cors, cor(leaf_prob[:], StationaryProb[:]))
end

# Correlation as a function of mu on a log x-axis.
close("all")
plt.plot(mus, cors)
plt.xlabel("Mu")
plt.ylabel("Pearson")
plt.xscale("log")
plt.savefig("../sampler_test.png")

## Plotting distances

In [None]:
using TreeTools, PyPlot

# Read the original tree and its midpoint-rooted counterpart.
tree = read_tree("../data_Anc/DBD_tree")
tree_rerooted = read_tree("../data_Anc/DBD_tree_midpoint_rooted.nwk")

# Distance from the root to every leaf, for each tree.
dd = [distance(tree.root, leaf) for leaf in leaves(tree)];
dd_rerooted = [distance(tree_rerooted.root, leaf) for leaf in leaves(tree_rerooted)];

num_bins = 40

# Overlay the two normalised step histograms.
close("all")
plt.hist(
    dd;
    bins = num_bins, histtype = "step", density = true,
    label = "DBD full tree", color = "blue",
)
plt.hist(
    dd_rerooted;
    bins = num_bins, histtype = "step", density = true,
    label = "DBD full tree re-rooted", color = "orange",
)

# Axis labels, legend, and output file.
plt.xlabel("Root-to-leaf branch length", fontsize = 15)
plt.ylabel("Frequency", fontsize = 15)
plt.legend(fontsize = 10)
savefig("../full_tree_branch_lenghts.pdf")