In [None]:
# Load the "avGFPs" datasets 'B' and 'C' (with val_mode=true) and their saved models.
# The loader helpers are presumably defined in load_save.jl — confirm.
include("load_save.jl")
# NOTE(review): `Y` appears twice on the left-hand side, so the 7th returned value
# overwrites the 3rd one — confirm that this is intended.
R, X, Y, L, MF, SF, Y, PY = load_small_dataset("avGFPs", 'B', val_mode=true)
# Same pattern for dataset 'C': `Y1` is likewise assigned twice.
R1, X1, Y1, L1, MF1, SF1, Y1, PY1 = load_small_dataset("avGFPs", 'C', val_mode=true)
M = load_dataset_model("avGFPs", 'B')
M1 = load_dataset_model("avGFPs", 'C')

"""
    findnearest(A::AbstractVector{<:Real}, val::Real)

Return the index of the element of `A` whose value is closest to `val`.

If several elements are equally close, the index of the first one is returned.
`A` must be non-empty. Any real-valued vector is accepted (integer vectors,
ranges, views), not only `Vector{Float64}`.
"""
function findnearest(A::AbstractVector{<:Real}, val::Real)
    return argmin(abs.(A .- val))
end

using PyPlot, FASTX, Combinatorics, ProRF, JLD2, Pandas, Seaborn, XLSX

In [None]:
# Combined importance: MF and MF1 are each divided by their own maximum, then added
# element-wise (so both must have the same length).
NMF = MF ./ maximum(MF) + MF1 ./ maximum(MF1)
# Plot the combined importance; `show_number=30` presumably limits the plot to the
# top 30 features — confirm against view_importance.
view_importance(R, L, NMF, show_number=30)

In [None]:
# Annotated copy of the feature labels in `L`. The residue number is the label minus
# its last character; labels at the first group of sites get a "**" prefix, labels at
# the second group get "*", and all others are kept as they are.
NL = let
    double_star = (65, 72, 203)
    single_star = (18, 27, 29, 57, 60, 61, 64, 66, 67, 146, 148, 168, 205, 222)
    prefix(pos) = pos ∈ double_star ? "**" : pos ∈ single_star ? "*" : ""
    String[prefix(parse(Int, label[1:end-1])) * label for label in L]
end

# Category of every feature in `L`, decided by its residue number.
tag = let chromophore_sites = (18, 27, 29, 57, 60, 61, 64, 65, 66, 67, 146, 148, 168, 203, 205, 222)
    String[
        parse(Int, label[1:end-1]) ∈ chromophore_sites ? "Interacting\nchromophores site" : "Remains"
        for label in L
    ]
end

# One row per feature: its combined importance and its category.
data = Pandas.DataFrame(Dict("Feature Importance" => NMF, "AA type" => tag));

In [None]:
# Redraw the importance plot using the star-annotated labels NL.
view_importance(R, NL, NMF, show_number=30)
# Constant column used as the x variable below, so every row falls into one x group
# and the violin is divided only by the "AA type" hue.
data["all"] = ""
set_theme(style="whitegrid", palette="tab10")
violinplot(x="all", y="Feature Importance", hue="AA type", inner="quartile", data=data, split=true, bw=0.4, cut=0)
# Remove the x-axis label (the "all" column name carries no meaning).
xlabel("")
display(gcf())
close("all")
# Presumably undoes the style changes made by set_theme — confirm.
matplotlib.rc_file_defaults()

In [None]:
# Histogram of Y with 40 bins.
PyPlot.hist(Y, bins=40)
# ProRF macro; presumably displays and closes the current figure — confirm.
ProRF.@show_pyplot

In [None]:
# Pick the 7 locations whose largest normalized importance is highest.
# get_amino_loc is a project helper; presumably it returns one residue number per
# sequence position covered by the labels in L — confirm.
NumL = get_amino_loc(L)
# Number of features per location. Int(...) throws an InexactError if length(L) is
# not an exact multiple of length(NumL).
num_dict = Int(length(L) / length(NumL))
# Reshape NMF into num_dict rows, then take the adjoint so each row holds num_dict
# consecutive entries of NMF; values are scaled by the overall maximum.
value_matrix = reshape(NMF, (num_dict, Int(length(NMF) / num_dict)))' ./ maximum(NMF)
# Row order by each row's maximum, largest first.
sort_idx = sortperm(maximum(value_matrix, dims=2)[:, 1], rev=true)
NumL = NumL[sort_idx]
# Keep the first 7 entries (throws a BoundsError if there are fewer than 7).
NumL = NumL[1:7]

In [None]:
# Permutations that order Y and Y1 from largest to smallest.
sort_ind_b = sortperm(Y, rev=true)
sort_ind_c = sortperm(Y1, rev=true)

# Read every record of the reference FASTA as a (sequence, identifier) tuple.
# The do-block form closes the reader once all records have been collected;
# the bare `open(FASTA.Reader, ...)` call left the file handle open.
ref_sdata_vector = open(FASTA.Reader, "Data/avGFPs/data.fasta") do reader
    [(FASTA.sequence(String, record), String(FASTA.identifier(record))) for record in reader]
end;

In [None]:
# Write two FASTA files, one per dataset ordering. Records are taken from
# ref_sdata_vector in the given order, and each sequence is reduced to the
# characters at the positions listed in NumL.
for (path, order) in (
    ("AData/alnallseq_b.fasta", sort_ind_b),
    ("AData/alnallseq_c.fasta", sort_ind_c),
)
    open(FASTA.Writer, path) do io
        for (seq, id) in ref_sdata_vector[order]
            write(io, FASTA.Record(id, seq[NumL]))
        end
    end
end

In [None]:
# Feature labels ordered by decreasing combined importance.
sort_loc = getindex.(sort(collect(zip(L, NMF)), by = x -> x[2], rev=true), 1)
# Keep the labels whose numeric part (the label minus its last character) matches a
# position in NumL. `any` stops at the first match and also works when NumL is
# empty, unlike splatting the comparisons into `|`.
tar_loc = filter(loc -> any(n -> string(n) == loc[1:end-1], NumL), sort_loc)
# Index within L of each selected label.
tar_ind = [findfirst(isequal(i), L) for i in tar_loc]
# Select the matching columns of X; the trailing semicolon suppresses the output.
X[:, tar_ind];

In [None]:
# Second name for the same index vector: this binds, it does not copy, so mutating
# one is visible through the other.
star_ind = tar_ind

In [None]:
# Copy "select imp, resi <n1>+<n2>+..." built from NumL to the system clipboard.
# NOTE(review): looks like a molecular-viewer selection command — confirm the target tool.
clipboard("select imp, resi " * join(string.(NumL), '+'))

In [None]:
# Read every record of the dataset FASTA as a (description, sequence) tuple.
# The do-block form closes the reader once all records have been collected.
seqdata_vector = open(FASTA.Reader, R.fasta_loc) do reader
    [(String(FASTA.description(record)), FASTA.sequence(String, record)) for record in reader]
end
# Sequence of the record whose description is exactly "avGFP". Fail with a clear
# message when it is missing instead of indexing with `nothing`.
main_seq = let idx = findfirst(x -> x[1] == "avGFP", seqdata_vector)
    idx === nothing && error("no record with description \"avGFP\" found in $(R.fasta_loc)")
    seqdata_vector[idx][2]
end

In [None]:
# Look up positions 65, 66 and 67 of main_seq in d123 and show the three results.
# d123 is not defined in the visible code; presumably a one-letter to three-letter
# amino-acid table — confirm.
d123[main_seq[65]], d123[main_seq[66]], d123[main_seq[67]]

In [None]:
# d123 entry of the main_seq character at every position in NumL.
[d123[main_seq[pos]] for pos in NumL]

In [None]:
# Every sequence of the dataset FASTA as a Vector{Char}, so single positions can be
# overwritten later. The do-block form closes the reader after reading.
seq_vector = open(FASTA.Reader, R.fasta_loc) do reader
    [collect(FASTA.sequence(String, record)) for record in reader]
end
# One-letter codes of the 20 amino acids used as substitution candidates.
aa_vector = ['M', 'P', 'K', 'Q', 'I', 'H', 'E', 'W', 'T', 'S', 'C', 'D', 'A', 'L', 'Y', 'V', 'R', 'G', 'N', 'F'];

In [None]:
# Enumerate mutants of every sequence: for each choice of 1..last_mut_number
# positions out of NumL, substitute every combination of amino acids that differ
# from the current residue at those positions, and store the result as a String.
last_mut_number = 2
mut_seq_vector = String[]

for seq in seq_vector, mut_number in 1:last_mut_number
    for residue_vector in combinations(NumL, mut_number)
        # Per chosen position: all amino acids except the one already present.
        candidates = [filter(aa -> aa ≠ seq[residue], aa_vector) for residue in residue_vector]

        for mut_vector in Iterators.product(candidates...)
            # Work on a copy so the source sequence stays untouched.
            mutant = copy(seq)
            for (res, mut) in zip(residue_vector, mut_vector)
                mutant[res] = mut
            end
            push!(mut_seq_vector, String(mutant))
        end
    end
end

In [None]:
# Reorder NumL by sort_idx.
# NOTE(review): an earlier cell already applies this reordering and then truncates
# NumL to 7 entries; running this afterwards indexes the 7-element NumL with the
# full-length sort_idx and would fail if sort_idx refers to more than 7 positions.
# Looks like a stale notebook cell — confirm before running.
NumL = NumL[sort_idx]

In [None]:
# Restore mut_seq_vector, pre_vector_b and pre_vector_c from the JLD2 file; this
# overwrites any variables of the same name in the current session.
@load "AData/save_data.JLD2" mut_seq_vector pre_vector_b pre_vector_c

In [None]:
# Store the four variables in the JLD2 file.
# NOTE(review): pre_vector_b, pre_vector_c and Xt are not defined in the visible
# cells, and the @load cell does not read Xt back — confirm where they come from.
@save "AData/save_data.JLD2" mut_seq_vector pre_vector_b pre_vector_c Xt