In [None]:
using Serialization, CircularArrays, CSV, DataFrames, Distributions, StatsBase, StatsPlots

In [None]:
# Load the annotation table. The file has no header row of its own, so the nine
# column names are supplied here; lines starting with "#" are skipped as comments.
gff = DataFrame(
    CSV.File(
        "AP000423.gff";
        header = ["accession","software","feature","start","stop","score","strand","phase","attributes"],
        comment = "#",
    ),
)
# Rows whose feature type is rRNA or tRNA; `mask!` zeroes these regions.
tomask = filter(:feature => in(["rRNA", "tRNA"]), gff)

"""
    mask!(data)

Set `data[start:stop]` to zero, in place, for every row of the global `tomask` table,
then return `data`.
"""
function mask!(data::Vector{<: Real})
    for feature in eachrow(tomask)
        region = view(data, feature.start:feature.stop)
        fill!(region, 0)
    end
    return data
end

"""
    readtermini(file)

Deserialize the collection stored in `file`, replace its first two entries with their
reversed copies, and return the result of applying `mask!` to every entry.
"""
function readtermini(file::String)
    loaded = deserialize(file)
    for slot in 1:2
        loaded[slot] = reverse(loaded[slot])
    end
    return mask!.(loaded)
end

In [None]:
# Pool the unligated replicates: the first file read becomes the running total and
# every later file is added onto it entry by entry.
unligated_samples = ["At_U7", "At_U8", "At_U17"]
unligated_termini = Vector{Vector{Int}}()
for sample in unligated_samples
    if !isempty(unligated_termini)
        unligated_termini .+= readtermini("mappings2/$sample.termini.bin")
    else
        unligated_termini = readtermini("mappings2/$sample.termini.bin")
    end
end

# Long-format tables keyed by `pos`. Entries 1 and 2 (plotted below as "fwd") get
# positive positions 1, 2, …; entries 3 and 4 (plotted as "rev") get -1, -2, ….
unligated5 = DataFrame(
    pos = Int[1:length(unligated_termini[1]); -(1:length(unligated_termini[3]))],
    unligated5 = Int[unligated_termini[1]; unligated_termini[3]],
)
unligated3 = DataFrame(
    pos = Int[1:length(unligated_termini[2]); -(1:length(unligated_termini[4]))],
    unligated3 = Int[unligated_termini[2]; unligated_termini[4]],
)

# Overview plot; the "rev" series are negated so they are drawn below the axis.
plot(unligated_termini[1]; color = :darkgreen, label = "fwd 5'")
plot!(-unligated_termini[3]; color = :green, label = "rev 5'")
plot!(unligated_termini[2]; color = :red, label = "fwd 3'")
plot!(-unligated_termini[4]; color = :darkorange, label = "rev 3'");

In [None]:
# Pool the ligated replicates: the first file read becomes the running total and
# every later file is added onto it entry by entry.
ligated_samples = ["At_C4", "At_C5", "At_C16"]
ligated_termini = Vector{Vector{Int}}()
for sample in ligated_samples
    if !isempty(ligated_termini)
        ligated_termini .+= readtermini("mappings2/$sample.termini.bin")
    else
        ligated_termini = readtermini("mappings2/$sample.termini.bin")
    end
end

# Long-format tables keyed by `pos`. Entries 1 and 2 (plotted below as "fwd") get
# positive positions 1, 2, …; entries 3 and 4 (plotted as "rev") get -1, -2, ….
ligated5 = DataFrame(
    pos = Int[1:length(ligated_termini[1]); -(1:length(ligated_termini[3]))],
    ligated5 = Int[ligated_termini[1]; ligated_termini[3]],
)
ligated3 = DataFrame(
    pos = Int[1:length(ligated_termini[2]); -(1:length(ligated_termini[4]))],
    ligated3 = Int[ligated_termini[2]; ligated_termini[4]],
)

# Overview plot; the "rev" series are negated so they are drawn below the axis.
plot(ligated_termini[1]; color = :darkgreen, label = "fwd 5'")
plot!(-ligated_termini[3]; color = :green, label = "rev 5'")
plot!(ligated_termini[2]; color = :red, label = "fwd 3'")
plot!(-ligated_termini[4]; color = :darkorange, label = "rev 3'");

In [None]:
# Pool the TAP replicates: the first file read becomes the running total and
# every later file is added onto it entry by entry.
TAP_samples = ["At_T13", "At_T14", "At_T15"]
TAP_termini = Vector{Vector{Int}}()
for sample in TAP_samples
    if !isempty(TAP_termini)
        TAP_termini .+= readtermini("mappings2/$sample.termini.bin")
    else
        TAP_termini = readtermini("mappings2/$sample.termini.bin")
    end
end

# Long-format tables keyed by `pos`. Entries 1 and 2 (plotted below as "fwd") get
# positive positions 1, 2, …; entries 3 and 4 (plotted as "rev") get -1, -2, ….
tap5 = DataFrame(
    pos = Int[1:length(TAP_termini[1]); -(1:length(TAP_termini[3]))],
    tap5 = Int[TAP_termini[1]; TAP_termini[3]],
)
tap3 = DataFrame(
    pos = Int[1:length(TAP_termini[2]); -(1:length(TAP_termini[4]))],
    tap3 = Int[TAP_termini[2]; TAP_termini[4]],
)

# Overview plot; the "rev" series are negated so they are drawn below the axis.
plot(TAP_termini[1]; color = :darkgreen, label = "fwd 5'")
plot!(-TAP_termini[3]; color = :green, label = "rev 5'")
plot!(TAP_termini[2]; color = :red, label = "fwd 3'")
plot!(-TAP_termini[4]; color = :darkorange, label = "rev 3'");

In [None]:
"""
    betathreshold(c1, c2, t)

Return `ccdf(Beta(c1 + 1, c2 + 1), t)`: the probability that a Beta-distributed
variable with parameters `c1 + 1` and `c2 + 1` exceeds `t`.

# Arguments
- `c1`, `c2`: counts; 1 is added to each (the `priorα`/`priorβ` terms) before they are
  used as the Beta shape parameters.
- `t`: threshold at which the complementary CDF is evaluated.

The arguments accept any `Real` so that size-factor-normalised (non-integer) counts,
like those passed to `betamean`, can be used here as well.
"""
function betathreshold(c1::Real, c2::Real, t::Real)
    priorα = 1
    priorβ = 1
    posterior = Beta(c1 + priorα, c2 + priorβ)
    return ccdf(posterior, t)
end

"""
    betamean(c1, c2)

Return the mean of a Beta distribution with parameters `c1 + 1` and `c2 + 1`.
"""
function betamean(c1, c2)
    α₀, β₀ = 1, 1  # added to each count before building the distribution
    return mean(Beta(c1 + α₀, c2 + β₀))
end

In [None]:
# One row per position with the 5' counts of all three libraries side by side.
fiveprime = innerjoin(innerjoin(unligated5, ligated5; on = :pos), tap5; on = :pos)

# Size factor calculation: restrict to positions where every library has more than
# 9 counts, take the per-position geometric mean across the three libraries, and use
# each library's median ratio to that geometric mean as its size factor.
sharedpeaks = filter(x -> min(x.unligated5, x.ligated5, x.tap5) > 9, fiveprime)
geomeans = Float64[
    geomean([peak.unligated5, peak.ligated5, peak.tap5]) for peak in eachrow(sharedpeaks)
]
unligated5_sizefactor = median(sharedpeaks.unligated5 ./ geomeans)
ligated5_sizefactor = median(sharedpeaks.ligated5 ./ geomeans)
tap5_sizefactor = median(sharedpeaks.tap5 ./ geomeans)

# Per-position scores from size-factor-normalised counts:
# CvsU compares ligated against unligated, TvsC compares TAP against ligated.
CvsU = Float64[
    betamean(
        site.ligated5 / ligated5_sizefactor, site.unligated5 / unligated5_sizefactor
    ) for site in eachrow(fiveprime)
]
TvsC = Float64[
    betamean(
        site.tap5 / tap5_sizefactor, site.ligated5 / ligated5_sizefactor
    ) for site in eachrow(fiveprime)
]
fiveprime.CvsU = CvsU
fiveprime.TvsC = TvsC;

In [None]:
# One row per position with the 3' counts of all three libraries side by side.
threeprime = innerjoin(innerjoin(unligated3, ligated3; on = :pos), tap3; on = :pos)

# Size factor calculation: restrict to positions where every library has more than
# 9 counts, take the per-position geometric mean across the three libraries, and use
# each library's median ratio to that geometric mean as its size factor.
sharedpeaks = filter(x -> min(x.unligated3, x.ligated3, x.tap3) > 9, threeprime)
geomeans = Float64[
    geomean([peak.unligated3, peak.ligated3, peak.tap3]) for peak in eachrow(sharedpeaks)
]
unligated3_sizefactor = median(sharedpeaks.unligated3 ./ geomeans)
ligated3_sizefactor = median(sharedpeaks.ligated3 ./ geomeans)
tap3_sizefactor = median(sharedpeaks.tap3 ./ geomeans)

# Per-position scores from size-factor-normalised counts:
# CvsU compares ligated against unligated, TvsC compares TAP against ligated.
CvsU = Float64[
    betamean(
        site.ligated3 / ligated3_sizefactor, site.unligated3 / unligated3_sizefactor
    ) for site in eachrow(threeprime)
]
TvsC = Float64[
    betamean(
        site.tap3 / tap3_sizefactor, site.ligated3 / ligated3_sizefactor
    ) for site in eachrow(threeprime)
]
threeprime.CvsU = CvsU
threeprime.TvsC = TvsC;

In [None]:
# Processed transcript ends: positions whose CvsU score is above 0.95,
# selected separately from the 5' and the 3' table.
sigCvsU5peaks = filter(:CvsU => >(0.95), fiveprime)
sigCvsU3peaks = filter(:CvsU => >(0.95), threeprime)

In [None]:
# Plot the pooled ligated counts at the significant CvsU positions only.
# `filtereddata` is a scratch track reused for all four series: it is filled from
# one entry of `ligated_termini`, plotted, and zeroed again before the next series.
# Positive `pos` values index entries 1/2 ("fwd"); negative values index
# entries 3/4 ("rev") by their absolute value, and those series are drawn negated.
filtereddata = zeros(Int, length(unligated_termini[1]))
for peak in eachrow(sigCvsU5peaks[sigCvsU5peaks.pos .> 0, :])
    filtereddata[peak.pos] = ligated_termini[1][peak.pos]
end
plot(filtereddata; color = :darkgreen, label = "fwd 5'")
filtereddata .= 0
for peak in eachrow(sigCvsU5peaks[sigCvsU5peaks.pos .< 0, :])
    filtereddata[-peak.pos] = ligated_termini[3][-peak.pos]
end
plot!(-filtereddata; color = :green, label = "rev 5'")
filtereddata .= 0
for peak in eachrow(sigCvsU3peaks[sigCvsU3peaks.pos .> 0, :])
    filtereddata[peak.pos] = ligated_termini[2][peak.pos]
end
plot!(filtereddata; color = :red, label = "fwd 3'")
# Clear the track here as well; without this reset the "fwd 3'" peaks would remain
# in the buffer and be drawn again (negated) as part of the "rev 3'" series.
filtereddata .= 0
for peak in eachrow(sigCvsU3peaks[sigCvsU3peaks.pos .< 0, :])
    filtereddata[-peak.pos] = ligated_termini[4][-peak.pos]
end
plot!(-filtereddata; color = :darkorange, label = "rev 3'");

In [None]:
# Positions with a positive TvsC (TAP vs ligated) score, for 5' and 3' ends.
# The 3' selection now tests TvsC like the 5' one; it previously tested CvsU.
# NOTE(review): TvsC comes from `betamean`, i.e. the mean of a Beta distribution,
# which is greater than 0 whenever it is defined — so this cutoff presumably keeps
# every row. Confirm the intended threshold.
TvsC5peaks = filter(x -> x.TvsC > 0, fiveprime)
TvsC3peaks = filter(x -> x.TvsC > 0, threeprime)

In [None]:
# Primary transcript ends: positions whose TvsC score is above 0.95,
# selected separately from the 5' and the 3' table.
sigTvsC5peaks = filter(:TvsC => >(0.95), fiveprime)
sigTvsC3peaks = filter(:TvsC => >(0.95), threeprime)

In [None]:
# Plot the pooled ligated counts at the significant TvsC positions only.
# `filtereddata` is a scratch track reused for all four series: it is filled from
# one entry of `ligated_termini`, plotted, and zeroed again before the next series.
# Positive `pos` values index entries 1/2 ("fwd"); negative values index
# entries 3/4 ("rev") by their absolute value, and those series are drawn negated.
filtereddata = zeros(Int, length(ligated_termini[1]))
for peak in eachrow(sigTvsC5peaks[sigTvsC5peaks.pos .> 0, :])
    filtereddata[peak.pos] = ligated_termini[1][peak.pos]
end
plot(filtereddata; color = :darkgreen, label = "fwd 5'")
filtereddata .= 0
for peak in eachrow(sigTvsC5peaks[sigTvsC5peaks.pos .< 0, :])
    filtereddata[-peak.pos] = ligated_termini[3][-peak.pos]
end
plot!(-filtereddata; color = :green, label = "rev 5'")
filtereddata .= 0
for peak in eachrow(sigTvsC3peaks[sigTvsC3peaks.pos .> 0, :])
    filtereddata[peak.pos] = ligated_termini[2][peak.pos]
end
plot!(filtereddata; color = :red, label = "fwd 3'")
# Clear the track here as well; without this reset the "fwd 3'" peaks would remain
# in the buffer and be drawn again (negated) as part of the "rev 3'" series.
filtereddata .= 0
for peak in eachrow(sigTvsC3peaks[sigTvsC3peaks.pos .< 0, :])
    filtereddata[-peak.pos] = ligated_termini[4][-peak.pos]
end
plot!(-filtereddata; color = :darkorange, label = "rev 3'");