### for Circos output ###

# In [None]:  (notebook cell marker, commented out so the file parses as Julia)
# =======================
# 0) Constants & helpers
# =======================
using Serialization, CSV, DataFrames, SparseArrays, Distributions, StatsBase

# Length of the reference sequence; every per-position vector in this file has this length.
const GENOME_LEN = 154_478
const N = GENOME_LEN

# Significance thresholds applied to the Beta-mean scores when exporting:
#   THRESH_TvsC_SIG / THRESH_TvsU_SIG are used together for the combined 5′ export,
#   THRESH_DEFAULT for the "sig095" CvsU and TvsC exports,
#   THRESH_CvsU_SIG for the "sig080" CvsU export.
const THRESH_TvsC_SIG = 0.75
const THRESH_TvsU_SIG = 0.95
const THRESH_DEFAULT  = 0.95
const THRESH_CvsU_SIG = 0.80

"""
    rc(x::Integer)
    rc(r::AbstractUnitRange{<:Integer})

Map a 1-based position (or range of positions) to the coordinate of the opposite
strand of a sequence of length `N`: `x -> N - x + 1`.

For a range the endpoints are swapped so the result is still ascending, e.g.
`rc(1:10) == (N - 9):N`. Any integer type is accepted (the `Mapping` struct stores
`Int32` ranges); because `N` is an `Int`, the result is `Int`-based.
"""
rc(x::Integer) = N - x + 1
rc(r::AbstractUnitRange{<:Integer}) = rc(last(r)):rc(first(r))

# If you want reverse positions as -(N - i) instead of -(N - i + 1), set true.
const RC_MINUS_ONE = false  # false => genomic pos = -(N - i + 1); true => -(N - i)

# =======================
# 1) Load GFF & masking
# =======================
# The annotation file has no header row, so the nine column names are supplied here;
# lines starting with '#' are skipped as comments.
gff = DataFrame(
    CSV.File(
        "AP000423.gff";
        header=["accession","software","feature","start","stop","score","strand","phase","attributes"],
        comment="#",
    ),
)

# Features whose positions get zeroed by `mask!`: the rRNA and tRNA rows of the table.
tomask = filter(:feature => in(("rRNA", "tRNA")), gff)

"""
    mask!(data::AbstractVector{<:Real})

Set `data[start:stop]` to zero, in place, for every row of the global `tomask`
table, then return `data`.

NOTE(review): the `start`/`stop` values are used as-is for whatever vector is
passed in. The reverse-strand vectors are later re-indexed with `rc` in
`termini_to_df`, so confirm that they share the coordinate system of the GFF.
"""
function mask!(data::AbstractVector{<:Real})
    for feature in eachrow(tomask)
        span = feature.start:feature.stop
        fill!(view(data, span), 0)
    end
    return data
end

# =======================
# 2) Mapping struct
# =======================
# Record for one mapped read segment: two coordinate ranges plus a strand flag.
# NOTE(review): presumably these are the records stored (as pairs) in the serialized
# *.mappings3.bin files that `readcounts` iterates over; if so, the field names and
# types must stay exactly as they were when the files were written — confirm.
struct Mapping
    readrange::UnitRange{Int32}   # genomic (plus-orientation)
    strand::Char                  # '+' or '-'
    strandrange::UnitRange{Int32} # as provided by mapper (may be RC)
end

# ==========================================
# 3) Termini container + I/O
# ==========================================
# Container of four per-position vectors with element type `T`.
# `readcounts` fills the `fwd*` vectors from records whose strand is '+' and the
# `rev*` vectors from all other records; `*5` holds 5′-end values, `*3` 3′-end values.
# The fields are concrete `Vector{T}`, so constructor arguments of another vector
# type are converted to a dense `Vector{T}`.
struct Termini{T}
    fwd5::Vector{T}  # 5′ ends, forward
    rev5::Vector{T}  # 5′ ends, reverse
    fwd3::Vector{T}  # 3′ ends, forward
    rev3::Vector{T}  # 3′ ends, reverse
end

"""
    readcounts(filepath::String)

Deserialize `filepath` and tally terminus counts into a `Termini{Int}` of length
`GENOME_LEN`.

The file must deserialize to an iterable of pairs `(m1, m2)` whose elements have
`strand` and `strandrange` fields. For each pair:

- the 3′ count is incremented at `last(m1.strandrange)`,
- the 5′ count is incremented at `first(m2.strandrange)`,

with both indices wrapped into `1:GENOME_LEN` via `mod1`. Counts go to the `fwd*`
vectors when `m1.strand == '+'` and to the `rev*` vectors for any other strand value
(only `m1.strand` is consulted). Raw counts are stored; no coordinate conversion
happens here.

NOTE(review): `deserialize` rebuilds arbitrary Julia objects and depends on the
type definitions (and Julia version) in effect when the file was written — only
load files you produced yourself.
"""
function readcounts(filepath::String)
    # Dense vectors from the start: the `Vector{Int}` fields of `Termini` would
    # convert sparse arguments to dense ones anyway.
    counts = Termini{Int}(zeros(Int, GENOME_LEN), zeros(Int, GENOME_LEN),
                          zeros(Int, GENOME_LEN), zeros(Int, GENOME_LEN))
    for (m1, m2) in deserialize(filepath)
        # Pick the destination vectors once per pair; the indexing is strand-independent.
        ends3, ends5 = m1.strand == '+' ? (counts.fwd3, counts.fwd5) :
                                          (counts.rev3, counts.rev5)
        ends3[mod1(last(m1.strandrange),  GENOME_LEN)] += 1
        ends5[mod1(first(m2.strandrange), GENOME_LEN)] += 1
    end
    return counts
end

"""
    sum_group_counts(paths::Vector{String})

Run `readcounts` on every file in `paths` and return a `Termini{Int}` holding the
element-wise sum of the four count vectors over all files. With an empty `paths`
the result is all zeros.
"""
function sum_group_counts(paths::Vector{String})
    total = Termini{Int}((zeros(Int, GENOME_LEN) for _ in 1:4)...)
    for path in paths
        sample = readcounts(path)
        for field in (:fwd5, :rev5, :fwd3, :rev3)
            dest = getfield(total, field)
            dest .+= getfield(sample, field)
        end
    end
    return total
end

# =======================
# 4) Discover mapping files (U/C/T)
# =======================
# Directory scanned by `group_bins` for the serialized mapping files.
const MAPPINGS_DIR = "/mappings3"

"""
    _get_idx(fname)

Return the integer index embedded in a file name of the form
`<U|C|T><digits>.mappings3.bin` (e.g. `"U12.mappings3.bin"` gives `12`).

Returns `typemax(Int)` when the name does not match that pattern, when the digits
do not fit in an `Int`, or when `fname` is not a string, so such entries sort last
when this function is used as a sort key.
"""
function _get_idx(fname::AbstractString)
    m = match(r"^[UCT](\d+)\.mappings3\.bin$", fname)
    m === nothing && return typemax(Int)
    # `tryparse` yields `nothing` on overflow instead of throwing.
    return something(tryparse(Int, m.captures[1]), typemax(Int))
end

# Non-string input: keep the best-effort behaviour without a try/catch.
_get_idx(_) = typemax(Int)

"""
    group_bins(dir::AbstractString)

List `dir` and return three vectors of file paths `(unligated, ligated, tap)`
containing the entries named `U<digits>.mappings3.bin`, `C<digits>.mappings3.bin`
and `T<digits>.mappings3.bin` respectively. Each vector is ordered by the numeric
index in the file name (see `_get_idx`), and every path is `joinpath(dir, name)`.

# Throws
- `ArgumentError` if any of the three groups is empty. An explicit `throw` is used
  rather than `@assert`, which is meant for internal invariants and may be disabled.
"""
function group_bins(dir::AbstractString)
    # `readdir` returns the entry names sorted, which fixes the order of equal indices.
    names = readdir(dir)

    # Select the names matching `pattern`, order them by index, and prefix `dir`.
    pick(pattern::Regex) =
        joinpath.(dir, sort(filter(n -> occursin(pattern, n), names); by=_get_idx))

    unligated = pick(r"^U\d+\.mappings3\.bin$")
    ligated   = pick(r"^C\d+\.mappings3\.bin$")
    tap       = pick(r"^T\d+\.mappings3\.bin$")

    isempty(unligated) && throw(ArgumentError("No U*.mappings3.bin files found in $(dir)"))
    isempty(ligated)   && throw(ArgumentError("No C*.mappings3.bin files found in $(dir)"))
    isempty(tap)       && throw(ArgumentError("No T*.mappings3.bin files found in $(dir)"))

    return unligated, ligated, tap
end

unligated_bins, ligated_bins, tap_bins = group_bins(MAPPINGS_DIR)

# =======================
# 5) Build group counts & mask
# =======================
# One summed Termini per library group: u = U files, c = C files, t = T files.
u, c, t = map(sum_group_counts, (unligated_bins, ligated_bins, tap_bins))

# Zero the `tomask` feature positions in all four vectors of every group, in place.
for grp in (u, c, t), fld in (:fwd5, :rev5, :fwd3, :rev3)
    mask!(getfield(grp, fld))
end

# =======================
# 6) Convert to ±pos DataFrames
#    Forward:  +i
#    Reverse:  -(genomic index), i.e., -(rc(i)) [or -(N - i) if RC_MINUS_ONE]
#    IMPORTANT: include ALL positions (zeros too) to preserve joins.
# =======================
# Signed position emitted for index `i` of a reverse-strand vector.
#   RC_MINUS_ONE == false  ->  -rc(i) = -(N - i + 1)   (strict 1-based rc)
#   RC_MINUS_ONE == true   ->  -(N - i)                (e.g. i = 140_926 -> -13_552)
@inline neg_genomic_pos(i::Int) = RC_MINUS_ONE ? -(N - i) : -rc(i)

"""
    termini_to_df(name::String, fwd::AbstractVector{<:Real}, rev::AbstractVector{<:Real})

Stack a forward and a reverse vector into one two-column `DataFrame`.

# Returns
A `DataFrame` with `length(fwd) + length(rev)` rows and columns
- `:pos` (`Int`): `1, 2, …` for the forward entries, followed by
  `neg_genomic_pos(1), neg_genomic_pos(2), …` for the reverse entries;
- `Symbol(name)` (`Float64`): the corresponding values of `fwd`, then `rev`.

Every entry is emitted, zeros included, so tables built from different libraries
have identical `:pos` sets and can be joined without losing rows.
"""
function termini_to_df(name::String, fwd::AbstractVector{<:Real}, rev::AbstractVector{<:Real})
    # Build both columns in one pass each instead of growing the table row by row.
    positions = vcat(collect(1:length(fwd)),
                     Int[neg_genomic_pos(i) for i in 1:length(rev)])
    vals = Float64[float(v) for v in Iterators.flatten((fwd, rev))]
    return DataFrame(:pos => positions, Symbol(name) => vals)
end

# One ±pos table per library group (u, c, t) and per end: 5′ tables first, then 3′.
unligated5, ligated5, tap5 = map(
    (label, grp) -> termini_to_df(label, grp.fwd5, grp.rev5),
    ("unligated5", "ligated5", "tap5"), (u, c, t))
unligated3, ligated3, tap3 = map(
    (label, grp) -> termini_to_df(label, grp.fwd3, grp.rev3),
    ("unligated3", "ligated3", "tap3"), (u, c, t))

# =======================
# 7) Beta stats
# =======================
"""
    betamean(c1, c2)

Return the mean of `Beta(c1 + 1, c2 + 1)`.

The `+ 1` on each argument is a pseudo-count of one, i.e. a flat Beta(1, 1) prior
(the strict Jeffreys prior would be Beta(1/2, 1/2)).
"""
function betamean(c1, c2)
    dist = Beta(c1 + 1, c2 + 1)
    return mean(dist)
end

# --- 5′ ---
# One row per position with the unligated, ligated and tap 5′ values side by side.
fiveprime = innerjoin(innerjoin(unligated5, ligated5; on=:pos), tap5; on=:pos)

# Size factors: over the positions where all three libraries exceed 9, take the
# median ratio of each library to the per-position geometric mean.
shared5 = filter([:unligated5, :ligated5, :tap5] => ((a, b, c) -> min(a, b, c) > 9), fiveprime)
geomeans5 = map((a, b, c) -> geomean((a, b, c)),
                shared5.unligated5, shared5.ligated5, shared5.tap5)
unligated5_sf = median(shared5.unligated5 ./ geomeans5)
ligated5_sf   = median(shared5.ligated5   ./ geomeans5)
tap5_sf       = median(shared5.tap5       ./ geomeans5)

# Pairwise Beta-mean scores on the size-factor-normalised values (first vs second).
fiveprime.CvsU = betamean.(fiveprime.ligated5 ./ ligated5_sf, fiveprime.unligated5 ./ unligated5_sf)
fiveprime.TvsC = betamean.(fiveprime.tap5 ./ tap5_sf,         fiveprime.ligated5 ./ ligated5_sf)
fiveprime.TvsU = betamean.(fiveprime.tap5 ./ tap5_sf,         fiveprime.unligated5 ./ unligated5_sf)

# --- 3′ ---
# Same procedure for the 3′ tables.
threeprime = innerjoin(innerjoin(unligated3, ligated3; on=:pos), tap3; on=:pos)

shared3 = filter([:unligated3, :ligated3, :tap3] => ((a, b, c) -> min(a, b, c) > 9), threeprime)
geomeans3 = map((a, b, c) -> geomean((a, b, c)),
                shared3.unligated3, shared3.ligated3, shared3.tap3)
unligated3_sf = median(shared3.unligated3 ./ geomeans3)
ligated3_sf   = median(shared3.ligated3   ./ geomeans3)
tap3_sf       = median(shared3.tap3       ./ geomeans3)

threeprime.CvsU = betamean.(threeprime.ligated3 ./ ligated3_sf, threeprime.unligated3 ./ unligated3_sf)
threeprime.TvsC = betamean.(threeprime.tap3 ./ tap3_sf,         threeprime.ligated3 ./ ligated3_sf)
threeprime.TvsU = betamean.(threeprime.tap3 ./ tap3_sf,         threeprime.unligated3 ./ unligated3_sf)

# =======================
# 8) Exports (no IR folding)
# =======================
# Output location: <outroot>/<subdir>, created on first use.
outroot = "/beta_distribution_output"
subdir  = "files"
outdir  = joinpath(outroot, subdir)
if !isdir(outdir)
    mkpath(outdir)
end

# Write `df` as CSV to `fname` inside `outdir`; returns whatever `CSV.write` returns.
function savecsv(fname::AbstractString, df)
    return CSV.write(joinpath(outdir, fname), df)
end

# --- Save: ALL ---
# Each entry: output file, score column, source table. Rows with score > 0 are kept.
# NOTE(review): `betamean` of non-negative inputs looks strictly positive, so this
# filter probably keeps every row — confirm that is the intent.
for (fname, col, df) in (
        ("beta_norm_CvsU_all_5p.csv", :CvsU, fiveprime),
        ("beta_norm_CvsU_all_3p.csv", :CvsU, threeprime),
        ("beta_norm_TvsC_all_5p.csv", :TvsC, fiveprime),
        ("beta_norm_TvsC_all_3p.csv", :TvsC, threeprime),
        ("beta_norm_TvsU_all_5p.csv", :TvsU, fiveprime),
        ("beta_norm_TvsU_all_3p.csv", :TvsU, threeprime),
    )
    savecsv(fname, filter(col => >(0), df))
end

# --- Save: SIGNIFICANT sets ---
# Each entry: output file, score column, cutoff (rows with score >= cutoff), source table.
for (fname, col, cutoff, df) in (
        ("beta_norm_CvsU_sig095_5p.csv", :CvsU, THRESH_DEFAULT,  fiveprime),
        ("beta_norm_CvsU_sig095_3p.csv", :CvsU, THRESH_DEFAULT,  threeprime),
        ("beta_norm_CvsU_sig080_5p.csv", :CvsU, THRESH_CvsU_SIG, fiveprime),
        ("beta_norm_CvsU_sig080_3p.csv", :CvsU, THRESH_CvsU_SIG, threeprime),
        ("beta_norm_TvsC_sig095_5p.csv", :TvsC, THRESH_DEFAULT,  fiveprime),
        ("beta_norm_TvsC_sig095_3p.csv", :TvsC, THRESH_DEFAULT,  threeprime),
        ("beta_norm_TvsU_sig095_5p.csv", :TvsU, THRESH_TvsU_SIG, fiveprime),
        ("beta_norm_TvsU_sig095_3p.csv", :TvsU, THRESH_TvsU_SIG, threeprime),
    )
    savecsv(fname, filter(col => >=(cutoff), df))
end

# --- Combined: BOTH significant 5′ ---
# 5′ rows that pass both cutoffs: TvsC >= THRESH_TvsC_SIG and TvsU >= THRESH_TvsU_SIG.
sig_both_5 = filter(
    [:TvsC, :TvsU] => ((tc, tu) -> (tc >= THRESH_TvsC_SIG) && (tu >= THRESH_TvsU_SIG)),
    fiveprime,
)

"""
    with_end_5p(df::DataFrame)

Return a copy of `df` with an extra column `:end`, filled with the string `"5p"`,
inserted as the second column. `df` itself is not modified.
"""
function with_end_5p(df::DataFrame)
    # `copy` already gives the result its own column vectors; a recursive
    # `deepcopy` is not needed to protect `df` from `insertcols!`.
    out = copy(df)
    insertcols!(out, 2, :end => fill("5p", nrow(out)))
    return out
end

combined_5p = with_end_5p(sig_both_5)
savecsv("beta_norm_TvsC075_TvsU095_bothsig_5p.csv", combined_5p)



# Notebook output of the cell above (path returned by the final CSV write):
# "/Users/lenic/Library/CloudStorage/OneDrive-UWA/PhD/Research/01 Transcript End Analysis/Bioinformatics/2025/counts3/beta_distributions/beta_normalised/noIRfold/beta_norm_TvsC075_TvsU095_bothsig_5p.csv"