In [None]:
using Serialization
using Flux
using OneHotArrays
using Statistics
using StatsBase
using InvertedIndices
using Plots
using CircularArrays
using CSV, DataFrames
using StatsPlots

# Set a fixed random seed for reproducibility
using Random
Random.seed!(1001)

In [None]:
# Flat buffers for every sample and its label.
# Each deserialized matrix is appended to `alldata` in column-major order, so the
# samples sit back-to-back in one long vector; `alltargets` holds one Bool per file
# (true = footprint, false = background).
alldata = Float32[]
alltargets = Bool[]

# Positive examples: every ".bin" file in "threeprime_footprints".
footprint_files = filter!(endswith(".bin"), readdir("threeprime_footprints", join=true))
for ff in footprint_files
    append!(alldata, deserialize(ff))
    push!(alltargets, true)
end
# Take the count from the file list rather than incrementing a global inside the
# loop: assigning to a global from a top-level loop only works under interactive
# soft scope and fails when this code is run as a script.
numfootprints = length(footprint_files)

# Negative examples: every ".bin" file in "threeprime_backgrounds".
background_files = filter!(endswith(".bin"), readdir("threeprime_backgrounds", join=true))
for bg in background_files
    append!(alldata, deserialize(bg))
    push!(alltargets, false)
end
numbackgrounds = length(background_files)

numfootprints, numbackgrounds

In [None]:
# Small convolutional classifier assembled as a Flux Chain.
# The inputs built elsewhere in this file are reshaped to 50x3x1xN before being fed in.
model = Chain(
    # 3x3 convolution, 1 input channel => 1 output channel, ReLU activation.
    # No padding is requested, so the spatial size shrinks (50x3 -> 48x1).
    Conv((3, 3), 1 => 1, relu),
    # Max-pooling over a 12x1 window (48x1 -> 4x1).
    MaxPool((12, 1)),
    # Collapse everything except the batch dimension into one feature axis.
    Flux.flatten,
    # Fully connected layer: 4 pooled features => 2 class scores.
    Dense(4 => 2),
    # Turn the two scores into class probabilities.
    softmax,
)

# Adam optimiser (learning rate 0.01) with its state tree bound to `model`.
optim = Flux.setup(Flux.Adam(0.01), model)

In [None]:
# Geometry of one sample: a window of 50 positions by 3 data tracks, stored flat
# (150 consecutive values) in `alldata`.
windowlen = 50
numtracks = 3
samplelen = windowlen * numtracks

# Derive the sample count from the loaded labels instead of hard-coding it, and
# check that the flat data buffer really holds that many complete samples.
numsamples = length(alltargets)
length(alldata) == samplelen * numsamples || throw(DimensionMismatch(
    "alldata has $(length(alldata)) values but $(numsamples) samples of " *
    "$(samplelen) values were expected",
))

# Hold out one tenth of the samples for testing (10000 of 100000).
numtest = div(numsamples, 10)

# Randomly pick the held-out sample indices (sorted, without replacement)
testidxs = sort!(sample(1:numsamples, numtest, replace = false))

# Linear index ranges in 'alldata' covered by the held-out samples:
# sample i occupies (i-1)*samplelen+1 : i*samplelen
testranges = reduce(vcat, [collect((i - 1) * samplelen + 1:i * samplelen) for i in testidxs])

# Extract the test data and reshape it to the model input layout (width, tracks, channel, batch)
testdata = reshape(alldata[testranges], windowlen, numtracks, 1, numtest)

# One-hot labels for the test samples; row 1 is `true`, row 2 is `false`
testtargets = onehotbatch(alltargets[testidxs], [true, false])

# Everything that was not held out becomes training data, in the same layout
trainingdata = reshape(alldata[Not(testranges)], windowlen, numtracks, 1, numsamples - numtest)

# One-hot labels for the training samples
trainingtargets = onehotbatch(alltargets[Not(testidxs)], [true, false])

In [None]:
# Mini-batch iterator over the (data, targets) training pair:
# 20 samples per batch, drawn in shuffled order.
loader = Flux.DataLoader(
    (trainingdata, trainingtargets);
    batchsize = 20,
    shuffle = true,
);

In [None]:
"""
    trainmodel!(model, loader, opt_state = optim; epochs = 100)

Train `model` in place on the batches produced by `loader` and return the mean
cross-entropy loss of every epoch as a `Vector{Float64}`.

# Arguments
- `model`: the Flux model to update.
- `loader`: an iterable of `(x, y)` batches.
- `opt_state`: optimiser state created by `Flux.setup` for `model`. Defaults to the
  global `optim`, which is what this function used implicitly before.

# Keywords
- `epochs`: number of full passes over `loader` (default 100).
"""
function trainmodel!(model, loader, opt_state = optim; epochs::Integer = 100)
    # One entry per epoch: the mean of that epoch's batch losses
    meanlosses = Float64[]
    sizehint!(meanlosses, epochs)

    for _ in 1:epochs
        losses = Float64[]
        for (x, y) in loader
            loss, grads = Flux.withgradient(model) do m
                # Evaluate model and loss inside gradient context:
                y_hat = m(x)
                Flux.crossentropy(y_hat, y)
            end
            Flux.update!(opt_state, model, grads[1])
            push!(losses, loss)  # logging, outside gradient context
        end
        push!(meanlosses, mean(losses))
    end
    return meanlosses
end

# Train with the defaults and plot the per-epoch mean loss
losses = trainmodel!(model, loader)
plot(losses)

In [None]:
"""
    testmodel(testdata, testtargets, classifier = model)

Run `classifier` on `testdata` and tally a confusion matrix against `testtargets`.

Row 1 of both the classifier output and `testtargets` is treated as the positive
class; a value above 0.5 counts as positive, a value of 0.5 or below as negative.

# Arguments
- `testdata`: input passed unchanged to `classifier`.
- `testtargets`: matrix with one column per sample.
- `classifier`: callable producing a matrix with one column per sample. Defaults to
  the global `model`, which is what this function used implicitly before.

# Returns
The tuple `(tp, tn, fp, fn)`: true positives, true negatives, false positives and
false negatives.
"""
function testmodel(testdata, testtargets, classifier = model)
    # Pass the test data through the classifier to get predictions
    out = classifier(testdata)

    tp = 0
    tn = 0
    fp = 0
    fn = 0

    # One column per sample
    for i in axes(testtargets, 2)
        score = out[1, i]
        truth = testtargets[1, i]
        if score > 0.5 && truth > 0.5
            tp += 1
        elseif score > 0.5 && truth <= 0.5
            fp += 1
        elseif score <= 0.5 && truth > 0.5
            fn += 1
        elseif score <= 0.5 && truth <= 0.5
            tn += 1
        else
            # Only reachable when a value compares false both ways (e.g. NaN);
            # print it for inspection and leave it out of the counts.
            println(score, "\t", truth)
        end
    end

    return tp, tn, fp, fn
end

In [None]:
# Evaluate on the held-out test set; the tuple is unpacked in the order returned
# by testmodel: true positives, true negatives, false positives, false negatives.
tp, tn, fp, fn = testmodel(testdata, testtargets)

In [None]:
using CSV, DataFrames, Serialization, CircularArrays

# --- Load GFF and genome length ---
# The GFF file is read without a header row of its own ("#" lines are treated as
# comments), so the nine column names are supplied here.
Atgff = DataFrame(
    CSV.File(
        "AP000423.gff";
        comment = "#",
        header = [
            "accession", "software", "feature", "start", "stop",
            "score", "strand", "phase", "attributes",
        ],
    ),
)

# Genome length = stop coordinate of the first "region" feature.
genome_length = first(Atgff.stop[Atgff.feature .== "region"])

# Map a forward-strand coordinate to the corresponding reverse-strand coordinate.
function rc(x::Real)
    return genome_length - x + 1
end

# --- Identify and mask tRNA / rRNA ---
# Rows annotated as tRNA or rRNA, ordered by start coordinate.
# (ASCII `in` is used for the membership test so the line survives re-encoding.)
structuralRNA = sort!(filter(x -> x.feature in ("tRNA", "rRNA"), Atgff), :start)

# One Bool per genome position and strand; true marks positions to blank out.
mask_fwd = falses(genome_length)
mask_rev = falses(genome_length)
for row in eachrow(structuralRNA)
    mask_fwd[row.start:row.stop] .= true
    # Same interval in reverse-strand coordinates (start and stop swap roles)
    mask_rev[rc(row.stop):rc(row.start)] .= true
end

"""
    apply_mask!(data, mask)

Set every element of `data` selected by the logical index `mask` to zero, in place,
and return `data`.
"""
function apply_mask!(data, mask)
    fill!(view(data, mask), 0.0)
    return data
end

# --- Load raw data ---
# Every track is wrapped in a CircularVector. The reverse conservation track is the
# forward one reversed; sRNA and three-prime tracks have separate files per strand.
fwd_conservation = CircularVector(deserialize("arabidopsis_conservation.bin"))
rev_conservation = reverse(fwd_conservation)

fwd_srna = CircularVector(deserialize("srna_fwd.bin"))
rev_srna = CircularVector(deserialize("srna_rev.bin"))

fwd_threeprime = CircularVector(deserialize("TAP_vs_untreated.fwd.three.bin"))
rev_threeprime = CircularVector(deserialize("TAP_vs_untreated.rev.three.bin"))

# --- Apply masking BEFORE normalization ---
# Forward tracks use the forward mask, reverse tracks the reverse mask.
for track in (fwd_conservation, fwd_srna, fwd_threeprime)
    apply_mask!(track, mask_fwd)
end
for track in (rev_conservation, rev_srna, rev_threeprime)
    apply_mask!(track, mask_rev)
end

# --- Normalization ---
"""
    log_normalise(fwd_data, rev_data)

Return log-transformed, rescaled copies of both strands as a tuple `(fwd, rev)`.

Each value `x` becomes `log(x + 1)`; the minimum and maximum are then taken across
both strands together and every value is mapped to `(value - min) / max`. The
inputs are not modified.

NOTE(review): the divisor is `max`, not `max - min`, so the result only spans
[0, 1] when the joint minimum is 0 — confirm this is intended before changing it,
since other data may have been prepared with the same formula.
"""
function log_normalise(fwd_data, rev_data)
    # log(x + 1) of each strand, as fresh arrays
    scaled = map((fwd_data, rev_data)) do track
        log.(track .+ 1)
    end

    # Shared scale across both strands
    lo = minimum(minimum, scaled)
    hi = maximum(maximum, scaled)

    for track in scaled
        @. track = (track - lo) / hi
    end
    return scaled
end

# Normalise each data type with a scale shared between its two strands.
fwd_conservation, rev_conservation = log_normalise(fwd_conservation, rev_conservation)
fwd_srna, rev_srna = log_normalise(fwd_srna, rev_srna)
fwd_threeprime, rev_threeprime = log_normalise(fwd_threeprime, rev_threeprime)

# --- Verify masking success ---
# Print, per forward track, whether positions 104691:107500 are all zero.
# NOTE(review): the range is hard-coded; presumably it lies inside a masked
# rRNA/tRNA region — confirm against the annotation.
for (label, track) in (
    ("Conservation", fwd_conservation),
    ("sRNA", fwd_srna),
    ("threeprime", fwd_threeprime),
)
    println(rpad(label, 12), ": ", all(iszero, track[104691:107500]))
end

# --- Define window builder ---
"""
    datawindow!(datablock, strand, p, w)

Fill `datablock` with the `w`-long window starting at position `p` from the three
global tracks of one strand and return it.

The layout is `[conservation; srna; threeprime]`, `w` values each. `strand == "+"`
selects the forward tracks; any other value selects the reverse tracks.
"""
function datawindow!(datablock::Vector{Float32}, strand::String, p::Int, w::Int)
    tracks = if strand == "+"
        (fwd_conservation, fwd_srna, fwd_threeprime)
    else
        (rev_conservation, rev_srna, rev_threeprime)
    end

    span = p:(p + w - 1)
    for (k, track) in enumerate(tracks)
        # k-th track goes into the k-th consecutive slot of length w
        datablock[((k - 1) * w + 1):(k * w)] .= track[span]
    end
    return datablock
end

# --- Safe bounds helper ---
# True when the w-long window starting at p lies entirely within 1:L.
function window_inbounds(p::Int, w::Int, L::Int)
    return 1 <= p && p + w - 1 <= L
end

# Prediction stored for positions that are skipped (masked or window out of bounds).
const ZERO_PRED = Float32[0.0, 1.0]

# --- Model predictions ---
window = 50
# Scratch buffer reused for every window: three tracks of `window` values each
datablock = Vector{Float32}(undef, window * 3)

"""
    predict_strand!(classifier, datablock, strand, mask, n, window)

Return one prediction vector per position `1:n` of the given strand.

Positions where `mask` is true, or whose window would run past `n`, receive the
shared `ZERO_PRED` vector without calling the classifier. For every other position
`datablock` is overwritten with that position's window (via `datawindow!`),
reshaped to `window x 3 x 1 x 1`, and passed to `classifier`.

Keeping the loop in a function avoids reassigning globals from a top-level loop,
which only works under interactive soft scope.
"""
function predict_strand!(classifier, datablock::Vector{Float32}, strand::String,
                         mask, n::Int, window::Int)
    predictions = Vector{Vector{Float32}}(undef, n)
    for position in 1:n
        if mask[position] || !window_inbounds(position, window, n)
            predictions[position] = ZERO_PRED
            continue
        end
        datawindow!(datablock, strand, position, window)
        pred = classifier(reshape(datablock, window, 3, 1, 1))
        predictions[position] = vec(Array(pred))
    end
    return predictions
end

L = length(fwd_conservation)
fwd_predictions = predict_strand!(model, datablock, "+", mask_fwd, L, window)

Lr = length(rev_conservation)
rev_predictions = predict_strand!(model, datablock, "-", mask_rev, Lr, window)

serialize("threeprime_3x3_12_4.predictions", (fwd_predictions, rev_predictions))
