Skip to content

Commit

Permalink
Unify the datastore types under an abstract type (#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
Ben J. Ward committed Aug 15, 2019
1 parent 7d9991e commit 2e7e4fc
Show file tree
Hide file tree
Showing 7 changed files with 197 additions and 176 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2019 BioJulia
Copyright (c) 2019 Ben J. Ward

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
116 changes: 111 additions & 5 deletions src/ReadDatastores.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,24 @@
module ReadDatastores

export
PairedReadDatastore,
export
ReadDatastore,
PairedReads,
PairedReadOrientation,
FwRv,
RvFw,
LongReadDatastore,
SequenceBuffer
LongReads,
SequenceBuffer,

name,
maxseqlen,
orientation,
load_sequence!,
buffer


using BioSequences, FASTX

const SeqDataStoreMAGIC = 0x05D5
const ReadDatastoreMAGIC = 0x05D5

@enum Filetype::UInt16 begin
PairedDS = 1
Expand Down Expand Up @@ -45,9 +53,107 @@ end
return v
end

###
### Abstract ReadDatastore type
###
abstract type ReadDatastore{S<:BioSequence} end

###
### Concrete ReadDatastore type definitions
###
include("paired-reads.jl")
include("long-reads.jl")
include("sequence-buffer.jl")

###
### ReadDatastore generics
###
@inline buffer(ds::ReadDatastore) = SequenceBuffer(ds)

Base.firstindex(ds::ReadDatastore) = 1
Base.lastindex(ds::ReadDatastore) = length(ds)
Base.eachindex(ds::ReadDatastore) = Base.OneTo(lastindex(ds))
Base.IteratorSize(ds::ReadDatastore) = Base.HasLength()
Base.IteratorEltype(ds::ReadDatastore) = Base.HasEltype()
Base.eltype(ds::ReadDatastore{T}) where {T} = T
Base.close(ds::ReadDatastore) = close(stream(ds))

@inline function Base.checkbounds(ds::ReadDatastore, i::Integer)
if firstindex(ds) i lastindex(ds)
return true
end
throw(BoundsError(ds, i))
end

@inline function Base.iterate(ds::ReadDatastore, state = 1)
@inbounds if firstindex(ds) state lastindex(ds)
return ds[state], state + 1
else
return nothing
end
end

function Base.open(f::Function, ::Type{T}, filepath::AbstractString) where {T<:ReadDatastore}
ds = open(T, filepath)
try
f(ds)
finally
close(ds)
end
end

## Loading a sequence from a datastore - generic internals

function _inbounds_index_of_sequence end

"""
_load_sequence_data!(prds::PairedReads, seq::LongSequence{DNAAlphabet{4}})
Reads the sequence data from the current position of a datastore's stream, into
a destination sequence `seq`.
This function knows how much data to read from the stream, based on the sequence's
data blob (i.e. `sizeof(BioSequences.encoded_data(seq))`).
!!! warning
Uses `unsafe_read`, makes the following assumptions - does not check any of them.
- The stream is at the correct position, such that an `unsafe_read` will
read the entirety of the sequence before hitting the end of file.
Hitting EOF should result in an error being thrown anyway so...
- The destination sequence is currently the correct size and has been resized
appropriately before this function has been called.
- The data in the stream of the datastore is encoded appropriately for the
sequence type.
"""
function _load_sequence_data!(ds::ReadDatastore{T}, seq::T) where {T<:LongSequence}
seqdata = BioSequences.encoded_data(seq)
GC.@preserve seqdata unsafe_read(stream(ds), pointer(seqdata), sizeof(seqdata))
return seq
end

function _load_sequence_from_file_pos!(ds::ReadDatastore{T}, pos::Integer, seq::T) where {T<:LongSequence}
seek(stream(ds), pos)
seqlen = read(stream(ds), UInt64)
resize!(seq, seqlen)
return _load_sequence_data!(ds, seq)
end

function _load_sequence_from_file_pos!(ds::ReadDatastore{T}, pos::ReadPosSize, seq::T) where {T<:LongSequence}
seek(stream(ds), pos.offset)
resize!(seq, pos.sequence_size)
return _load_sequence_data!(ds, seq)
end


@inline function inbounds_load_sequence!(ds::ReadDatastore{T}, i::Integer, seq::T) where {T<:LongSequence}
pos = _inbounds_index_of_sequence(ds, i)
return _load_sequence_from_file_pos!(ds, pos, seq)
end

@inline function load_sequence!(ds::ReadDatastore{T}, i::Integer, seq::T) where {T<:LongSequence}
checkbounds(ds, i)
return inbounds_load_sequence!(ds, i, seq)
end

end # module
84 changes: 16 additions & 68 deletions src/long-reads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,21 @@ end

Base.:(==)(x::ReadPosSize, y::ReadPosSize) = x.offset == y.offset && x.sequence_size == y.sequence_size

struct LongReadDatastore
struct LongReads <: ReadDatastore{LongSequence{DNAAlphabet{4}}}
filename::String
name::String
default_name::String
read_to_file_positions::Vector{ReadPosSize}
stream::IO
end

const LRDS = LongReadDatastore

index(lrds::LRDS) = lrds.read_to_file_positions
@inline stream(lrds::LongReads) = lrds.stream
index(lrds::LongReads) = lrds.read_to_file_positions

const LongDS_Version = 0x0001

###
### LongReadDatastore Header
### LongReads Header
###

# | Field | Value | Type |
Expand All @@ -31,13 +30,13 @@ const LongDS_Version = 0x0001
# | Index position in file | N/A | UInt64 | 8
# | Default name of the datastore | N/A | String | N

function LongReadDatastore(rdr::FASTQ.Reader, outfile::String, name::String, min_size::UInt64)
function LongReads(rdr::FASTQ.Reader, outfile::String, name::String, min_size::UInt64)
discarded = 0

read_to_file_position = Vector{ReadPosSize}()
ofs = open(outfile, "w")

write(ofs, SeqDataStoreMAGIC, LongDS, LongDS_Version, zero(UInt64))
write(ofs, ReadDatastoreMAGIC, LongDS, LongDS_Version, zero(UInt64))

writestring(ofs, name)

Expand Down Expand Up @@ -72,93 +71,42 @@ function LongReadDatastore(rdr::FASTQ.Reader, outfile::String, name::String, min
write_flat_vector(ofs, read_to_file_position)

# Go to the top and dump the number of reads and the position of the index.
seek(ofs, sizeof(SeqDataStoreMAGIC) + sizeof(Filetype) + sizeof(LongDS_Version))
seek(ofs, sizeof(ReadDatastoreMAGIC) + sizeof(Filetype) + sizeof(LongDS_Version))
write(ofs, fpos)
close(ofs)

@info string("Built long read datastore with ", length(read_to_file_position), " reads")

stream = open(outfile, "r+")
return LongReadDatastore(outfile, name, name, read_to_file_position, stream)
return LongReads(outfile, name, name, read_to_file_position, stream)
end

function Base.open(::Type{LongReadDatastore}, filename::String)
function Base.open(::Type{LongReads}, filename::String)
fd = open(filename, "r")
magic = read(fd, UInt16)
dstype = reinterpret(Filetype, read(fd, UInt16))
version = read(fd, UInt16)

@assert magic == SeqDataStoreMAGIC
@assert magic == ReadDatastoreMAGIC
@assert dstype == LongDS
@assert version == LongDS_Version

fpos = read(fd, UInt64)

default_name = readuntil(fd, '\0')

seek(fd, fpos)

read_to_file_position = read_flat_vector(fd, ReadPosSize)

return LongReadDatastore(filename, default_name, default_name, read_to_file_position, fd)
return LongReads(filename, default_name, default_name, read_to_file_position, fd)
end

###
### Getting a sequence
###

Base.length(lrds::LRDS) = length(lrds.read_to_file_positions)

Base.firstindex(lrds::LRDS) = 1
Base.lastindex(lrds::LRDS) = length(lrds)
Base.eachindex(lrds::LRDS) = Base.OneTo(lastindex(lrds))

@inline function Base.checkbounds(lrds::LRDS, i::Integer)
if firstindex(lrds) i lastindex(lrds)
return true
end
throw(BoundsError(lrds, i))
end

@inbounds inbounds_position_and_size(lrds::LRDS, idx::Integer) = @inbounds lrds.read_to_file_positions[idx]

@inbounds function position_and_size(lrds::LRDS, idx::Integer)
checkbounds(lrds, idx)
return inbounds_position_and_size(lrds, idx)
end

@inline function unsafe_load_read!(lrds::LRDS, pos_size::ReadPosSize, seq::LongSequence{DNAAlphabet{4}})
seek(lrds.stream, pos_size.offset)
resize!(seq, pos_size.sequence_size)
unsafe_read(lrds.stream, pointer(seq.data), length(seq.data) * sizeof(UInt64))
return seq
end
Base.length(lrds::LongReads) = length(lrds.read_to_file_positions)

@inline function inbounds_load_read!(lrds::LRDS, idx::Integer, seq::LongSequence{DNAAlphabet{4}})
pos_size = inbounds_position_and_size(lrds, idx)
return unsafe_load_read!(lrds, pos_size, seq)
end
@inline _inbounds_index_of_sequence(lrds::LongReads, idx::Integer) = @inbounds lrds.read_to_file_positions[idx]

@inline function load_read!(lrds::LRDS, idx::Integer, seq::LongSequence{DNAAlphabet{4}})
checkbounds(lrds, idx)
return inbounds_load_read!(lrds, idx, seq)
end

@inline function Base.getindex(lrds::LRDS, idx::Integer)
@inline function Base.getindex(lrds::LongReads, idx::Integer)
@boundscheck checkbounds(lrds, idx)
pos_size = inbounds_position_and_size(lrds, idx)
pos_size = _inbounds_index_of_sequence(lrds, idx)
seq = LongDNASeq(pos_size.sequence_size)
return unsafe_load_read!(lrds, pos_size, seq)
return _load_sequence_from_file_pos!(lrds, pos_size, seq)
end

Base.IteratorSize(lrds::LRDS) = Base.HasLength()
Base.IteratorEltype(lrds::LRDS) = Base.HasEltype()
Base.eltype(lrds::LRDS) = LongSequence{DNAAlphabet{4}}

@inline function Base.iterate(lrds::LRDS, state = 1)
@inbounds if firstindex(lrds) state lastindex(lrds)
return lrds[state], state + 1
else
return nothing
end
end
Loading

0 comments on commit 2e7e4fc

Please sign in to comment.