Skip to content

Commit

Permalink
ARROW-10946: [Rust] Simplified bit chunk iterator
Browse files Browse the repository at this point in the history
This PR makes the bit chunk iterator not depend on a `Buffer`, since we were only using it to keep track of the lifetime, and we can just use a byte slice for it.

Closes apache#8942 from jorgecarleitao/improve_chunk_iter

Authored-by: Jorge C. Leitao <jorgecarleitao@gmail.com>
Signed-off-by: Neville Dipale <nevilledips@gmail.com>
  • Loading branch information
jorgecarleitao authored and Dandandan committed Dec 22, 2020
1 parent 97a95fb commit a7cb4ce
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 14 deletions.
2 changes: 1 addition & 1 deletion rust/arrow/src/buffer.rs
Expand Up @@ -183,7 +183,7 @@ impl Buffer {
/// in larger chunks and starting at arbitrary bit offsets.
/// Note that both `offset` and `length` are measured in bits.
pub fn bit_chunks(&self, offset: usize, len: usize) -> BitChunks {
BitChunks::new(&self, offset, len)
BitChunks::new(&self.data.as_slice()[self.offset..], offset, len)
}

/// Returns the number of 1-bits in this buffer.
Expand Down
20 changes: 7 additions & 13 deletions rust/arrow/src/util/bit_chunk_iterator.rs
Expand Up @@ -14,14 +14,12 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::buffer::Buffer;
use crate::util::bit_util::ceil;
use std::fmt::Debug;

#[derive(Debug)]
pub struct BitChunks<'a> {
buffer: &'a Buffer,
raw_data: *const u8,
buffer: &'a [u8],
/// offset inside a byte, guaranteed to be between 0 and 7 (inclusive)
bit_offset: usize,
/// number of complete u64 chunks
Expand All @@ -31,22 +29,19 @@ pub struct BitChunks<'a> {
}

impl<'a> BitChunks<'a> {
pub fn new(buffer: &'a Buffer, offset: usize, len: usize) -> Self {
pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
assert!(ceil(offset + len, 8) <= buffer.len() * 8);

let byte_offset = offset / 8;
let bit_offset = offset % 8;

let raw_data = unsafe { buffer.raw_data().add(byte_offset) };

let chunk_bits = 8 * std::mem::size_of::<u64>();

let chunk_len = len / chunk_bits;
let remainder_len = len & (chunk_bits - 1);

BitChunks::<'a> {
buffer: &buffer,
raw_data,
buffer: &buffer[byte_offset..],
bit_offset,
chunk_len,
remainder_len,
Expand All @@ -56,8 +51,7 @@ impl<'a> BitChunks<'a> {

#[derive(Debug)]
pub struct BitChunkIterator<'a> {
buffer: &'a Buffer,
raw_data: *const u8,
buffer: &'a [u8],
bit_offset: usize,
chunk_len: usize,
index: usize,
Expand All @@ -83,7 +77,8 @@ impl<'a> BitChunks<'a> {
let byte_len = ceil(bit_len + bit_offset, 8);
// pointer to remainder bytes after all complete chunks
let base = unsafe {
self.raw_data
self.buffer
.as_ptr()
.add(self.chunk_len * std::mem::size_of::<u64>())
};

Expand All @@ -102,7 +97,6 @@ impl<'a> BitChunks<'a> {
pub const fn iter(&self) -> BitChunkIterator<'a> {
BitChunkIterator::<'a> {
buffer: self.buffer,
raw_data: self.raw_data,
bit_offset: self.bit_offset,
chunk_len: self.chunk_len,
index: 0,
Expand Down Expand Up @@ -131,7 +125,7 @@ impl Iterator for BitChunkIterator<'_> {

// cast to *const u64 should be fine since we are using read_unaligned below
#[allow(clippy::cast_ptr_alignment)]
let raw_data = self.raw_data as *const u64;
let raw_data = self.buffer.as_ptr() as *const u64;

// bit-packed buffers are stored starting with the least-significant byte first
// so when reading as u64 on a big-endian machine, the bytes need to be swapped
Expand Down

0 comments on commit a7cb4ce

Please sign in to comment.