From fe73d382eeabaed0c37425388ab15ad690e5906f Mon Sep 17 00:00:00 2001 From: Kang Seonghoon Date: Tue, 3 Mar 2015 00:34:50 +0900 Subject: [PATCH] metadata: Compact integer encoding. Previously every auto-serialized tags are strongly typed. However this is not strictly required, and instead it can be exploited to provide the optimal encoding for smaller integers. This commit repurposes `EsI8`/`EsU8` through `EsI64`/`EsU64` tags to represent *any* integers with given ranges: It is now possible to encode `42u64` as two bytes `EsU8 0x2a`, for example. There are some limitations: * It does not apply to non-auto-serialized tags for obvious reasons. Fortunately, we have already eliminated the biggest source of such tag in favor of auto-serialized tags: `tag_table_id`. * Bigger tags cannot be used to represent smaller types. * Signed tags and unsigned tags do not mix. --- src/librbml/lib.rs | 207 +++++++++++++++++++++++++-------------------- 1 file changed, 114 insertions(+), 93 deletions(-) diff --git a/src/librbml/lib.rs b/src/librbml/lib.rs index fee9a69897e0e..c13aeb4cd614b 100644 --- a/src/librbml/lib.rs +++ b/src/librbml/lib.rs @@ -87,39 +87,37 @@ pub enum EbmlEncoderTag { // tags 00..1f are reserved for auto-serialization. // first NUM_IMPLICIT_TAGS tags are implicitly sized and lengths are not encoded. - EsUint = 0x00, // + 8 bytes - EsU64 = 0x01, // + 8 bytes - EsU32 = 0x02, // + 4 bytes - EsU16 = 0x03, // + 2 bytes - EsU8 = 0x04, // + 1 byte - EsInt = 0x05, // + 8 bytes - EsI64 = 0x06, // + 8 bytes - EsI32 = 0x07, // + 4 bytes - EsI16 = 0x08, // + 2 bytes - EsI8 = 0x09, // + 1 byte - EsBool = 0x0a, // + 1 byte - EsChar = 0x0b, // + 4 bytes - EsF64 = 0x0c, // + 8 bytes - EsF32 = 0x0d, // + 4 bytes - EsSub8 = 0x0e, // + 1 byte - EsSub32 = 0x0f, // + 4 bytes - - EsStr = 0x10, - EsEnum = 0x11, // encodes the variant id as the first EsSub* - EsVec = 0x12, // encodes the # of elements as the first EsSub* - EsVecElt = 0x13, - EsMap = 0x14, // encodes the # of pairs as the first EsSub* - EsMapKey = 0x15, - EsMapVal = 0x16, - EsOpaque = 0x17, + EsU64 = 0x00, // + 8 bytes + EsU32 = 0x01, // + 4 bytes + EsU16 = 0x02, // + 2 bytes + EsU8 = 0x03, // + 1 byte + EsI64 = 0x04, // + 8 bytes + EsI32 = 0x05, // + 4 bytes + EsI16 = 0x06, // + 2 bytes + EsI8 = 0x07, // + 1 byte + EsBool = 0x08, // + 1 byte + EsChar = 0x09, // + 4 bytes + EsF64 = 0x0a, // + 8 bytes + EsF32 = 0x0b, // + 4 bytes + EsSub8 = 0x0c, // + 1 byte + EsSub32 = 0x0d, // + 4 bytes + + EsStr = 0x0e, + EsEnum = 0x0f, // encodes the variant id as the first EsSub* + EsVec = 0x10, // encodes the # of elements as the first EsSub* + EsVecElt = 0x11, + EsMap = 0x12, // encodes the # of pairs as the first EsSub* + EsMapKey = 0x13, + EsMapVal = 0x14, + EsOpaque = 0x15, } const NUM_TAGS: uint = 0x1000; -const NUM_IMPLICIT_TAGS: uint = 0x10; +const NUM_IMPLICIT_TAGS: uint = 0x0e; static TAG_IMPLICIT_LEN: [i8; NUM_IMPLICIT_TAGS] = [ - 8, 8, 4, 2, 1, // EsU* - 8, 8, 4, 2, 1, // ESI* + 8, 4, 2, 1, // EsU* + 8, 4, 2, 1, // ESI* 1, // EsBool 4, // EsChar 8, 4, // EsF* @@ -154,9 +152,9 @@ pub mod reader { use serialize; use super::{ ApplicationError, EsVec, EsMap, EsEnum, EsSub8, EsSub32, - EsVecElt, EsMapKey, EsU64, EsU32, EsU16, EsU8, EsInt, EsI64, + EsVecElt, EsMapKey, EsU64, EsU32, EsU16, EsU8, EsI64, EsI32, EsI16, EsI8, EsBool, EsF64, EsF32, EsChar, EsStr, EsMapVal, - EsUint, EsOpaque, EbmlEncoderTag, Doc, TaggedDoc, + EsOpaque, EbmlEncoderTag, Doc, TaggedDoc, Error, IntTooBig, InvalidTag, Expected, NUM_IMPLICIT_TAGS, TAG_IMPLICIT_LEN }; pub type DecodeResult = Result; @@ -420,37 +418,6 @@ pub mod reader { Ok(r_doc) } - fn next_doc2(&mut self, - exp_tag1: EbmlEncoderTag, - exp_tag2: EbmlEncoderTag) -> DecodeResult<(bool, Doc<'doc>)> { - assert!((exp_tag1 as uint) != (exp_tag2 as uint)); - debug!(". next_doc2(exp_tag1={:?}, exp_tag2={:?})", exp_tag1, exp_tag2); - if self.pos >= self.parent.end { - return Err(Expected(format!("no more documents in \ - current node!"))); - } - let TaggedDoc { tag: r_tag, doc: r_doc } = - try!(doc_at(self.parent.data, self.pos)); - debug!("self.parent={:?}-{:?} self.pos={:?} r_tag={:?} r_doc={:?}-{:?}", - self.parent.start, - self.parent.end, - self.pos, - r_tag, - r_doc.start, - r_doc.end); - if r_tag != (exp_tag1 as uint) && r_tag != (exp_tag2 as uint) { - return Err(Expected(format!("expected EBML doc with tag {:?} or {:?} but \ - found tag {:?}", exp_tag1, exp_tag2, r_tag))); - } - if r_doc.end > self.parent.end { - return Err(Expected(format!("invalid EBML, child extends to \ - {:#x}, parent to {:#x}", - r_doc.end, self.parent.end))); - } - self.pos = r_doc.end; - Ok((r_tag == (exp_tag2 as uint), r_doc)) - } - fn push_doc(&mut self, exp_tag: EbmlEncoderTag, f: F) -> DecodeResult where F: FnOnce(&mut Decoder<'doc>) -> DecodeResult, { @@ -471,16 +438,59 @@ pub mod reader { return Ok(0); } - let (big, doc) = try!(self.next_doc2(EsSub8, EsSub32)); - let r = if big { - doc_as_u32(doc) as uint + let TaggedDoc { tag: r_tag, doc: r_doc } = + try!(doc_at(self.parent.data, self.pos)); + let r = if r_tag == (EsSub8 as uint) { + doc_as_u8(r_doc) as uint + } else if r_tag == (EsSub32 as uint) { + doc_as_u32(r_doc) as uint } else { - doc_as_u8(doc) as uint + return Err(Expected(format!("expected EBML doc with tag {:?} or {:?} but \ + found tag {:?}", EsSub8, EsSub32, r_tag))); }; + if r_doc.end > self.parent.end { + return Err(Expected(format!("invalid EBML, child extends to \ + {:#x}, parent to {:#x}", + r_doc.end, self.parent.end))); + } + self.pos = r_doc.end; debug!("_next_sub result={:?}", r); Ok(r) } + // variable-length unsigned integer with different tags + fn _next_int(&mut self, + first_tag: EbmlEncoderTag, + last_tag: EbmlEncoderTag) -> DecodeResult { + if self.pos >= self.parent.end { + return Err(Expected(format!("no more documents in \ + current node!"))); + } + + let TaggedDoc { tag: r_tag, doc: r_doc } = + try!(doc_at(self.parent.data, self.pos)); + let r = if first_tag as uint <= r_tag && r_tag <= last_tag as uint { + match last_tag as uint - r_tag { + 0 => doc_as_u8(r_doc) as u64, + 1 => doc_as_u16(r_doc) as u64, + 2 => doc_as_u32(r_doc) as u64, + 3 => doc_as_u64(r_doc) as u64, + _ => unreachable!(), + } + } else { + return Err(Expected(format!("expected EBML doc with tag {:?} through {:?} but \ + found tag {:?}", first_tag, last_tag, r_tag))); + }; + if r_doc.end > self.parent.end { + return Err(Expected(format!("invalid EBML, child extends to \ + {:#x}, parent to {:#x}", + r_doc.end, self.parent.end))); + } + self.pos = r_doc.end; + debug!("_next_int({:?}, {:?}) result={:?}", first_tag, last_tag, r); + Ok(r) + } + pub fn read_opaque(&mut self, op: F) -> DecodeResult where F: FnOnce(&mut Decoder, Doc) -> DecodeResult, { @@ -502,12 +512,12 @@ pub mod reader { type Error = Error; fn read_nil(&mut self) -> DecodeResult<()> { Ok(()) } - fn read_u64(&mut self) -> DecodeResult { Ok(doc_as_u64(try!(self.next_doc(EsU64)))) } - fn read_u32(&mut self) -> DecodeResult { Ok(doc_as_u32(try!(self.next_doc(EsU32)))) } - fn read_u16(&mut self) -> DecodeResult { Ok(doc_as_u16(try!(self.next_doc(EsU16)))) } - fn read_u8 (&mut self) -> DecodeResult { Ok(doc_as_u8 (try!(self.next_doc(EsU8 )))) } + fn read_u64(&mut self) -> DecodeResult { self._next_int(EsU64, EsU8) } + fn read_u32(&mut self) -> DecodeResult { Ok(try!(self._next_int(EsU32, EsU8)) as u32) } + fn read_u16(&mut self) -> DecodeResult { Ok(try!(self._next_int(EsU16, EsU8)) as u16) } + fn read_u8(&mut self) -> DecodeResult { Ok(doc_as_u8(try!(self.next_doc(EsU8)))) } fn read_uint(&mut self) -> DecodeResult { - let v = doc_as_u64(try!(self.next_doc(EsUint))); + let v = try!(self._next_int(EsU64, EsU8)); if v > (::std::usize::MAX as u64) { Err(IntTooBig(v as uint)) } else { @@ -515,20 +525,12 @@ pub mod reader { } } - fn read_i64(&mut self) -> DecodeResult { - Ok(doc_as_u64(try!(self.next_doc(EsI64))) as i64) - } - fn read_i32(&mut self) -> DecodeResult { - Ok(doc_as_u32(try!(self.next_doc(EsI32))) as i32) - } - fn read_i16(&mut self) -> DecodeResult { - Ok(doc_as_u16(try!(self.next_doc(EsI16))) as i16) - } - fn read_i8 (&mut self) -> DecodeResult { - Ok(doc_as_u8(try!(self.next_doc(EsI8 ))) as i8) - } + fn read_i64(&mut self) -> DecodeResult { Ok(try!(self._next_int(EsI64, EsI8)) as i64) } + fn read_i32(&mut self) -> DecodeResult { Ok(try!(self._next_int(EsI32, EsI8)) as i32) } + fn read_i16(&mut self) -> DecodeResult { Ok(try!(self._next_int(EsI16, EsI8)) as i16) } + fn read_i8(&mut self) -> DecodeResult { Ok(doc_as_u8(try!(self.next_doc(EsI8))) as i8) } fn read_int(&mut self) -> DecodeResult { - let v = doc_as_u64(try!(self.next_doc(EsInt))) as i64; + let v = try!(self._next_int(EsI64, EsI8)) as i64; if v > (isize::MAX as i64) || v < (isize::MIN as i64) { debug!("FIXME \\#6122: Removing this makes this function miscompile"); Err(IntTooBig(v as uint)) @@ -739,10 +741,11 @@ pub mod writer { use std::old_io::{Writer, Seek}; use std::old_io; use std::slice::bytes; + use std::num::ToPrimitive; use super::{ EsVec, EsMap, EsEnum, EsSub8, EsSub32, EsVecElt, EsMapKey, - EsU64, EsU32, EsU16, EsU8, EsInt, EsI64, EsI32, EsI16, EsI8, - EsBool, EsF64, EsF32, EsChar, EsStr, EsMapVal, EsUint, + EsU64, EsU32, EsU16, EsU8, EsI64, EsI32, EsI16, EsI8, + EsBool, EsF64, EsF32, EsChar, EsStr, EsMapVal, EsOpaque, NUM_IMPLICIT_TAGS, NUM_TAGS }; use super::io::SeekableMemWriter; @@ -1010,32 +1013,50 @@ pub mod writer { } fn emit_uint(&mut self, v: uint) -> EncodeResult { - self.wr_tagged_raw_u64(EsUint as uint, v as u64) + self.emit_u64(v as u64) } fn emit_u64(&mut self, v: u64) -> EncodeResult { - self.wr_tagged_raw_u64(EsU64 as uint, v) + match v.to_u32() { + Some(v) => self.emit_u32(v), + None => self.wr_tagged_raw_u64(EsU64 as uint, v) + } } fn emit_u32(&mut self, v: u32) -> EncodeResult { - self.wr_tagged_raw_u32(EsU32 as uint, v) + match v.to_u16() { + Some(v) => self.emit_u16(v), + None => self.wr_tagged_raw_u32(EsU32 as uint, v) + } } fn emit_u16(&mut self, v: u16) -> EncodeResult { - self.wr_tagged_raw_u16(EsU16 as uint, v) + match v.to_u8() { + Some(v) => self.emit_u8(v), + None => self.wr_tagged_raw_u16(EsU16 as uint, v) + } } fn emit_u8(&mut self, v: u8) -> EncodeResult { self.wr_tagged_raw_u8(EsU8 as uint, v) } fn emit_int(&mut self, v: int) -> EncodeResult { - self.wr_tagged_raw_i64(EsInt as uint, v as i64) + self.emit_i64(v as i64) } fn emit_i64(&mut self, v: i64) -> EncodeResult { - self.wr_tagged_raw_i64(EsI64 as uint, v) + match v.to_i32() { + Some(v) => self.emit_i32(v), + None => self.wr_tagged_raw_i64(EsI64 as uint, v) + } } fn emit_i32(&mut self, v: i32) -> EncodeResult { - self.wr_tagged_raw_i32(EsI32 as uint, v) + match v.to_i16() { + Some(v) => self.emit_i16(v), + None => self.wr_tagged_raw_i32(EsI32 as uint, v) + } } fn emit_i16(&mut self, v: i16) -> EncodeResult { - self.wr_tagged_raw_i16(EsI16 as uint, v) + match v.to_i8() { + Some(v) => self.emit_i8(v), + None => self.wr_tagged_raw_i16(EsI16 as uint, v) + } } fn emit_i8(&mut self, v: i8) -> EncodeResult { self.wr_tagged_raw_i8(EsI8 as uint, v)