Skip to content
This repository has been archived by the owner on Jul 7, 2023. It is now read-only.

Commit

Permalink
progress
Browse files Browse the repository at this point in the history
  • Loading branch information
BurntSushi committed Oct 20, 2020
1 parent fa7c2c7 commit 4555a4e
Show file tree
Hide file tree
Showing 12 changed files with 813 additions and 272 deletions.
2 changes: 2 additions & 0 deletions PLANS.md
Expand Up @@ -81,6 +81,8 @@ Straw man:

At this point, I think the bug should resolve itself.

^^^^ DONE! IT WORKS!

-----


Expand Down
13 changes: 8 additions & 5 deletions src/bytes.rs
Expand Up @@ -273,11 +273,14 @@ pub fn skip_initial_padding(slice: &[u8]) -> usize {
///
/// In practice, padding is often zero.
pub fn alloc_aligned_buffer<S: StateID>(size: usize) -> (Vec<u8>, usize) {
// TODO: This is a kludge because there's no easy way to allocate a Vec<u8>
// with an alignment guaranteed to be greater than 1. We could create a
// Vec<usize>, but this cannot be safely transmuted to a Vec<u8> without
// concern, since reallocing or dropping the Vec<u8> is UB (different
// alignment than the initial allocation).
// FIXME: This is a kludge because there's no easy way to allocate a
// Vec<u8> with an alignment guaranteed to be greater than 1. We could
// create a Vec<usize>, but this cannot be safely transmuted to a Vec<u8>
// without concern, since reallocing or dropping the Vec<u8> is UB
// (different alignment than the initial allocation). It's plausible
// that if there was a reliable way to create a Vec<u8> with a different
// alignment, then other aspects of this library could be simplified as
// well.
let mut buf = vec![0; size];
let align = core::mem::align_of::<S>();
let address = buf.as_ptr() as usize;
Expand Down
91 changes: 48 additions & 43 deletions src/dfa/dense.rs
Expand Up @@ -590,7 +590,10 @@ impl Default for Builder {
}

/// A convenience alias for an owned DFA. We use this particular instantiation
/// a lot in this crate, so it's worth giving it a name.
/// a lot in this crate, so it's worth giving it a name. This instantiation
/// is commonly used for mutable APIs on the DFA while building it. The main
/// reason for making it generic is no_std support, and more generally, making
/// it possible to load a DFA from an arbitrary slice of bytes.
pub(crate) type OwnedDFA<S> = DFA<Vec<S>, Vec<u8>, S>;

/// A dense table-based deterministic finite automaton (DFA).
Expand Down Expand Up @@ -695,8 +698,8 @@ pub struct DFA<T, A, S = usize> {
///
/// If a state is accelerated, then there exist only a small number of
/// bytes that can cause the DFA to leave the state. This permits searching
/// to use optimized routines to find those specific bytes instead of
/// using the transition table.
/// to use optimized routines to find those specific bytes instead of using
/// the transition table.
///
/// All accelerated states exist in a contiguous range in the DFA's
/// transition table. See dfa/special.rs for more details on how states are
Expand Down Expand Up @@ -1974,8 +1977,10 @@ impl<S: StateID> OwnedDFA<S> {
) {
// The determinizer always adds a quit state and it is always second.
self.special.quit_id = self.from_index(1);
// If all we have are the dead and quit states, then we're done.
// If all we have are the dead and quit states, then we're done and
// the DFA will never produce a match.
if self.tt.count <= 2 {
self.special.set_max();
return;
}

Expand Down Expand Up @@ -2961,6 +2966,23 @@ pub struct StartTable<T, S> {
_state_id: PhantomData<S>,
}

impl<S: StateID> StartTable<Vec<S>, S> {
/// Create a valid set of start states all pointing to the dead state.
///
/// When the corresponding DFA is constructed with start states for each
/// pattern, then `patterns` should be the number of patterns. Otherwise,
/// it should be zero.
///
/// The returned table has `stride + (stride * patterns)` entries, where
/// `stride` is `Start::count()`: one group of `stride` entries for the DFA
/// as a whole, followed by one group of `stride` entries for each pattern.
fn dead(patterns: usize) -> StartTable<Vec<S>, S> {
// The stride is the number of start state types.
let stride = Start::count();
StartTable {
// Every entry begins as the dead state ID.
table: vec![dead_id(); stride + (stride * patterns)],
stride,
patterns,
_state_id: PhantomData,
}
}
}

impl<'a, S: StateID> StartTable<&'a [S], S> {
/// Deserialize a table of start state IDs starting at the beginning of
/// `slice`. Upon success, return the total number of bytes read along with
Expand Down Expand Up @@ -3001,15 +3023,25 @@ impl<'a, S: StateID> StartTable<&'a [S], S> {
"invalid starting table stride",
));
}
// TODO: It feels weird to invoke thompson's pattern limit here.
// Maybe pattern limit should be defined at the top-level?
if patterns > crate::nfa::thompson::pattern_limit() {
if patterns > crate::pattern_limit() {
return Err(DeserializeError::generic(
"invalid number of patterns",
));
}
let count =
stride.checked_add(stride.checked_mul(patterns).unwrap()).unwrap();
let pattern_table_size = match stride.checked_mul(patterns) {
Some(x) => x,
None => {
return Err(DeserializeError::generic("invalid pattern count"))
}
};
let count = match stride.checked_add(pattern_table_size) {
Some(x) => x,
None => {
return Err(DeserializeError::generic(
"invalid pattern+stride",
))
}
};
let table_bytes_len = count * core::mem::size_of::<S>();
let nread = 16 + table_bytes_len;
if slice.len() < table_bytes_len {
Expand All @@ -3032,23 +3064,6 @@ impl<'a, S: StateID> StartTable<&'a [S], S> {
}
}

impl<S: StateID> StartTable<Vec<S>, S> {
/// Create a valid set of start states all pointing to the dead state.
///
/// When the corresponding DFA is constructed with start states for each
/// pattern, then `patterns` should be the number of patterns. Otherwise,
/// it should be zero.
///
/// The returned table has `stride + (stride * patterns)` entries, where
/// `stride` is `Start::count()`: one group of `stride` entries for the DFA
/// as a whole, followed by one group of `stride` entries for each pattern.
fn dead(patterns: usize) -> StartTable<Vec<S>, S> {
// The stride is the number of start state types.
let stride = Start::count();
StartTable {
// Every entry begins as the dead state ID.
table: vec![dead_id(); stride + (stride * patterns)],
stride,
patterns,
_state_id: PhantomData,
}
}
}

impl<T: AsRef<[S]>, S: StateID> StartTable<T, S> {
/// Writes a serialized form of this start table to the buffer given. If
/// the buffer is too small, then an error is returned. To determine how
Expand Down Expand Up @@ -3138,12 +3153,7 @@ impl<T: AsRef<[S]>, S: StateID> StartTable<T, S> {
if max_state_id > S2::max_id() {
return Err(Error::state_id_overflow(S2::max_id()));
}
let mut st = StartTable {
table: vec![dead_id::<S2>(); self.table().len()],
stride: self.stride,
patterns: self.patterns,
_state_id: PhantomData,
};
let mut st = StartTable::dead(self.patterns);
for (i, id) in st.table.iter_mut().enumerate() {
// This is always correct since we've verified above that the
// maximum state ID can fit into S2.
Expand Down Expand Up @@ -3218,12 +3228,7 @@ impl<T: AsRef<[S]>, S: StateID> StartTable<T, S> {
}

impl<T: AsMut<[S]>, S: StateID> StartTable<T, S> {
/// Return the start state for the given index and pattern ID. If the
/// pattern ID is None, then the corresponding start state for the entire
/// DFA is returned. If the pattern ID is not None, then the corresponding
/// starting state for the given pattern is returned. If this start table
/// does not have individual starting states for each pattern, then this
/// panics.
/// Set the start state for the given index and pattern.
fn set_start(
&mut self,
index: Start,
Expand Down Expand Up @@ -3272,12 +3277,12 @@ impl<'a, S: StateID> Iterator for StartStateIter<'a, S> {
// This unwrap is okay since the stride of any DFA must always match
// the number of start state types.
let start_type = Start::from_usize(i % self.st.stride).unwrap();
if i < self.st.stride {
Some((table[i], start_type, None))
let pid = if i < self.st.stride {
None
} else {
let pid = (i - self.st.stride) / self.st.stride;
Some((table[i], start_type, Some(pid as u32)))
}
Some(((i - self.st.stride) / self.st.stride) as u32)
};
Some((table[i], start_type, pid))
}
}

Expand Down
14 changes: 6 additions & 8 deletions src/dfa/determinize.rs
Expand Up @@ -277,13 +277,11 @@ impl<'a, S: StateID> Runner<'a, S> {
| thompson::State::Fail
| thompson::State::Look { .. } => {}
thompson::State::Match(mid) => {
// TODO: Make this work. Currently this fails with
// MatchStates serialization. Think of something elegant.
// if self.nfa.match_len() <= 1 {
// facts.state.matches = Matches::One;
// } else {
facts.state.matches.add(mid);
// }
if self.nfa.match_len() <= 1 {
facts.state.matches = Matches::One;
} else {
facts.state.matches.add(mid);
}
if !self.continue_past_first_match() {
break;
}
Expand Down Expand Up @@ -658,7 +656,7 @@ impl Matches {
fn into_vec(self) -> Option<Vec<PatternID>> {
match self {
Matches::None => None,
Matches::One => Some(vec![]),
Matches::One => Some(vec![0]),
Matches::Many(pids) => Some(pids),
}
}
Expand Down
12 changes: 9 additions & 3 deletions src/dfa/search.rs
Expand Up @@ -14,7 +14,9 @@ pub fn find_earliest_fwd<A: Automaton + ?Sized>(
start: usize,
end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
if pre.is_some() {
// Searching with a pattern ID is always anchored, so we should never use
// a prefilter.
if pre.is_some() && pattern_id.is_none() {
find_fwd(pre, true, dfa, pattern_id, bytes, start, end)
} else {
find_fwd(None, true, dfa, pattern_id, bytes, start, end)
Expand All @@ -30,7 +32,9 @@ pub fn find_leftmost_fwd<A: Automaton + ?Sized>(
start: usize,
end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
if pre.is_some() {
// Searching with a pattern ID is always anchored, so we should never use
// a prefilter.
if pre.is_some() && pattern_id.is_none() {
find_fwd(pre, false, dfa, pattern_id, bytes, start, end)
} else {
find_fwd(None, false, dfa, pattern_id, bytes, start, end)
Expand Down Expand Up @@ -238,7 +242,9 @@ pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
end: usize,
caller_state: &mut State<A::ID>,
) -> Result<Option<HalfMatch>, MatchError> {
if pre.is_some() {
// Searching with a pattern ID is always anchored, so we should never use
// a prefilter.
if pre.is_some() && pattern_id.is_none() {
find_overlapping_fwd_imp(
pre,
dfa,
Expand Down

0 comments on commit 4555a4e

Please sign in to comment.