progress

BurntSushi · Oct 20, 2020 · 4555a4e · 4555a4e
1 parent fa7c2c7
commit 4555a4e
Show file tree

Hide file tree

Showing 12 changed files with 813 additions and 272 deletions.
diff --git a/PLANS.md b/PLANS.md
@@ -81,6 +81,8 @@ Straw man:
 
 At this point, I think the bug should resolve itself.
 
+^^^^ DONE! IT WORKS!
+
 -----
 
 

diff --git a/src/bytes.rs b/src/bytes.rs
@@ -273,11 +273,14 @@ pub fn skip_initial_padding(slice: &[u8]) -> usize {
 ///
 /// In practice, padding is often zero.
 pub fn alloc_aligned_buffer<S: StateID>(size: usize) -> (Vec<u8>, usize) {
-    // TODO: This is a kludge because there's no easy way to allocate a Vec<u8>
-    // with an alignment guaranteed to be greater than 1. We could create a
-    // Vec<usize>, but this cannot be safely transmuted to a Vec<u8> without
-    // concern, since reallocing or dropping the Vec<u8> is UB (different
-    // alignment than the initial allocation).
+    // FIXME: This is a kludge because there's no easy way to allocate a
+    // Vec<u8> with an alignment guaranteed to be greater than 1. We could
+    // create a Vec<usize>, but this cannot be safely transmuted to a Vec<u8>
+    // without concern, since reallocing or dropping the Vec<u8> is UB
+    // (different alignment than the initial allocation). It's plausible
+    // that if there were a reliable way to create a Vec<u8> with a different
+    // alignment, then other aspects of this library could be simplified as
+    // well.
     let mut buf = vec![0; size];
     let align = core::mem::align_of::<S>();
     let address = buf.as_ptr() as usize;

diff --git a/src/dfa/dense.rs b/src/dfa/dense.rs
@@ -590,7 +590,10 @@ impl Default for Builder {
 }
 
 /// A convenience alias for an owned DFA. We use this particular instantiation
-/// a lot in this crate, so it's worth giving it a name.
+/// a lot in this crate, so it's worth giving it a name. This instantiation
+/// is commonly used for mutable APIs on the DFA while building it. The main
+/// reason for making `DFA` generic is no_std support, and more generally,
+/// making it possible to load a DFA from an arbitrary slice of bytes.
 pub(crate) type OwnedDFA<S> = DFA<Vec<S>, Vec<u8>, S>;
 
 /// A dense table-based deterministic finite automaton (DFA).
@@ -695,8 +698,8 @@ pub struct DFA<T, A, S = usize> {
     ///
     /// If a state is accelerated, then there exist only a small number of
     /// bytes that can cause the DFA to leave the state. This permits searching
-    /// to use optimized routines to find those specific bytes instead of
-    /// using the transition table.
+    /// to use optimized routines to find those specific bytes instead of using
+    /// the transition table.
     ///
     /// All accelerated states exist in a contiguous range in the DFA's
     /// transition table. See dfa/special.rs for more details on how states are
@@ -1974,8 +1977,10 @@ impl<S: StateID> OwnedDFA<S> {
     ) {
         // The determinizer always adds a quit state and it is always second.
         self.special.quit_id = self.from_index(1);
-        // If all we have are the dead and quit states, then we're done.
+        // If all we have are the dead and quit states, then we're done and
+        // the DFA will never produce a match.
         if self.tt.count <= 2 {
+            self.special.set_max();
             return;
         }
 
@@ -2961,6 +2966,23 @@ pub struct StartTable<T, S> {
     _state_id: PhantomData<S>,
 }
 
+impl<S: StateID> StartTable<Vec<S>, S> {
+    /// Create a valid set of start states all pointing to the dead state.
+    ///
+    /// When the corresponding DFA is constructed with start states for each
+    /// pattern, then `patterns` should be the number of patterns. Otherwise,
+    /// it should be zero.
+    fn dead(patterns: usize) -> StartTable<Vec<S>, S> {
+        let stride = Start::count();
+        StartTable {
+            table: vec![dead_id(); stride + (stride * patterns)],
+            stride,
+            patterns,
+            _state_id: PhantomData,
+        }
+    }
+}
+
 impl<'a, S: StateID> StartTable<&'a [S], S> {
     /// Deserialize a table of start state IDs starting at the beginning of
     /// `slice`. Upon success, return the total number of bytes read along with
@@ -3001,15 +3023,25 @@ impl<'a, S: StateID> StartTable<&'a [S], S> {
                 "invalid starting table stride",
             ));
         }
-        // TODO: It feels weird to invoke thompson's pattern limit here.
-        // Maybe pattern limit should be defined at the top-level?
-        if patterns > crate::nfa::thompson::pattern_limit() {
+        if patterns > crate::pattern_limit() {
             return Err(DeserializeError::generic(
                 "invalid number of patterns",
             ));
         }
-        let count =
-            stride.checked_add(stride.checked_mul(patterns).unwrap()).unwrap();
+        let pattern_table_size = match stride.checked_mul(patterns) {
+            Some(x) => x,
+            None => {
+                return Err(DeserializeError::generic("invalid pattern count"))
+            }
+        };
+        let count = match stride.checked_add(pattern_table_size) {
+            Some(x) => x,
+            None => {
+                return Err(DeserializeError::generic(
+                    "invalid pattern+stride",
+                ))
+            }
+        };
         let table_bytes_len = count * core::mem::size_of::<S>();
         let nread = 16 + table_bytes_len;
         if slice.len() < table_bytes_len {
@@ -3032,23 +3064,6 @@ impl<'a, S: StateID> StartTable<&'a [S], S> {
     }
 }
 
-impl<S: StateID> StartTable<Vec<S>, S> {
-    /// Create a valid set of start states all pointing to the dead state.
-    ///
-    /// When the corresponding DFA is constructed with start states for each
-    /// pattern, then `patterns` should be the number of patterns. Otherwise,
-    /// it should be zero.
-    fn dead(patterns: usize) -> StartTable<Vec<S>, S> {
-        let stride = Start::count();
-        StartTable {
-            table: vec![dead_id(); stride + (stride * patterns)],
-            stride,
-            patterns,
-            _state_id: PhantomData,
-        }
-    }
-}
-
 impl<T: AsRef<[S]>, S: StateID> StartTable<T, S> {
     /// Writes a serialized form of this start table to the buffer given. If
     /// the buffer is too small, then an error is returned. To determine how
@@ -3138,12 +3153,7 @@ impl<T: AsRef<[S]>, S: StateID> StartTable<T, S> {
         if max_state_id > S2::max_id() {
             return Err(Error::state_id_overflow(S2::max_id()));
         }
-        let mut st = StartTable {
-            table: vec![dead_id::<S2>(); self.table().len()],
-            stride: self.stride,
-            patterns: self.patterns,
-            _state_id: PhantomData,
-        };
+        let mut st = StartTable::dead(self.patterns);
         for (i, id) in st.table.iter_mut().enumerate() {
             // This is always correct since we've verified above that the
             // maximum state ID can fit into S2.
@@ -3218,12 +3228,7 @@ impl<T: AsRef<[S]>, S: StateID> StartTable<T, S> {
 }
 
 impl<T: AsMut<[S]>, S: StateID> StartTable<T, S> {
-    /// Return the start state for the given index and pattern ID. If the
-    /// pattern ID is None, then the corresponding start state for the entire
-    /// DFA is returned. If the pattern ID is not None, then the corresponding
-    /// starting state for the given pattern is returned. If this start table
-    /// does not have individual starting states for each pattern, then this
-    /// panics.
+    /// Set the start state for the given index and pattern.
     fn set_start(
         &mut self,
         index: Start,
@@ -3272,12 +3277,12 @@ impl<'a, S: StateID> Iterator for StartStateIter<'a, S> {
         // This unwrap is okay since the stride of any DFA must always match
         // the number of start state types.
         let start_type = Start::from_usize(i % self.st.stride).unwrap();
-        if i < self.st.stride {
-            Some((table[i], start_type, None))
+        let pid = if i < self.st.stride {
+            None
         } else {
-            let pid = (i - self.st.stride) / self.st.stride;
-            Some((table[i], start_type, Some(pid as u32)))
-        }
+            Some(((i - self.st.stride) / self.st.stride) as u32)
+        };
+        Some((table[i], start_type, pid))
     }
 }
 

diff --git a/src/dfa/determinize.rs b/src/dfa/determinize.rs
@@ -277,13 +277,11 @@ impl<'a, S: StateID> Runner<'a, S> {
                 | thompson::State::Fail
                 | thompson::State::Look { .. } => {}
                 thompson::State::Match(mid) => {
-                    // TODO: Make this work. Currently this fails with
-                    // MatchStates serialization. Think of something elegant.
-                    // if self.nfa.match_len() <= 1 {
-                    // facts.state.matches = Matches::One;
-                    // } else {
-                    facts.state.matches.add(mid);
-                    // }
+                    if self.nfa.match_len() <= 1 {
+                        facts.state.matches = Matches::One;
+                    } else {
+                        facts.state.matches.add(mid);
+                    }
                     if !self.continue_past_first_match() {
                         break;
                     }
@@ -658,7 +656,7 @@ impl Matches {
     fn into_vec(self) -> Option<Vec<PatternID>> {
         match self {
             Matches::None => None,
-            Matches::One => Some(vec![]),
+            Matches::One => Some(vec![0]),
             Matches::Many(pids) => Some(pids),
         }
     }

diff --git a/src/dfa/search.rs b/src/dfa/search.rs
@@ -14,7 +14,9 @@ pub fn find_earliest_fwd<A: Automaton + ?Sized>(
     start: usize,
     end: usize,
 ) -> Result<Option<HalfMatch>, MatchError> {
-    if pre.is_some() {
+    // Searching with a pattern ID is always anchored, so we should never use
+    // a prefilter.
+    if pre.is_some() && pattern_id.is_none() {
         find_fwd(pre, true, dfa, pattern_id, bytes, start, end)
     } else {
         find_fwd(None, true, dfa, pattern_id, bytes, start, end)
@@ -30,7 +32,9 @@ pub fn find_leftmost_fwd<A: Automaton + ?Sized>(
     start: usize,
     end: usize,
 ) -> Result<Option<HalfMatch>, MatchError> {
-    if pre.is_some() {
+    // Searching with a pattern ID is always anchored, so we should never use
+    // a prefilter.
+    if pre.is_some() && pattern_id.is_none() {
         find_fwd(pre, false, dfa, pattern_id, bytes, start, end)
     } else {
         find_fwd(None, false, dfa, pattern_id, bytes, start, end)
@@ -238,7 +242,9 @@ pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
     end: usize,
     caller_state: &mut State<A::ID>,
 ) -> Result<Option<HalfMatch>, MatchError> {
-    if pre.is_some() {
+    // Searching with a pattern ID is always anchored, so we should never use
+    // a prefilter.
+    if pre.is_some() && pattern_id.is_none() {
         find_overlapping_fwd_imp(
             pre,
             dfa,