Skip to content

Commit

Permalink
Switched to the two-way algorithm for string searching
Browse files Browse the repository at this point in the history
test str::bench::bench_contains_bad_naive                   ... bench:       300 ns/iter (+/- 12)     from 1309 ns/iter (+/- 36)
test str::bench::bench_contains_equal                       ... bench:       154 ns/iter (+/- 7)      from  137 ns/iter (+/- 2)
test str::bench::bench_contains_short_long                  ... bench:      2998 ns/iter (+/- 74)     from 5473 ns/iter (+/- 14)
test str::bench::bench_contains_short_short                 ... bench:        65 ns/iter (+/- 2)      from   57 ns/iter (+/- 6)
  • Loading branch information
Jonathan S committed May 15, 2014
1 parent 8a32a2a commit 39cb5b1
Showing 1 changed file with 206 additions and 26 deletions.
232 changes: 206 additions & 26 deletions src/libcore/str.rs
Expand Up @@ -15,16 +15,19 @@
use mem;
use char;
use clone::Clone;
use cmp;
use cmp::{Eq, TotalEq};
use container::Container;
use default::Default;
use iter::{Filter, Map, Iterator};
use iter::{Rev, DoubleEndedIterator, ExactSize};
use iter::range;
use num::Saturating;
use option::{None, Option, Some};
use raw::Repr;
use slice::{ImmutableVector, Vector};
use slice;
use uint;

/*
Section: Creating a string
Expand Down Expand Up @@ -316,13 +319,207 @@ impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
}
}

/// The internal state of an iterator that searches for matches of a substring
/// within a larger string using naive search
#[deriving(Clone)]
struct NaiveSearcher {
position: uint
}

impl NaiveSearcher {
fn new() -> NaiveSearcher {
NaiveSearcher { position: 0 }
}

fn next(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(uint, uint)> {
while self.position + needle.len() <= haystack.len() {
if haystack.slice(self.position, self.position + needle.len()) == needle {
let matchPos = self.position;
self.position += needle.len(); // add 1 for all matches
return Some((matchPos, matchPos + needle.len()));
} else {
self.position += 1;
}
}
None
}
}

/// The internal state of an iterator that searches for matches of a substring
/// within a larger string using two-way search
#[deriving(Clone)]
struct TwoWaySearcher {
// constants
critPos: uint,
period: uint,
byteset: u64,

// variables
position: uint,
memory: uint
}

impl TwoWaySearcher {
fn new(needle: &[u8]) -> TwoWaySearcher {
let (critPos1, period1) = TwoWaySearcher::maximal_suffix(needle, false);
let (critPos2, period2) = TwoWaySearcher::maximal_suffix(needle, true);

let critPos;
let period;
if critPos1 > critPos2 {
critPos = critPos1;
period = period1;
} else {
critPos = critPos2;
period = period2;
}

let byteset = needle.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a);

if needle.slice_to(critPos) == needle.slice_from(needle.len() - critPos) {
TwoWaySearcher {
critPos: critPos,
period: period,
byteset: byteset,

position: 0,
memory: 0
}
} else {
TwoWaySearcher {
critPos: critPos,
period: cmp::max(critPos, needle.len() - critPos) + 1,
byteset: byteset,

position: 0,
memory: uint::MAX // Dummy value to signify that the period is long
}
}
}

#[inline]
fn next(&mut self, haystack: &[u8], needle: &[u8], longPeriod: bool) -> Option<(uint, uint)> {
'search: loop {
// Check that we have room to search in
if self.position + needle.len() > haystack.len() {
return None;
}

// Quickly skip by large portions unrelated to our substring
if (self.byteset >> (haystack[self.position + needle.len() - 1] & 0x3f)) & 1 == 0 {
self.position += needle.len();
continue 'search;
}

// See if the right part of the needle matches
let start = if longPeriod { self.critPos } else { cmp::max(self.critPos, self.memory) };
for i in range(start, needle.len()) {
if needle[i] != haystack[self.position + i] {
self.position += i - self.critPos + 1;
if !longPeriod {
self.memory = 0;
}
continue 'search;
}
}

// See if the left part of the needle matches
let start = if longPeriod { 0 } else { self.memory };
for i in range(start, self.critPos).rev() {
if needle[i] != haystack[self.position + i] {
self.position += self.period;
if !longPeriod {
self.memory = needle.len() - self.period;
}
continue 'search;
}
}

// We have found a match!
let matchPos = self.position;
self.position += needle.len(); // add self.period for all matches
if !longPeriod {
self.memory = 0; // set to needle.len() - self.period for all matches
}
return Some((matchPos, matchPos + needle.len()));
}
}

#[inline]
fn maximal_suffix(arr: &[u8], reversed: bool) -> (uint, uint) {
let mut left = -1; // Corresponds to i in the paper
let mut right = 0; // Corresponds to j in the paper
let mut offset = 1; // Corresponds to k in the paper
let mut period = 1; // Corresponds to p in the paper

while right + offset < arr.len() {
let a;
let b;
if reversed {
a = arr[left + offset];
b = arr[right + offset];
} else {
a = arr[right + offset];
b = arr[left + offset];
}
if a < b {
// Suffix is smaller, period is entire prefix so far.
right += offset;
offset = 1;
period = right - left;
} else if a == b {
// Advance through repetition of the current period.
if offset == period {
right += offset;
offset = 1;
} else {
offset += 1;
}
} else {
// Suffix is larger, start over from current location.
left = right;
right += 1;
offset = 1;
period = 1;
}
}
(left + 1, period)
}
}

/// The internal state of an iterator that searches for matches of a substring
/// within a larger string using a dynamically chosed search algorithm
#[deriving(Clone)]
enum Searcher {
Naive(NaiveSearcher),
TwoWay(TwoWaySearcher),
TwoWayLong(TwoWaySearcher)
}

impl Searcher {
fn new(haystack: &[u8], needle: &[u8]) -> Searcher {
// FIXME: Tune this.
if needle.len() > haystack.len() - 20 {
Naive(NaiveSearcher::new())
} else {
let searcher = TwoWaySearcher::new(needle);
if searcher.memory == uint::MAX { // If the period is long
TwoWayLong(searcher)
} else {
TwoWay(searcher)
}
}
}
}

/// An iterator over the start and end indices of the matches of a
/// substring within a larger string
#[deriving(Clone)]
pub struct MatchIndices<'a> {
// constants
haystack: &'a str,
needle: &'a str,
position: uint,
searcher: Searcher
}

/// An iterator over the substrings of a string separated by a given
Expand All @@ -337,31 +534,14 @@ pub struct StrSplits<'a> {
impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
#[inline]
fn next(&mut self) -> Option<(uint, uint)> {
// See Issue #1932 for why this is a naive search
let (h_len, n_len) = (self.haystack.len(), self.needle.len());
let mut match_start = 0;
let mut match_i = 0;

while self.position < h_len {
if self.haystack[self.position] == self.needle[match_i] {
if match_i == 0 { match_start = self.position; }
match_i += 1;
self.position += 1;

if match_i == n_len {
// found a match!
return Some((match_start, self.position));
}
} else {
// failed match, backtrack
if match_i > 0 {
match_i = 0;
self.position = match_start;
}
self.position += 1;
}
match self.searcher {
Naive(ref mut searcher)
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes()),
TwoWay(ref mut searcher)
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false),
TwoWayLong(ref mut searcher)
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true)
}
None
}
}

Expand Down Expand Up @@ -1581,7 +1761,7 @@ impl<'a> StrSlice<'a> for &'a str {
MatchIndices {
haystack: *self,
needle: sep,
position: 0
searcher: Searcher::new(self.as_bytes(), sep.as_bytes())
}
}

Expand Down

5 comments on commit 39cb5b1

@bors
Copy link
Contributor

@bors bors commented on 39cb5b1 May 16, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

saw approval from brson
at gereeter@39cb5b1

@bors
Copy link
Contributor

@bors bors commented on 39cb5b1 May 16, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

merging gereeter/rust/two-way-search = 39cb5b1 into auto

@bors
Copy link
Contributor

@bors bors commented on 39cb5b1 May 16, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gereeter/rust/two-way-search = 39cb5b1 merged ok, testing candidate = cea4803

@bors
Copy link
Contributor

@bors bors commented on 39cb5b1 May 16, 2014

@bors
Copy link
Contributor

@bors bors commented on 39cb5b1 May 16, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fast-forwarding master to auto = cea4803

Please sign in to comment.