Skip to content

Commit

Permalink
benchmarks: start work on a more curated set of benchmarks
Browse files Browse the repository at this point in the history
This is a WIP for now.
  • Loading branch information
BurntSushi committed Sep 18, 2023
1 parent dd84b6d commit 3ed2c38
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 3 deletions.
45 changes: 45 additions & 0 deletions benchmarks/definitions/curated.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
analysis = '''
This is a WIP for building out a curated set of Aho-Corasick benchmarks.
The next step is not to actually add more benchmarks, but to hook up more
Aho-Corasick libraries. There are a lot of them and it's a fair bit of work to
do.
'''

# "count" model benchmark named "sherlock": searches the haystack file below
# for five literal character names.
[[bench]]
model = "count"
name = "sherlock"
# The literal needles handed to each engine.
regex = [
'Sherlock Holmes',
'John Watson',
'Irene Adler',
'Inspector Lestrade',
'Professor Moriarty',
]
haystack = { path = "opensubtitles/en-sampled.txt" }
# NOTE(review): presumably the expected number of matches that every engine
# must report -- confirm against the benchmark harness.
count = 714
# Only standard and leftmost-first engines (plus the naive memmem baseline)
# are listed here; no overlapping engine is part of this benchmark.
engines = [
"rust/aho-corasick/default/standard",
"rust/aho-corasick/default/leftmost-first",
"daachorse/bytewise/standard",
"daachorse/bytewise/leftmost-first",
"naive/rust/memchr/memmem",
]

# "count" model benchmark named "dictionary-15": the needles come from a
# dictionary file rather than being listed inline.
[[bench]]
model = "count"
name = "dictionary-15"
# NOTE(review): `per-line = "pattern"` presumably means each line of the file
# is treated as one pattern -- confirm against the benchmark harness.
regex = { path = "dictionary/english/length-15.txt", per-line = "pattern" }
haystack = { path = "opensubtitles/en-sampled.txt" }
# NOTE(review): presumably the expected match count; the same value is used
# for the standard, overlapping and leftmost-first engines listed below.
count = 15
engines = [
"rust/aho-corasick/default/standard",
"rust/aho-corasick/default/overlapping",
"rust/aho-corasick/default/leftmost-first",
"daachorse/bytewise/standard",
"daachorse/bytewise/overlapping",
"daachorse/bytewise/leftmost-first",
"naive/rust/memchr/memmem",
]
analysis = '''
Looks for occurrences of 2,663 words of length at least 15.
'''
48 changes: 48 additions & 0 deletions benchmarks/engines.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,22 @@
bin = "cargo"
args = ["clean"]

# Engine entry for the rust-aho-corasick runner program, invoked with the
# `default/overlapping` argument.
[[engine]]
name = "rust/aho-corasick/default/overlapping"
# All commands below are given relative to this directory.
cwd = "./engines/rust-aho-corasick"
# Command that reports the runner's version.
[engine.version]
bin = "./target/release/main"
args = ["--version"]
# Command that executes the benchmark; the single argument names the engine
# configuration to use.
[engine.run]
bin = "./target/release/main"
args = ["default/overlapping"]
# Build step: `cargo build --release`.
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
# Clean step: `cargo clean`.
[[engine.clean]]
bin = "cargo"
args = ["clean"]

[[engine]]
name = "rust/aho-corasick/default/leftmost-first"
cwd = "./engines/rust-aho-corasick"
Expand Down Expand Up @@ -237,6 +253,38 @@
#
# Ref: https://github.com/daac-tools/daachorse

# Engine entry for the rust-daachorse runner program, invoked with the
# `bytewise/standard` argument.
[[engine]]
name = "daachorse/bytewise/standard"
# All commands below are given relative to this directory.
cwd = "./engines/rust-daachorse"
# Command that reports the runner's version.
[engine.version]
bin = "./target/release/main"
args = ["--version"]
# Command that executes the benchmark; the single argument names the engine
# configuration to use.
[engine.run]
bin = "./target/release/main"
args = ["bytewise/standard"]
# Build step: `cargo build --release`.
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
# Clean step: `cargo clean`.
[[engine.clean]]
bin = "cargo"
args = ["clean"]

# Engine entry for the rust-daachorse runner program, invoked with the
# `bytewise/overlapping` argument.
[[engine]]
name = "daachorse/bytewise/overlapping"
# All commands below are given relative to this directory.
cwd = "./engines/rust-daachorse"
# Command that reports the runner's version.
[engine.version]
bin = "./target/release/main"
args = ["--version"]
# Command that executes the benchmark; the single argument names the engine
# configuration to use.
[engine.run]
bin = "./target/release/main"
args = ["bytewise/overlapping"]
# Build step: `cargo build --release`.
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
# Clean step: `cargo clean`.
[[engine.clean]]
bin = "cargo"
args = ["clean"]

[[engine]]
name = "daachorse/bytewise/leftmost-first"
cwd = "./engines/rust-daachorse"
Expand Down
15 changes: 14 additions & 1 deletion benchmarks/engines/rust-aho-corasick/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ fn main() -> anyhow::Result<()> {
let b = Benchmark::from_stdin()
.context("failed to read KLV data from <stdin>")?;
let samples = match (b.model.as_str(), engine.as_str()) {
// These first 6 configurations are meant to test the default settings
// These first 7 configurations are meant to test the default settings
// on each of {compile, count} x {standard, leftmost-{first,longest}}, plus
// an overlapping variant of count with standard semantics.
// We don't also test each of them with {nfa/(non-)?contiguous, dfa}
// because it would just get ridiculous.
Expand All @@ -68,6 +68,10 @@ fn main() -> anyhow::Result<()> {
let ac = builder_ac(&b)?.build(&b.needles)?;
model_count_ac(&b, &ac)?
}
("count", "default/overlapping") => {
let ac = builder_ac(&b)?.build(&b.needles)?;
model_count_ac_overlapping(&b, &ac)?
}
("count", "default/leftmost-first") => {
let ac = builder_ac(&b)?
.match_kind(MatchKind::LeftmostFirst)
Expand Down Expand Up @@ -196,6 +200,15 @@ fn model_count_ac(
shared::run(b, || Ok(ac.find_iter(haystack).count()))
}

/// Runs the "count" model for `AhoCorasick` using an overlapping search, so
/// that every overlapping match in the benchmark haystack is counted.
fn model_count_ac_overlapping(
    b: &Benchmark,
    ac: &AhoCorasick,
) -> anyhow::Result<Vec<Sample>> {
    // Borrow the haystack once, outside of the closure handed to the runner.
    let input = &*b.haystack;
    let count_overlapping = || Ok(ac.find_overlapping_iter(input).count());
    shared::run(b, count_overlapping)
}

/// Implements the "count all matches" model for packed substring search.
fn model_count_packed(
b: &Benchmark,
Expand Down
49 changes: 47 additions & 2 deletions benchmarks/engines/rust-daachorse/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,19 @@ fn main() -> anyhow::Result<()> {
let b = Benchmark::from_stdin()
.context("failed to read KLV data from <stdin>")?;
let samples = match (b.model.as_str(), engine.as_str()) {
("compile", "bytewise/standard") => {
model_compile_bytewise_standard(&b)?
}
("compile", "bytewise/leftmost-first") => {
model_compile_bytewise_leftmost(&b, MatchKind::LeftmostFirst)?
}
("compile", "bytewise/leftmost-longest") => {
model_compile_bytewise_leftmost(&b, MatchKind::LeftmostLongest)?
}
("count", "bytewise/standard") => model_count_bytewise_standard(&b)?,
("count", "bytewise/overlapping") => {
model_count_bytewise_overlapping(&b)?
}
("count", "bytewise/leftmost-first") => {
model_count_bytewise_leftmost(&b, MatchKind::LeftmostFirst)?
}
Expand All @@ -71,6 +78,22 @@ fn main() -> anyhow::Result<()> {
Ok(())
}

/// Runs the "compile a matcher" model for a bytewise daachorse automaton
/// built with "standard" match semantics (i.e., the textbook Aho-Corasick
/// formulation for a non-overlapping search).
///
/// Two closures are handed to `shared::run_and_count`: one that builds the
/// automaton via `compile_bytewise`, and one that counts the matches a built
/// automaton finds in the benchmark haystack.
fn model_compile_bytewise_standard(
    b: &Benchmark,
) -> anyhow::Result<Vec<Sample>> {
    let input = &*b.haystack;
    let compile = || compile_bytewise(b, MatchKind::Standard);
    let count = |automaton: daachorse::DoubleArrayAhoCorasick<u32>| {
        Ok(automaton.find_iter(input).count())
    };
    shared::run_and_count(b, count, compile)
}

/// Implements the "compile a matcher" model for a bytewise daachorse automaton
/// using the given match semantics. The match semantics must be either
/// leftmost-first or leftmost-longest.
Expand All @@ -88,8 +111,30 @@ fn model_compile_bytewise_leftmost(
)
}

/// Implements a naive multi-substring algorithm using the `memchr` crate's
/// `memmem` implementation.
/// Runs the "count" model using daachorse's bytewise Aho-Corasick automaton
/// with "standard" match semantics and a non-overlapping search.
fn model_count_bytewise_standard(
    b: &Benchmark,
) -> anyhow::Result<Vec<Sample>> {
    // The automaton is built up front; only the search lives in the closure.
    let automaton = compile_bytewise(b, MatchKind::Standard)?;
    let input = &*b.haystack;
    let count_matches = || Ok(automaton.find_iter(input).count());
    shared::run(b, count_matches)
}

/// Runs the "count" model using daachorse's bytewise Aho-Corasick automaton
/// with "standard" match semantics, counting every overlapping match in the
/// benchmark haystack.
fn model_count_bytewise_overlapping(
    b: &Benchmark,
) -> anyhow::Result<Vec<Sample>> {
    // The automaton is built up front; only the search lives in the closure.
    let automaton = compile_bytewise(b, MatchKind::Standard)?;
    let input = &*b.haystack;
    let count_overlapping =
        || Ok(automaton.find_overlapping_iter(input).count());
    shared::run(b, count_overlapping)
}

/// Implements a multi-substring algorithm using daachorse's bytewise
/// Aho-Corasick automaton. This requires leftmost-first or leftmost-longest
/// match semantics.
fn model_count_bytewise_leftmost(
b: &Benchmark,
kind: MatchKind,
Expand Down

0 comments on commit 3ed2c38

Please sign in to comment.