From 22e0f74e01cb3f38b3f4831a362eeddafdc38b16 Mon Sep 17 00:00:00 2001 From: ayazhafiz Date: Thu, 26 Dec 2019 19:40:20 -0600 Subject: [PATCH 1/5] implement 8-bit fuse xor filters This commit adds a `Fuse8` filter, a xor filter with 8-bit fingerprints that uses the [fuse graph](https://arxiv.org/abs/1907.04749) data structure to achieve better space ratios than the `Xor8` filter (and is probably faster to populate). The implementation is very similar to the C one, which was added in https://github.com/FastFilter/xor_singleheader/commit/4392b5e03f82037fa61e70cf0ca824ecbe038a65. `Fuse8` filters require a large number of keys (> 100_000), but have the same false positive percentage as `Xor8` filters (< 0.4%) while using much less space (~9.1 bits/entry as compared to `Xor8`'s ~9.9 bits/entry). 16-bit fingerprint versions of the xor filter using fuse graphs should be trivial to implement. Closes #8 --- fusefilter.go | 188 +++++++++++++++++++++++++++++++++++++++++++++ fusefilter_test.go | 62 +++++++++++++++ xorfilter_test.go | 1 + 3 files changed, 251 insertions(+) create mode 100644 fusefilter.go create mode 100644 fusefilter_test.go diff --git a/fusefilter.go b/fusefilter.go new file mode 100644 index 0000000..c12d307 --- /dev/null +++ b/fusefilter.go @@ -0,0 +1,188 @@ +package xorfilter + +import ( + "math" +) + +const ARITY = 3 +const SEGMENT_COUNT = 100 +const SLOTS = SEGMENT_COUNT + ARITY - 1 + +// The Fuse8 xor filter uses 8-bit fingerprints. It offers the same <0.4% false-positive probability +// as the xor filter, but uses less space (~9.1 bits/entry vs ~9.9 bits/entry). +// +// The Fuse8 xor filter uses the fuse data structure, which requires a large number of keys to be +// operational. Experimentally, this number is somewhere >1e5. For smaller key sets, prefer the +// Xor8 filter. +// +// For more information on the fuse graph data structure, see https://arxiv.org/abs/1907.04749. 
+// This implementation is referenced from the C implementation at https://github.com/FastFilter/xor_singleheader/pull/11. +type Fuse8 struct { + Seed uint64 + SegmentLength uint32 + Fingerprints []uint8 +} + +type h012 struct { + h0 uint32 + h1 uint32 + h2 uint32 +} + +// Contains returns `true` if key is part of the set with a false positive probability of <0.4%. +func (filter *Fuse8) Contains(key uint64) bool { + hash := mixsplit(key, filter.Seed) + f := uint8(fingerprint(hash)) + r0 := uint32(hash) + r1 := uint32(rotl64(hash, 21)) + r2 := uint32(rotl64(hash, 42)) + r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32) + seg := reduce(r0, SEGMENT_COUNT) + h0 := (seg+0)*filter.SegmentLength + reduce(r1, filter.SegmentLength) + h1 := (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength) + h2 := (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength) + return f == (filter.Fingerprints[h0] ^ filter.Fingerprints[h1] ^ + filter.Fingerprints[h2]) +} + +func (filter *Fuse8) makeKeyHashes(k uint64) hashes { + hash := mixsplit(k, filter.Seed) + answer := hashes{} + answer.h = hash + r0 := uint32(hash) + r1 := uint32(rotl64(hash, 21)) + r2 := uint32(rotl64(hash, 42)) + r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32) + seg := reduce(r0, SEGMENT_COUNT) + answer.h0 = (seg+0)*filter.SegmentLength + reduce(r1, filter.SegmentLength) + answer.h1 = (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength) + answer.h2 = (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength) + return answer +} + +func (filter *Fuse8) geth012(hash uint64) h012 { + answer := h012{} + r0 := uint32(hash) + r1 := uint32(rotl64(hash, 21)) + r2 := uint32(rotl64(hash, 42)) + r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32) + seg := reduce(r0, SEGMENT_COUNT) + answer.h0 = (seg+0)*filter.SegmentLength + reduce(r1, filter.SegmentLength) + answer.h1 = (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength) + answer.h2 = (seg+2)*filter.SegmentLength + reduce(r3, 
filter.SegmentLength) + return answer +} + +// Populate fills a Fuse8 filter with provided keys. +// The caller is responsible for ensuring there are no duplicate keys provided. +func PopulateFuse8(keys []uint64) *Fuse8 { + // ref: Algorithm 3 + size := len(keys) + capacity := uint32(math.Ceil(float64(1.0) / float64(0.879) * float64(size))) + capacity = capacity / SLOTS * SLOTS + rngcounter := uint64(1) + + filter := &Fuse8{} + filter.SegmentLength = capacity / SLOTS + filter.Fingerprints = make([]uint8, capacity, capacity) + filter.Seed = splitmix64(&rngcounter) + + H := make([]xorset, capacity, capacity) + Q := make([]keyindex, capacity, capacity) + stack := make([]keyindex, size, size) + + for true { + // Add all keys to the construction array. + for i := 0; i < size; i++ { + key := keys[i] + hs := filter.makeKeyHashes(key) + + H[hs.h0].xormask ^= hs.h + H[hs.h0].count++ + H[hs.h1].xormask ^= hs.h + H[hs.h1].count++ + H[hs.h2].xormask ^= hs.h + H[hs.h2].count++ + } + + Qsize := 0 + // Add sets with one key to the queue. + for i := uint32(0); i < capacity; i++ { + if H[i].count == 1 { + Q[Qsize].index = i + Q[Qsize].hash = H[i].xormask + Qsize++ + } + } + + stacksize := 0 + for Qsize > 0 { + Qsize-- + ki := Q[Qsize] + index := ki.index + if H[index].count == 0 { + continue // not actually possible after the initial scan + } + + hash := ki.hash + hs := filter.geth012(hash) + + stack[stacksize] = ki + stacksize++ + + // Remove key added to stack from all sets in the construction array and + // enqueue sets that now have one key. 
+ H[hs.h0].xormask ^= hash + H[hs.h0].count-- + if H[hs.h0].count == 1 { + Q[Qsize].index = hs.h0 + Q[Qsize].hash = H[hs.h0].xormask + Qsize++ + } + H[hs.h1].xormask ^= hash + H[hs.h1].count-- + if H[hs.h1].count == 1 { + Q[Qsize].index = hs.h1 + Q[Qsize].hash = H[hs.h1].xormask + Qsize++ + } + H[hs.h2].xormask ^= hash + H[hs.h2].count-- + if H[hs.h2].count == 1 { + Q[Qsize].index = hs.h2 + Q[Qsize].hash = H[hs.h2].xormask + Qsize++ + } + } + + if stacksize == size { + // Success + break + } + + for i := range H { + H[i] = xorset{0, 0} + } + filter.Seed = splitmix64(&rngcounter) + } + + // ref: Algorithm 4 + stacksize := size + for stacksize > 0 { + stacksize-- + ki := stack[stacksize] + hs := filter.geth012(ki.hash) + fp := uint8(fingerprint(ki.hash)) + switch ki.index { + case hs.h0: + fp ^= filter.Fingerprints[hs.h1] ^ filter.Fingerprints[hs.h2] + case hs.h1: + fp ^= filter.Fingerprints[hs.h0] ^ filter.Fingerprints[hs.h2] + default: + fp ^= filter.Fingerprints[hs.h0] ^ filter.Fingerprints[hs.h1] + } + filter.Fingerprints[ki.index] = fp + } + + return filter +} diff --git a/fusefilter_test.go b/fusefilter_test.go new file mode 100644 index 0000000..18f266c --- /dev/null +++ b/fusefilter_test.go @@ -0,0 +1,62 @@ +package xorfilter + +import ( + "fmt" + "math/rand" + "testing" + + "github.com/stretchr/testify/assert" +) + +const NUM_KEYS = 1e6 + +func TestFuse8Basic(t *testing.T) { + testsize := 1000000 + keys := make([]uint64, NUM_KEYS) + for i := range keys { + keys[i] = rand.Uint64() + } + filter := PopulateFuse8(keys) + for _, v := range keys { + assert.Equal(t, true, filter.Contains(v)) + } + falsesize := 1000000 + matches := 0 + bpv := float64(len(filter.Fingerprints)) * 8.0 / float64(testsize) + fmt.Println("bits per entry ", bpv) + assert.Equal(t, true, bpv < 9.101) + for i := 0; i < falsesize; i++ { + v := rand.Uint64() + if filter.Contains(v) { + matches++ + } + } + fpp := float64(matches) * 100.0 / float64(falsesize) + fmt.Println("false positive rate ", 
fpp) + assert.Equal(t, true, fpp < 0.40) +} + +func BenchmarkFuse8Populate1000000(b *testing.B) { + keys := make([]uint64, NUM_KEYS, NUM_KEYS) + for i := range keys { + keys[i] = rand.Uint64() + } + + b.ResetTimer() + for n := 0; n < b.N; n++ { + PopulateFuse8(keys) + } +} + +func BenchmarkFuse8Contains1000000(b *testing.B) { + keys := make([]uint64, NUM_KEYS, NUM_KEYS) + for i := range keys { + keys[i] = rand.Uint64() + } + filter := PopulateFuse8(keys) + + b.ResetTimer() + for n := 0; n < b.N; n++ { + filter.Contains(keys[n%len(keys)]) + } +} diff --git a/xorfilter_test.go b/xorfilter_test.go index 596ed5a..56e425e 100644 --- a/xorfilter_test.go +++ b/xorfilter_test.go @@ -21,6 +21,7 @@ func TestBasic(t *testing.T) { falsesize := 1000000 matches := 0 bpv := float64(len(filter.Fingerprints)) * 8.0 / float64(testsize) + fmt.Println("Xor8 Filter:") fmt.Println("bits per entry ", bpv) assert.Equal(t, true, bpv < 10.) for i := 0; i < falsesize; i++ { From 4386237150f11ae80713dad8f2c72add1c3cc3a5 Mon Sep 17 00:00:00 2001 From: ayazhafiz Date: Thu, 26 Dec 2019 19:53:12 -0600 Subject: [PATCH 2/5] fixup! implement 8-bit fuse xor filters --- fusefilter.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fusefilter.go b/fusefilter.go index c12d307..92fbd5c 100644 --- a/fusefilter.go +++ b/fusefilter.go @@ -1,13 +1,5 @@ package xorfilter -import ( - "math" -) - -const ARITY = 3 -const SEGMENT_COUNT = 100 -const SLOTS = SEGMENT_COUNT + ARITY - 1 - // The Fuse8 xor filter uses 8-bit fingerprints. It offers the same <0.4% false-positive probability // as the xor filter, but uses less space (~9.1 bits/entry vs ~9.9 bits/entry). // @@ -29,6 +21,10 @@ type h012 struct { h2 uint32 } +const ARITY = 3 +const SEGMENT_COUNT = 100 +const SLOTS = SEGMENT_COUNT + ARITY - 1 + // Contains returns `true` if key is part of the set with a false positive probability of <0.4%. 
func (filter *Fuse8) Contains(key uint64) bool { hash := mixsplit(key, filter.Seed) @@ -76,9 +72,11 @@ func (filter *Fuse8) geth012(hash uint64) h012 { // Populate fills a Fuse8 filter with provided keys. // The caller is responsible for ensuring there are no duplicate keys provided. func PopulateFuse8(keys []uint64) *Fuse8 { + const FUSE_OVERHEAD = 1.0 / 0.879 + // ref: Algorithm 3 size := len(keys) - capacity := uint32(math.Ceil(float64(1.0) / float64(0.879) * float64(size))) + capacity := uint32(FUSE_OVERHEAD * float64(size)) capacity = capacity / SLOTS * SLOTS rngcounter := uint64(1) From 648fc8382943dd0c0826582751c92ab8aa2658b1 Mon Sep 17 00:00:00 2001 From: ayazhafiz Date: Thu, 26 Dec 2019 20:02:54 -0600 Subject: [PATCH 3/5] fixup! implement 8-bit fuse xor filters --- fusefilter.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fusefilter.go b/fusefilter.go index 92fbd5c..fcb46ce 100644 --- a/fusefilter.go +++ b/fusefilter.go @@ -91,8 +91,7 @@ func PopulateFuse8(keys []uint64) *Fuse8 { for true { // Add all keys to the construction array. - for i := 0; i < size; i++ { - key := keys[i] + for _, key := range keys { hs := filter.makeKeyHashes(key) H[hs.h0].xormask ^= hs.h From d925c6e49ddeb3a58fbec9f4a588b80d04dee2b8 Mon Sep 17 00:00:00 2001 From: ayazhafiz Date: Fri, 27 Dec 2019 01:33:43 -0600 Subject: [PATCH 4/5] fixup! 
implement 8-bit fuse xor filters --- fusefilter_test.go | 1 + xorfilter_test.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fusefilter_test.go b/fusefilter_test.go index 18f266c..596a4d4 100644 --- a/fusefilter_test.go +++ b/fusefilter_test.go @@ -23,6 +23,7 @@ func TestFuse8Basic(t *testing.T) { falsesize := 1000000 matches := 0 bpv := float64(len(filter.Fingerprints)) * 8.0 / float64(testsize) + fmt.Println("Fuse8 filter:") fmt.Println("bits per entry ", bpv) assert.Equal(t, true, bpv < 9.101) for i := 0; i < falsesize; i++ { diff --git a/xorfilter_test.go b/xorfilter_test.go index 56e425e..de5a835 100644 --- a/xorfilter_test.go +++ b/xorfilter_test.go @@ -21,7 +21,7 @@ func TestBasic(t *testing.T) { falsesize := 1000000 matches := 0 bpv := float64(len(filter.Fingerprints)) * 8.0 / float64(testsize) - fmt.Println("Xor8 Filter:") + fmt.Println("Xor8 filter:") fmt.Println("bits per entry ", bpv) assert.Equal(t, true, bpv < 10.) for i := 0; i < falsesize; i++ { From f0c1a8d110d07089925ee74a277886efaef5305b Mon Sep 17 00:00:00 2001 From: ayazhafiz Date: Fri, 27 Dec 2019 11:46:27 -0600 Subject: [PATCH 5/5] fixup! 
implement 8-bit fuse xor filters --- fusefilter.go | 4 ++-- xorfilter.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fusefilter.go b/fusefilter.go index fcb46ce..34402bd 100644 --- a/fusefilter.go +++ b/fusefilter.go @@ -34,7 +34,7 @@ func (filter *Fuse8) Contains(key uint64) bool { r2 := uint32(rotl64(hash, 42)) r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32) seg := reduce(r0, SEGMENT_COUNT) - h0 := (seg+0)*filter.SegmentLength + reduce(r1, filter.SegmentLength) + h0 := seg*filter.SegmentLength + reduce(r1, filter.SegmentLength) h1 := (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength) h2 := (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength) return f == (filter.Fingerprints[h0] ^ filter.Fingerprints[h1] ^ @@ -72,7 +72,7 @@ func (filter *Fuse8) geth012(hash uint64) h012 { // Populate fills a Fuse8 filter with provided keys. // The caller is responsible for ensuring there are no duplicate keys provided. func PopulateFuse8(keys []uint64) *Fuse8 { - const FUSE_OVERHEAD = 1.0 / 0.879 + const FUSE_OVERHEAD = 1.0 / 0.879 // ref: Algorithm 3 size := len(keys) diff --git a/xorfilter.go b/xorfilter.go index fde3067..62d0ced 100644 --- a/xorfilter.go +++ b/xorfilter.go @@ -51,7 +51,7 @@ func mixsplit(key, seed uint64) uint64 { } func rotl64(n uint64, c int) uint64 { - return (n << uint(c & 63)) | (n >> uint((-c) & 63)) + return (n << uint(c&63)) | (n >> uint((-c)&63)) } func reduce(hash, n uint32) uint32 {