185 changes: 185 additions & 0 deletions fusefilter.go
@@ -0,0 +1,185 @@
package xorfilter

// The Fuse8 xor filter uses 8-bit fingerprints. It offers the same <0.4% false-positive probability
// as the xor filter, but uses less space (~9.1 bits/entry vs ~9.9 bits/entry).
//
// The Fuse8 xor filter uses the fuse data structure, which requires a large number of keys to be
// operational. Experimentally, this number is somewhere >1e5. For smaller key sets, prefer the
// Xor8 filter.
//
// For more information on the fuse graph data structure, see https://arxiv.org/abs/1907.04749.
// This implementation follows the C implementation at https://github.com/FastFilter/xor_singleheader/pull/11.
type Fuse8 struct {
Seed uint64
SegmentLength uint32
Fingerprints []uint8
}

type h012 struct {
h0 uint32
h1 uint32
h2 uint32
}

// Layout constants: the fingerprint array is divided into SLOTS segments of SegmentLength
// entries each. Each key selects a starting segment in [0, SEGMENT_COUNT) and occupies one
// slot in each of the ARITY consecutive segments from there.
const ARITY = 3
const SEGMENT_COUNT = 100
const SLOTS = SEGMENT_COUNT + ARITY - 1

// Contains returns `true` if key is part of the set with a false positive probability of <0.4%.
func (filter *Fuse8) Contains(key uint64) bool {
hash := mixsplit(key, filter.Seed)
f := uint8(fingerprint(hash))
r0 := uint32(hash)
r1 := uint32(rotl64(hash, 21))
r2 := uint32(rotl64(hash, 42))
r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32)
seg := reduce(r0, SEGMENT_COUNT)
h0 := seg*filter.SegmentLength + reduce(r1, filter.SegmentLength)
h1 := (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength)
h2 := (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength)
return f == (filter.Fingerprints[h0] ^ filter.Fingerprints[h1] ^
filter.Fingerprints[h2])
}

// makeKeyHashes computes the mixed 64-bit hash of k and its three slot indices.
func (filter *Fuse8) makeKeyHashes(k uint64) hashes {
hash := mixsplit(k, filter.Seed)
answer := hashes{}
answer.h = hash
r0 := uint32(hash)
r1 := uint32(rotl64(hash, 21))
r2 := uint32(rotl64(hash, 42))
r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32)
seg := reduce(r0, SEGMENT_COUNT)
answer.h0 = (seg+0)*filter.SegmentLength + reduce(r1, filter.SegmentLength)
answer.h1 = (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength)
answer.h2 = (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength)
return answer
}

// geth012 recomputes the three slot indices for an already-mixed hash.
func (filter *Fuse8) geth012(hash uint64) h012 {
answer := h012{}
r0 := uint32(hash)
r1 := uint32(rotl64(hash, 21))
r2 := uint32(rotl64(hash, 42))
r3 := uint32((0xBF58476D1CE4E5B9 * hash) >> 32)
seg := reduce(r0, SEGMENT_COUNT)
answer.h0 = (seg+0)*filter.SegmentLength + reduce(r1, filter.SegmentLength)
answer.h1 = (seg+1)*filter.SegmentLength + reduce(r2, filter.SegmentLength)
answer.h2 = (seg+2)*filter.SegmentLength + reduce(r3, filter.SegmentLength)
return answer
}

// PopulateFuse8 fills a Fuse8 filter with the provided keys.
// The caller is responsible for ensuring there are no duplicate keys provided.
func PopulateFuse8(keys []uint64) *Fuse8 {
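// FUSE_OVERHEAD sizes the fingerprint array at roughly 1.138x the number of keys; with
// 8-bit fingerprints that is about 8 * 1.138 ≈ 9.1 bits per entry, matching the figure
// quoted in the type's doc comment.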
const FUSE_OVERHEAD = 1.0 / 0.879

// ref: Algorithm 3
size := len(keys)
capacity := uint32(FUSE_OVERHEAD * float64(size))
capacity = capacity / SLOTS * SLOTS
rngcounter := uint64(1)

filter := &Fuse8{}
filter.SegmentLength = capacity / SLOTS
filter.Fingerprints = make([]uint8, capacity, capacity)
filter.Seed = splitmix64(&rngcounter)

H := make([]xorset, capacity, capacity)
Q := make([]keyindex, capacity, capacity)
stack := make([]keyindex, size, size)

// Attempt construction; if peeling fails to consume all keys, reset and retry with a new seed.
for {
// Add all keys to the construction array.
for _, key := range keys {
hs := filter.makeKeyHashes(key)

H[hs.h0].xormask ^= hs.h
H[hs.h0].count++
H[hs.h1].xormask ^= hs.h
H[hs.h1].count++
H[hs.h2].xormask ^= hs.h
H[hs.h2].count++
}

Qsize := 0
// Add sets with one key to the queue.
for i := uint32(0); i < capacity; i++ {
if H[i].count == 1 {
Q[Qsize].index = i
Q[Qsize].hash = H[i].xormask
Qsize++
}
}

stacksize := 0
for Qsize > 0 {
Qsize--
ki := Q[Qsize]
index := ki.index
if H[index].count == 0 {
continue // this key was already peeled via one of its other slots
}

hash := ki.hash
hs := filter.geth012(hash)

stack[stacksize] = ki
stacksize++

// Remove key added to stack from all sets in the construction array and
// enqueue sets that now have one key.
H[hs.h0].xormask ^= hash
H[hs.h0].count--
if H[hs.h0].count == 1 {
Q[Qsize].index = hs.h0
Q[Qsize].hash = H[hs.h0].xormask
Qsize++
}
H[hs.h1].xormask ^= hash
H[hs.h1].count--
if H[hs.h1].count == 1 {
Q[Qsize].index = hs.h1
Q[Qsize].hash = H[hs.h1].xormask
Qsize++
}
H[hs.h2].xormask ^= hash
H[hs.h2].count--
if H[hs.h2].count == 1 {
Q[Qsize].index = hs.h2
Q[Qsize].hash = H[hs.h2].xormask
Qsize++
}
}

if stacksize == size {
// Success
break
}

for i := range H {
H[i] = xorset{0, 0}
}
filter.Seed = splitmix64(&rngcounter)
}

// ref: Algorithm 4
// Assign fingerprints in reverse peeling order. Each key's peel slot is written exactly once,
// so the XOR of the fingerprints at its three slots ends up equal to the key's fingerprint.
stacksize := size
for stacksize > 0 {
stacksize--
ki := stack[stacksize]
hs := filter.geth012(ki.hash)
fp := uint8(fingerprint(ki.hash))
switch ki.index {
case hs.h0:
fp ^= filter.Fingerprints[hs.h1] ^ filter.Fingerprints[hs.h2]
case hs.h1:
fp ^= filter.Fingerprints[hs.h0] ^ filter.Fingerprints[hs.h2]
default:
fp ^= filter.Fingerprints[hs.h0] ^ filter.Fingerprints[hs.h1]
}
filter.Fingerprints[ki.index] = fp
}

return filter
}
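For context, a minimal usage sketch of the Fuse8 API added above, assuming the library is imported as github.com/FastFilter/xorfilter (the actual module path may differ) and that the caller supplies a large set of distinct keys; the key generation here is arbitrary and only for illustration:

package main

import (
	"fmt"

	"github.com/FastFilter/xorfilter"
)

func main() {
	// Build a filter from distinct uint64 keys (hash real items to uint64 first and deduplicate).
	keys := make([]uint64, 1000000)
	for i := range keys {
		keys[i] = uint64(i) * 0x9E3779B97F4A7C15 // arbitrary distinct keys for illustration
	}
	filter := xorfilter.PopulateFuse8(keys)

	fmt.Println(filter.Contains(keys[42])) // always true for inserted keys
	fmt.Println(filter.Contains(12345))    // usually false; <0.4% false-positive probability
}

Note that construction may retry with fresh seeds until peeling succeeds, so PopulateFuse8 is considerably more expensive than Contains.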
63 changes: 63 additions & 0 deletions fusefilter_test.go
@@ -0,0 +1,63 @@
package xorfilter

import (
"fmt"
"math/rand"
"testing"

"github.com/stretchr/testify/assert"
)

const NUM_KEYS = 1e6

func TestFuse8Basic(t *testing.T) {
testsize := 1000000
keys := make([]uint64, NUM_KEYS)
for i := range keys {
keys[i] = rand.Uint64()
}
filter := PopulateFuse8(keys)
for _, v := range keys {
assert.Equal(t, true, filter.Contains(v))
}
falsesize := 1000000
matches := 0
bpv := float64(len(filter.Fingerprints)) * 8.0 / float64(testsize)
fmt.Println("Fuse8 filter:")
fmt.Println("bits per entry ", bpv)
assert.Equal(t, true, bpv < 9.101)
for i := 0; i < falsesize; i++ {
v := rand.Uint64()
if filter.Contains(v) {
matches++
}
}
fpp := float64(matches) * 100.0 / float64(falsesize)
fmt.Println("false positive rate ", fpp)
assert.Equal(t, true, fpp < 0.40)
}

func BenchmarkFuse8Populate1000000(b *testing.B) {
keys := make([]uint64, NUM_KEYS, NUM_KEYS)
for i := range keys {
keys[i] = rand.Uint64()
}

b.ResetTimer()
for n := 0; n < b.N; n++ {
PopulateFuse8(keys)
}
}

func BenchmarkFuse8Contains1000000(b *testing.B) {
keys := make([]uint64, NUM_KEYS, NUM_KEYS)
for i := range keys {
keys[i] = rand.Uint64()
}
filter := PopulateFuse8(keys)

b.ResetTimer()
for n := 0; n < b.N; n++ {
filter.Contains(keys[n%len(keys)])
}
}
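The new test and benchmarks can be exercised on their own with the standard Go tooling, for example:

go test -run Fuse8 -bench Fuse8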
2 changes: 1 addition & 1 deletion xorfilter.go
@@ -51,7 +51,7 @@ func mixsplit(key, seed uint64) uint64 {
}

func rotl64(n uint64, c int) uint64 {
return (n << uint(c & 63)) | (n >> uint((-c) & 63))
return (n << uint(c&63)) | (n >> uint((-c)&63))
}

func reduce(hash, n uint32) uint32 {
1 change: 1 addition & 0 deletions xorfilter_test.go
@@ -21,6 +21,7 @@ func TestBasic(t *testing.T) {
falsesize := 1000000
matches := 0
bpv := float64(len(filter.Fingerprints)) * 8.0 / float64(testsize)
fmt.Println("Xor8 filter:")
fmt.Println("bits per entry ", bpv)
assert.Equal(t, true, bpv < 10.)
for i := 0; i < falsesize; i++ {