# Code Written by:
**Shweta Tiwari**
*20 Oct 2023*

## Algorithm:  Bloom Filter

In [1]:
import time

In [3]:
!pip install bitarray

Collecting bitarray
  Downloading bitarray-2.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (286 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/286.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/286.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.5/286.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitarray
Successfully installed bitarray-2.8.2


In [4]:
import numpy as np
from collections import deque
from bitarray import bitarray

# Algorithm

In [5]:
%%time
def ihash(x):
    h = 86813
    while True:
        for i in x:
            h = ((h + i) * 127733) % (1 << 32)
        yield h

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 11 µs


In [6]:
%%time
def bloom_filter(array_bytes, k):
    array = bitarray(array_bytes * 8)
    array.setall(0)

    def _hash(x):
        for _, h in zip(range(k), ihash(x)):
            yield h % len(array)

    def _add(x):
        for h in _hash(x):
            array[h] = 1

    def _contains(x):
        return all(array[h] for h in _hash(x))

    return _add, _contains

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


In [7]:
%%time
def measure_accuracy(A, B, array_bytes, k):
    add, contains = bloom_filter(array_bytes, k)

    # store A
    deque((add(x) for x in A), 0)

    # find false positives in B
    fp = sum(contains(x) for x in B)

    # result
    acc = 1 - fp / len(B)
    print('{} hashes, {} false positives, {:.4f} accuracy'.format(k, fp, acc))

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 9.06 µs


# Run

In [8]:
%%time
n = 10 ** 6
A = set(map(tuple, np.random.randint(0, 256, (n, 4))))
B = set(map(tuple, np.random.randint(0, 256, (n, 4)))) - A
len(A), len(B)

CPU times: user 4.23 s, sys: 361 ms, total: 4.59 s
Wall time: 5.25 s


(999859, 999642)

In [9]:
%%time
for k in [1, 2, 3, 4]:
    measure_accuracy(A, B, n, k)

1 hashes, 118271 false positives, 0.8817 accuracy
2 hashes, 67302 false positives, 0.9327 accuracy
3 hashes, 39999 false positives, 0.9600 accuracy
4 hashes, 61307 false positives, 0.9387 accuracy
CPU times: user 35.5 s, sys: 75.8 ms, total: 35.6 s
Wall time: 35.8 s


In [10]:
%%time
for k in [1, 2, 4, 6, 8]:
    measure_accuracy(A, B, n * 4, k)

1 hashes, 30755 false positives, 0.9692 accuracy
2 hashes, 5537 false positives, 0.9945 accuracy
4 hashes, 980 false positives, 0.9990 accuracy
6 hashes, 349 false positives, 0.9997 accuracy
8 hashes, 450 false positives, 0.9995 accuracy
CPU times: user 55.9 s, sys: 131 ms, total: 56.1 s
Wall time: 56.8 s


# The End