In [1]:
import bitarray
import mmh3

#### Declare the BloomFilter class

- `size`: The number of bits in the filter.
- `hash_count`: The number of hash functions to apply.
- `hash_f`: A hash function, called as `hash_f(item, size, hash_count)`, that returns the list of bit positions for an item.

##### Methods:
- `add(item)`: Inserts an item into the filter.
- `item in bloom_filter`: Checks if an item is possibly in the filter (implemented by `__contains__`).

In [2]:
class BloomFilter:
    """
    A Bloom filter backed by a fixed-size bit array.

    Bit positions for every item are produced by a caller-supplied ``hash_f``.
    Bits are only ever set, never cleared, so (for a deterministic ``hash_f``)
    an item that was added is always reported as present, while an item that
    was never added may occasionally be reported as present too.
    """

    def __init__(self, size, hash_count, hash_f):
        """
        :param size: The number of bits in the filter. Must be positive.
        :param hash_count: The number of hash functions to apply. Must be positive.
        :param hash_f: A hash function that generates multiple hash values. It is
            called as ``hash_f(item, size, hash_count)`` and must return the bit
            positions to use for ``item``.
        :raises ValueError: If ``size`` or ``hash_count`` is not positive.
        """
        # Validate before allocating: an empty bit array cannot hold any position.
        if size <= 0:
            raise ValueError(f"size must be positive, got {size!r}")
        # With zero hash functions ``add`` would set nothing and ``all([])`` is True,
        # so every membership test would claim a match.
        if hash_count <= 0:
            raise ValueError(f"hash_count must be positive, got {hash_count!r}")

        self.size = size
        self.hash_count = hash_count
        self.bit_array = bitarray.bitarray(size)
        # Clear every bit explicitly so the filter starts out empty.
        self.bit_array.setall(0)
        self.hash_f = hash_f

    def add(self, item: str):
        """
        Insert an item into the filter.

        :param item: The item to be added, represented as a string.
        """
        for position in self._hashes(item):
            self.bit_array[position] = 1

    def _hashes(self, item: str) -> list[int]:
        """
        Generate hash values for the given item.

        :param item: The input item to hash.
        :return: A list of hash values corresponding to bit positions in the Bloom filter.
        """
        return self.hash_f(item, self.size, self.hash_count)

    def __contains__(self, item: str) -> bool:
        """
        Check whether an item is possibly in the Bloom filter.

        :param item: The item to check, represented as a string.
        :return: True if the item might be in the filter, False if it is definitely not.
        """
        # A single unset bit proves the item was never added; all() stops at the first one.
        return all(self.bit_array[position] for position in self._hashes(item))

#### Declare the hash function

To generate multiple hash values for our Bloom filter, we use **MurmurHash3**, a non-cryptographic hash function known for its speed and good dispersion.

The function `mmh3_hashes` takes:
- `data`: The input string.
- `m`: The Bloom filter size.
- `k`: The number of hash functions.

It returns a list of `k` hash values, one per seed `0, 1, ..., k-1`, each reduced modulo `m` to a position in the bit array. The positions are not guaranteed to be distinct.

In [3]:
def mmh3_hashes(data: str, m: int, k: int) -> list[int]:
    """
    Compute ``k`` bit positions for ``data`` using MurmurHash3.

    :param data: The input string.
    :param m: The Bloom filter size; every hash is reduced modulo ``m``.
    :param k: The number of hash values to produce.
    :return: A list of ``k`` positions, one for each seed ``0 .. k-1``.
    """
    positions = []
    for seed in range(k):
        # A different seed per iteration yields a different hash of the same data.
        digest = mmh3.hash(data, seed)
        # Python's % takes the sign of the divisor, so for positive m this lands in [0, m).
        positions.append(digest % m)
    return positions

#### Usage example

Initialize a Bloom filter with 1000 bits and 3 hash functions

In [4]:
bloom = BloomFilter(
    size=1000,
    hash_count=3,
    hash_f=mmh3_hashes,
)

Add some elements

In [5]:
# Insert the sample items one after another, in the same order as before.
for item in ('item_1', 'item_2', 'item_3'):
    bloom.add(item)

Check for presence

In [6]:
# Each of these items was added above, so every membership test should succeed.
for item in ('item_1', 'item_2', 'item_3'):
    print(item in bloom)  # Expected output: True

True
True
True


Check for an element not in the filter

In [7]:
# 'item_4' was never added, so False is the expected answer; a True here would be a false positive.
print('item_4' in bloom)  # Expected output: False (but could be True due to false positives)

False
