In [20]:
import mmh3

class Logger:
    def __init__(self, enabled = True):
        self.enabled = enabled

    def log(self, message: str):
        if self.enabled:
            print(message)

class BloomFilter:
    def __init__(self, size, num_hashes, logger: Logger = None):
        self.size = size
        self.num_hashes = num_hashes
        self.bit_array = [0] * size

        self.logger = logger if logger else Logger(enabled = False)

    def add(self, item):
        self.logger.log(f"\nAdding: {item}")
        
        for i in range(self.num_hashes):
            index = self.__index_hash__(item, i)

            self.bit_array[index] = 1

    def contains(self, item):
        self.logger.log(f"\nContains?: {item}")

        for i in range(self.num_hashes):
            index = self.__index_hash__(item, i)
            if self.bit_array[index] == 0:
                return False
            
        return True
    
    def __index_hash__(self, item, i: int) -> int:
        index = mmh3.hash(item, i) % self.size
        
        self.logger.log(f"Hash index: {index}")

        return index

if __name__ == "__main__":
    b_filter = BloomFilter(100, 3, Logger(enabled=True))
    # b_filter = BloomFilter(100, 10)

    items = ["apple", "banana", "orange"]

    for item in items:
        b_filter.add(item)

    has_banana = b_filter.contains("banana")
    print("Has banana?", has_banana)


Adding: apple
Hash index: 20
Hash index: 23
Hash index: 59

Adding: banana
Hash index: 35
Hash index: 7
Hash index: 24

Adding: orange
Hash index: 35
Hash index: 94
Hash index: 81

Contains?: banana
Hash index: 35
Hash index: 7
Hash index: 24
Has banana? True


In [29]:
def emoji(condition: bool): return "✅" if condition else "❌"

def check_if_contains(bf: BloomFilter, key: str, is_false_positive):
    is_present = bf.contains(key)
    legend = f"BF({emoji(is_present)}) FP({emoji(is_false_positive)})"
    print(f"{legend} {key}")
    return is_present

if __name__ == "__main__":
    # bf = BloomFilter(10, 3, Logger(enabled=True))
    bf = BloomFilter(10, 3, Logger(enabled=False))

    # Items we have in our set.
    included_items = ["apple", "banana", "orange"]

    for item in included_items:
        bf.add(item)

    # Items we check.
    all_items = ["apple", "banana", "orange", "grape", "kiwi"]

    print("\nℹ️ BF = Bloom Filter\nℹ️ FP = False Positive\n")

    for item in all_items:
        is_false_positive = item in included_items
        check_if_contains(bf, item, is_false_positive)

    # Potential False Positives.
    # - Item not in set, but bloom_filter might return True sometimes if the indexes overlap.
    # No possibility for False Negatives.
    # - Item not in set, bloom_filter will always return False.


ℹ️ BF = Bloom Filter
ℹ️ FP = False Positive

BF(✅) FP(✅) apple
BF(✅) FP(✅) banana
BF(✅) FP(✅) orange
BF(❌) FP(❌) grape
BF(✅) FP(❌) kiwi
