[Reference](https://medium.com/@diehardankush/the-significance-of-bloom-filters-in-parquet-a-deep-dive-c93a74e0ebcd)

In [2]:
!pip install bitarray

Collecting bitarray
  Downloading bitarray-2.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.4/287.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitarray
Successfully installed bitarray-2.8.3


In [3]:
import math
import hashlib
from bitarray import bitarray

class BloomFilter:

    def __init__(self, n_items, fp_prob):
        '''
        n_items : int
            Number of items expected to be stored in bloom filter
        fp_prob : float
            False Positive probability in decimal
        '''
        # False possible probability in decimal
        self.fp_prob = fp_prob

        # Size of bit array to use
        self.size = self.get_size(n_items,fp_prob)

        # number of hash functions to use
        self.hash_count = self.get_hash_count(self.size,n_items)

        # Bit array of given size
        self.bit_array = bitarray(self.size)

        # initialize all bits as 0
        self.bit_array.setall(0)

    def add(self, item):
        '''
        Add an item in the filter
        '''
        for i in range(self.hash_count):
            digest = hashlib.md5(str(item).encode('utf-8'))
            # perform double hashing
            result = int(digest.hexdigest(), 16)
            bit = result % self.size
            self.bit_array[bit] = True

    def check(self, item):
        '''
        Check for existence of an item in filter
        '''
        for i in range(self.hash_count):
            digest = hashlib.md5(str(item).encode('utf-8'))
            result = int(digest.hexdigest(), 16)
            bit = result % self.size
            if self.bit_array[bit] == False:
                return False
        return True

    @classmethod
    def get_size(self,n,p):
        '''
        Return the size of bit array(m) to be used
        '''
        m = -(n * math.log(p))/(math.log(2)**2)
        return int(m)

    @classmethod
    def get_hash_count(self, m, n):
        '''
        Return the hash function(k) to be used
        '''
        k = (m/n) * math.log(2)
        return int(k)

In [4]:
from random import shuffle

n = 20  # no of items to add
p = 0.05  # false positive probability

bloomf = BloomFilter(n,p)
print("Size of bit array: {}".format(bloomf.size))
print("False positive Probability: {}".format(bloomf.fp_prob))
print("Number of hash functions: {}".format(bloomf.hash_count))

# words to be added
word_present = ['abound','abounds','abundance','abundant','accessable',
                'bloom','blossom','bolster','bonny','bonus','bonuses',
                'coherent','cohesive','colorful','comely','comfort',
                'gems','generosity','generous','generously','genial']

# word not added
word_absent = ['bluff','cheater','hate','war','humanity',
               'racism','hurt','nuke','gloomy','facebook',
               'geeksforgeeks','twitter']

for item in word_present:
    bloomf.add(item)

shuffle(word_present)
shuffle(word_absent)

test_words = word_present[:10] + word_absent
shuffle(test_words)

for word in test_words:
    if bloomf.check(word):
        if word in word_absent:
            print("'{}' is a false positive!".format(word))
        else:
            print("'{}' is probably present!".format(word))
    else:
        print("'{}' is definitely not present!".format(word))

Size of bit array: 124
False positive Probability: 0.05
Number of hash functions: 4
'hate' is definitely not present!
'accessable' is probably present!
'bluff' is a false positive!
'hurt' is a false positive!
'bonus' is probably present!
'abound' is probably present!
'comfort' is probably present!
'genial' is probably present!
'war' is definitely not present!
'cheater' is definitely not present!
'twitter' is definitely not present!
'abounds' is probably present!
'gems' is probably present!
'bolster' is probably present!
'gloomy' is definitely not present!
'cohesive' is probably present!
'nuke' is definitely not present!
'geeksforgeeks' is definitely not present!
'facebook' is definitely not present!
'blossom' is probably present!
'racism' is definitely not present!
'humanity' is definitely not present!
