---
**<font color='red'> If you have issues with pyhash (possible on a Mac), run this notebook in Google Colab. </font>**

---

In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel (a bare `!pip` may target a different Python installation).
%pip install pyhash

In [30]:
import pyhash
import math
import numpy as np

#### 1. Start by creating a bit vector bit_vector of size 20 and initialize it to all zeros.

In [15]:
# Empty Bloom filter: 20 slots, each one cleared to 0.0
bit_vector = np.zeros(shape=(20,))
print(bit_vector)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


#### 2. Define two hash functions, first_hasher and second_hasher. For example, you can install the package pyhash (pypi.org/project/pyhash/) and use the MurmurHash and FNV (Fowler-Noll-Vo) hash algorithms.

In [13]:
# Define the two hash functions used by the Bloom filter.
# Each hasher is a callable: hasher("text") returns an integer, which the
# cells below reduce modulo the filter size to obtain a bit position.
first_hasher = pyhash.murmur3_32()   # MurmurHash3 hasher provided by pyhash
second_hasher = pyhash.fnv1_32()     # FNV-1 hasher provided by pyhash



MurmurHash of data: 1777475617
FNV hash of data: 3316498663


#### 3. Calculate the output of first_hasher and second_hasher modulus 20 for Pikachu and Charmander and print them.

In [25]:
# Bit positions for each Pokemon: hash the name with both hashers, then reduce
# modulo the filter size (20) so the result is a valid index into bit_vector.
first_hash_pikachu = first_hasher("Pikachu") % 20
second_hash_pikachu = second_hasher("Pikachu") % 20

first_hash_charmander = first_hasher("Charmander") % 20
second_hash_charmander = second_hasher("Charmander") % 20

# first_hasher is the Murmur hasher and second_hasher is the FNV hasher,
# so the labels must follow that order.
print(f"Murmur hash output for Pikachu: {first_hash_pikachu}")
print(f"FNV hash output for Pikachu: {second_hash_pikachu}")
print(f"Murmur hash output for Charmander: {first_hash_charmander}")
print(f"FNV hash output for Charmander: {second_hash_charmander}")

FNV hash output for Pikachu: 15
Murmur hash output for Pikachu: 19
FNV hash output for Charmander: 6
Murmur hash output for Charmander: 3


#### 4. Flip the bits of bit_vector in the corresponding locations from the above hashes.

In [18]:
# Mark every position produced by the two hashers for both Pokemon.
for position in (
    first_hash_pikachu,
    second_hash_pikachu,
    first_hash_charmander,
    second_hash_charmander,
):
    bit_vector[position] = 1
print(bit_vector)

[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.]


#### 5. A wild Bulbasaur appears! Use your filter to check if Bulbasaur has already been caught. If not, update the filter.

In [24]:
# Calculate the hashed values for "Bulbasaur": hash with both hashers and
# reduce modulo the filter size (20) to get indices into bit_vector.
first_hash_bulbasaur = first_hasher("Bulbasaur") % 20
second_hash_bulbasaur = second_hasher("Bulbasaur") % 20

# first_hasher is the Murmur hasher and second_hasher is the FNV hasher,
# so the labels must follow that order.
print(f"Murmur hash output for Bulbasaur: {first_hash_bulbasaur}")
print(f"FNV hash output for Bulbasaur: {second_hash_bulbasaur}")

FNV hash output for Bulbasaur: 6
Murmur hash output for Bulbasaur: 3


In [21]:
# Check the corresponding elements in bit_vector
if bit_vector[first_hash_bulbasaur] == 0:
    print("Bulbasaur not caught")
    bit_vector[first_hash_bulbasaur] = 1

if bit_vector[second_hash_bulbasaur] == 0:
    print("Bulbasaur not caught")
    bit_vector[second_hash_bulbasaur] = 1

In [23]:
# Display the current contents of bit_vector.
# (This cell only prints; it does not modify the filter.)
print(bit_vector)

[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.]


In [None]:
# Print bit_vector once more to inspect its current state
print(bit_vector)

#### 6. Assume that you have caught the following pokemons... Write a loop which allows you to update the filter. Print it out. What do you observe?

In [27]:
# Names of the Pokemon caught so far (72 entries); the spellings are kept
# exactly as given because each string is the hash input.
caught_pokemon = [
    "Bulbasaur", "Ivysaur", "Venusaur", "Charmander", "Charmeleon", "Charizard",
    "Squirtle", "Wartortle", "Blastoise", "Caterpie", "Metapod", "Butterfree",
    "Weedle", "Kakuna", "Beedrill", "Pidgeotto", "Pidgeot", "Rattata",
    "Raticate", "Spearow", "Fearow", "Ekans", "Arbok", "Pikachu",
    "Raichu", "Sandshrew", "Sandslash", "Nidoran (female)", "Nidorina", "Nidoqueen",
    "Nidoran (male)", "Nidorino", "Nidoking", "Clefairy", "Clefable", "Vulpix",
    "Ninetales", "Jigglypuff", "Wigglytuff", "Zubat", "Golbat", "Oddish",
    "Gloom", "Vileplume", "Paras", "Parasect", "Venonat", "Venomoth",
    "Diglet", "Dugtrio", "Meowth", "Persian", "Psyduck", "Golduck",
    "Mankey", "Primeape", "Growlithe", "Arcanine", "Poliwag", "Poliwhirl",
    "Poliwrath", "Abra", "Kadabra", "Alakazam", "Machop", "Machoke",
    "Machamp", "Bellsprout", "Weepinbell", "Victreebel", "Tentacool", "Tentacruel",
]

In [29]:
# Start over with a fresh, all-zero 20-slot filter
pokedex = np.zeros(20)

# Register each caught Pokemon: every hasher selects one slot (hash modulo 20)
# and that slot is set to 1
for pokemon in caught_pokemon:
    for hasher in (first_hasher, second_hasher):
        pokedex[hasher(pokemon) % 20] = 1

print(pokedex)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


#### 7. Assuming that the total number of Pokemon one can encounter is 151, how big does your Bloom filter need to be? Also determine how many hash functions we should use.

In [31]:
def p_false_positive(k: int, n: int, m: int) -> float:
    """Estimate the false-positive probability of a Bloom filter.

    Evaluates (1 - (1 - 1/n)**(k*m))**k.

    Args:
        k: Number of hash functions (the notebook also passes a non-integer
            value here; the formula is evaluated as-is).
        n: Size of the Bloom filter (number of slots).
        m: Number of items stored in the filter.

    Returns:
        The estimated probability that a lookup of an item that was never
        inserted finds all k of its slots set.
    """
    # Probability that one given slot is still 0 after k*m slot writes
    slot_still_zero = (1 - 1/n) ** (k * m)
    # A false positive needs all k probed slots to be set
    return (1 - slot_still_zero) ** k

def n_optimal_hash(n: int, m: int) -> float:
    """Return the optimal number of hash functions, (n / m) * ln(2).

    Args:
        n: Size of the Bloom filter (number of slots).
        m: Number of items stored in the filter.

    Returns:
        The optimum as a float; the value is not rounded to an integer.
    """
    slots_per_item = n / m
    return slots_per_item * np.log(2)

In [37]:

# Sizing experiment: n = size of the Bloom filter, m = number of Pokemon stored
n, m = 3000, 151
# Number of hash functions suggested by the formula (left unrounded)
k = n_optimal_hash(n, m)
# Resulting false-positive probability (displayed as the cell output)
p_false_positive(k, n, m)

7.164171486604895e-05

In [33]:
# Display the (unrounded) optimal number of hash functions for the current n and m
n_optimal_hash(n, m)

13.771136037614808

#### 8. We can try your new Bloom filter with the following set:

In [38]:
# Names of the caught Pokemon for the larger experiment (150 entries); the
# spellings are kept exactly as given because each string is the hash input.
caught_pokemon = [
    "Bulbasaur", "Ivysaur", "Venusaur", "Charmander", "Charmeleon", "Charizard",
    "Squirtle", "Wartortle", "Blastoise", "Caterpie", "Metapod", "Butterfree",
    "Weedle", "Kakuna", "Beedrill", "Pidgeotto", "Pidgeot", "Rattata",
    "Raticate", "Spearow", "Fearow", "Ekans", "Arbok", "Pikachu",
    "Raichu", "Sandshrew", "Sandslash", "Nidoran (female)", "Nidorina", "Nidoqueen",
    "Nidoran (male)", "Nidorino", "Nidoking", "Clefairy", "Clefable", "Vulpix",
    "Ninetales", "Jigglypuff", "Wigglytuff", "Zubat", "Golbat", "Oddish",
    "Gloom", "Vileplume", "Paras", "Parasect", "Venonat", "Venomoth",
    "Diglet", "Dugtrio", "Meowth", "Persian", "Psyduck", "Golduck",
    "Mankey", "Primeape", "Growlithe", "Arcanine", "Poliwag", "Poliwhirl",
    "Poliwrath", "Abra", "Kadabra", "Alakazam", "Machop", "Machoke",
    "Machamp", "Bellsprout", "Weepinbell", "Victreebel", "Tentacool", "Tentacruel",
    "Geodude", "Graveler", "Golem", "Ponyta", "Rapidash", "Slowpoke",
    "Slowbro", "Magnemite", "Magneton", "Farfetc'd", "Doduo", "Dodrio",
    "Seel", "Dewgong", "Grimer", "Muk", "Shellder", "Cloyster",
    "Gastly", "Haunter", "Gengar", "Onyx", "Drowzee", "Hypno",
    "Krabby", "Kingler", "Voltorb", "Electrode", "Exeggcute", "Exeggutor",
    "Cubone", "Marowak", "Hitmonlee", "Hitmonchan", "Lickitung", "Koffing",
    "Weezing", "Rhyhorn", "Rhydon", "Chansey", "Tangela", "Kangaskhan",
    "Horsea", "Seadra", "Goldeen", "Seaking", "Staryu", "Starmie",
    "Mr. Mime", "Scyther", "Jynx", "Electabuzz", "Magmar", "Pinsir",
    "Tauros", "Magikarp", "Gyrados", "Lapras", "Ditto", "Eevee",
    "Vaporeon", "Jolteon", "Flareon", "Porygon", "Omanyte", "Omastar",
    "Kabuto", "Kabutops", "Aerodactyl", "Snorlax", "Articuno", "Zapdos",
    "Moltres", "Dratini", "Dragonair", "Dragonite", "Mewtwo", "Mew",
]

In [39]:
# Parameters of the larger filter
N = 3000                        # number of slots in the filter
M = len(set(caught_pokemon))    # number of distinct names in caught_pokemon
K = 2                           # presumably the number of hash functions in use -- TODO confirm

In [40]:
# Fresh, all-zero filter with N slots
pokedex = np.zeros(N)

# Insert every caught Pokemon: each hasher picks one slot (hash modulo N)
# and both slots are set to 1
for pokemon in caught_pokemon:
    first_hash, second_hash = first_hasher(pokemon) % N, second_hasher(pokemon) % N
    pokedex[[first_hash, second_hash]] = 1


print(pokedex)

[0. 0. 0. ... 0. 0. 0.]


In [42]:
# Membership query: hash the name with both hashers and look up the two slots.
# Both slots at 1.0 means "possibly caught"; a 0.0 in either slot means
# the name was never inserted.

pokemon = 'Squirtle'

# NOTE: despite the *_pidgey names, these hold the slots for whatever
# `pokemon` is set to above.
first_hash_pidgey, second_hash_pidgey = (
    first_hasher(pokemon) % N,
    second_hasher(pokemon) % N,
)

for slot in (first_hash_pidgey, second_hash_pidgey):
    print(pokedex[slot])

1.0
1.0
