# BDA Lab 2 (Bloom Filter)
> Name: Debatreya Das<br>
> Roll No. 12212070 <br>
> CS A4

## The Bloom Filter class

In [2]:
!pip install mmh3

Collecting mmh3
  Downloading mmh3-5.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading mmh3-5.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (99 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mmh3
Successfully installed mmh3-5.1.0


In [4]:
!pip install bitarray

Collecting bitarray
  Downloading bitarray-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Downloading bitarray-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (278 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.3/278.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: bitarray
Successfully installed bitarray-3.0.0


In [5]:
import math
import mmh3        # a fast non-cryptographic hash library
from bitarray import bitarray
import hashlib
import numpy as np
import pandas as pd

# A simple Bloom Filter implementation:
class BloomFilter:
    def __init__(self, items_count, fp_prob):
        """
        items_count : int
            Number of items expected to be stored in bloom filter
        fp_prob : float
            False Positive probability in decimal
        """
        self.fp_prob = fp_prob
        self.size = self.get_size(items_count, fp_prob)
        self.hash_count = self.get_hash_count(self.size, items_count)
        self.bit_array = bitarray(self.size)
        self.bit_array.setall(0)
        
    def add(self, item):
        """Add an item (a string) to the bloom filter."""
        for i in range(self.hash_count):
            # Use mmh3.hash with different seeds (i)
            digest = mmh3.hash(item, i) % self.size
            self.bit_array[digest] = True
            
    def check(self, item):
        """
        Check for membership of an item in the bloom filter.
        Returns True if the item might be present (with fp_prob chance of false positive)
        and False if the item is definitely not present.
        """
        for i in range(self.hash_count):
            digest = mmh3.hash(item, i) % self.size
            if not self.bit_array[digest]:
                return False
        return True

    @classmethod
    def get_size(cls, n, p):
        """
        Return the size of bit array(m) to used using:
            m = -(n * ln(p)) / (ln(2)^2)
        n : int (number of items)
        p : float (false positive probability)
        """
        m = -(n * math.log(p))/(math.log(2)**2)
        return int(m)

    @classmethod
    def get_hash_count(cls, m, n):
        """
        Return the number of hash functions(k) to be used using:
            k = (m/n) * ln(2)
        """
        k = (m/n) * math.log(2)
        return int(k)

## Functions for converting text to vectors

### Using GloVe

In [10]:
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Example usage:
glove_path = "/kaggle/input/glove6b100dtxt/glove.6B.100d.txt"
glove_embeddings = load_glove_embeddings(glove_path)
glove_dim = 100

### Converting a tweet into a vector
average the embedding vectors for each word in the tweet

In [19]:
def tweet_to_vector(tweet, embeddings, dim):
    """
    Convert a tweet (email text) to an average vector using given embeddings.
    If no words are found in the embeddings, return a zero vector.
    """
    tweet = str(tweet)
    
    words = tweet.split()  # Simple tokenization.
    vectors = [embeddings[word.lower()] for word in words if word.lower() in embeddings]
    
    if len(vectors) == 0:
        return np.zeros(dim)
    else:
        return np.mean(vectors, axis=0)


### Converting a vector to a hashable string

In [13]:
def vector_to_hash(vector):
    """
    Convert a numpy vector to a hashable hex string.
    """
    vector_bytes = vector.tobytes()
    return hashlib.sha256(vector_bytes).hexdigest()


## Load and Preprocess Dataset

In [20]:
# Load the dataset
df = pd.read_csv('/kaggle/input/spam-or-not-spam-dataset/spam_or_not_spam.csv')

In [22]:

# Split into spam and ham DataFrames
spam_df = df[df['label'] == 1]
ham_df = df[df['label'] == 0]

# Ensure missing values are filled and all entries are strings
df['email'] = df['email'].fillna('').astype(str)

# If you already have separate DataFrames for spam and ham:
spam_df['email'] = spam_df['email'].fillna('').astype(str)
ham_df['email'] = ham_df['email'].fillna('').astype(str)



# Preview the data:
print(df.head())
print(df['email'].dtype)  # Check the data type of the column
print("Number of Spam ",len(spam_df))
print("Number of Ham ",len(ham_df))

                                               email  label
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0
object
Number of Spam  500
Number of Ham  2500


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spam_df['email'] = spam_df['email'].fillna('').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ham_df['email'] = ham_df['email'].fillna('').astype(str)


## Build the Bloom Filter and Insert Spam Messages

In [24]:
# Define false positive probability and count of spam messages
fp_prob = 0.01
spam_count = len(spam_df)

# Ensure that the email column is cleaned
spam_df['email'] = spam_df['email'].fillna('').astype(str)

# Create a Bloom Filter instance for GloVe embeddings:
bf_glove = BloomFilter(spam_count, fp_prob)

# Insert each spam email into the Bloom Filter
for tweet in spam_df['email']:
    vec = tweet_to_vector(tweet, glove_embeddings, glove_dim)
    tweet_hash = vector_to_hash(vec)
    bf_glove.add(tweet_hash)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spam_df['email'] = spam_df['email'].fillna('').astype(str)


## Evaluate the Filter: Compute TPR and FPR

### True Positive Rates

In [26]:
spam_tp = 0
for tweet in spam_df['email']:
    vec = tweet_to_vector(tweet, glove_embeddings, glove_dim)
    tweet_hash = vector_to_hash(vec)
    if bf_glove.check(tweet_hash):
        spam_tp += 1

spam_tpr = spam_tp / len(spam_df)
print("GloVe-based Filter:")
print("True Positive Rate (Spam correctly flagged):", spam_tpr)


GloVe-based Filter:
True Positive Rate (Spam correctly flagged): 1.0


### False Positive Rates

In [27]:
ham_fp = 0
for tweet in ham_df['email']:
    vec = tweet_to_vector(tweet, glove_embeddings, glove_dim)
    tweet_hash = vector_to_hash(vec)
    if bf_glove.check(tweet_hash):
        ham_fp += 1

ham_fpr = ham_fp / len(ham_df)
print("False Positive Rate (Ham incorrectly flagged):", ham_fpr)


False Positive Rate (Ham incorrectly flagged): 0.0032


## Results 
using GloVe embeddings

- True Positive Rate (Spam correctly flagged): 1.0
- False Positive Rate (Ham incorrectly flagged): 0.0032