In [None]:
class Murmur3:
    """Pure-Python MurmurHash3 in 32-bit, 64-bit and 128-bit flavours.

    Python integers are unbounded, so every multiplication, addition and
    rotation is explicitly reduced modulo 2**32 or 2**64 to emulate the
    fixed-width arithmetic the algorithm is defined on.  All results are
    returned as *unsigned* integers of the advertised width.

    ``data`` may be any indexable sequence of integers (``bytes``,
    ``bytearray``, a list of ints, ...); only the low 8 bits of each element
    are used.

    ``hash32`` follows the reference ``MurmurHash3_x86_32`` routine and
    ``hash128`` follows ``MurmurHash3_x64_128``.  ``hash64`` is a single-lane
    64-bit variant built from the 128-bit constants.
    """

    # Constants for 32-bit variant
    C1_32 = 0xcc9e2d51
    C2_32 = 0x1b873593
    R1_32 = 15
    R2_32 = 13
    M_32 = 5
    N_32 = 0xe6546b64

    # Constants for 128-bit variant
    C1 = 0x87c37b91114253d5
    C2 = 0x4cf5ad432745937f
    R1 = 31
    R2 = 27
    R3 = 33
    M = 5
    N1 = 0x52dce729
    N2 = 0x38495ab5

    DEFAULT_SEED = 0

    # Masks used to emulate fixed-width unsigned arithmetic.
    _MASK_32 = 0xffffffff
    _MASK_64 = 0xffffffffffffffff

    @staticmethod
    def _rotl32(value, shift):
        """Rotate the low 32 bits of ``value`` left by ``shift`` bits."""
        value &= Murmur3._MASK_32
        return ((value << shift) | (value >> (32 - shift))) & Murmur3._MASK_32

    @staticmethod
    def _rotl64(value, shift):
        """Rotate the low 64 bits of ``value`` left by ``shift`` bits."""
        value &= Murmur3._MASK_64
        return ((value << shift) | (value >> (64 - shift))) & Murmur3._MASK_64

    @staticmethod
    def _read_le(data, start, count):
        """Assemble ``count`` elements of ``data`` into a little-endian int.

        Each element is masked to 8 bits so signed byte values behave like
        their unsigned counterparts.
        """
        word = 0
        for offset in range(count):
            word |= (data[start + offset] & 0xff) << (offset << 3)
        return word

    @staticmethod
    def _mix_k32(k):
        """Scramble one 32-bit input word before it is folded into the hash."""
        k = (k * Murmur3.C1_32) & Murmur3._MASK_32
        k = Murmur3._rotl32(k, Murmur3.R1_32)
        return (k * Murmur3.C2_32) & Murmur3._MASK_32

    @staticmethod
    def _mix_k1(k):
        """Scramble the first 64-bit lane word (C1, rotate R1, C2)."""
        k = (k * Murmur3.C1) & Murmur3._MASK_64
        k = Murmur3._rotl64(k, Murmur3.R1)
        return (k * Murmur3.C2) & Murmur3._MASK_64

    @staticmethod
    def _mix_k2(k):
        """Scramble the second 64-bit lane word (C2, rotate R3, C1)."""
        k = (k * Murmur3.C2) & Murmur3._MASK_64
        k = Murmur3._rotl64(k, Murmur3.R3)
        return (k * Murmur3.C1) & Murmur3._MASK_64

    @staticmethod
    def hash32(data):
        """Return the unsigned 32-bit hash of all of ``data`` (default seed)."""
        return Murmur3.hash32_with_seed(data, len(data), Murmur3.DEFAULT_SEED)

    @staticmethod
    def hash32_with_seed(data, length, seed):
        """Return the unsigned 32-bit hash of the first ``length`` elements.

        Args:
            data: Indexable sequence of byte values.
            length: Number of leading elements of ``data`` to hash.
            seed: Initial hash state; reduced to 32 bits.

        Returns:
            An int in ``range(2**32)``.
        """
        mask = Murmur3._MASK_32
        hash_value = seed & mask
        nblocks = length >> 2

        # Body: consume the input four bytes at a time.
        for block in range(nblocks):
            hash_value ^= Murmur3._mix_k32(Murmur3._read_le(data, block << 2, 4))
            hash_value = Murmur3._rotl32(hash_value, Murmur3.R2_32)
            hash_value = (hash_value * Murmur3.M_32 + Murmur3.N_32) & mask

        # Tail: the 1-3 trailing bytes are mixed in only when present.
        tail_start = nblocks << 2
        remaining = length - tail_start
        if remaining > 0:
            hash_value ^= Murmur3._mix_k32(
                Murmur3._read_le(data, tail_start, remaining)
            )

        # Finalization: avalanche so every input bit affects every output bit.
        hash_value = (hash_value ^ length) & mask
        hash_value ^= hash_value >> 16
        hash_value = (hash_value * 0x85ebca6b) & mask
        hash_value ^= hash_value >> 13
        hash_value = (hash_value * 0xc2b2ae35) & mask
        hash_value ^= hash_value >> 16

        return hash_value

    @staticmethod
    def hash64(data):
        """Return the unsigned 64-bit hash of all of ``data`` (default seed)."""
        return Murmur3.hash64_with_seed(data, len(data), Murmur3.DEFAULT_SEED)

    @staticmethod
    def hash64_with_seed(data, length, seed):
        """Return the unsigned 64-bit hash of the first ``length`` elements.

        Args:
            data: Indexable sequence of byte values.
            length: Number of leading elements of ``data`` to hash.
            seed: Initial hash state; reduced to 64 bits.

        Returns:
            An int in ``range(2**64)``.
        """
        mask = Murmur3._MASK_64
        hash_value = seed & mask
        nblocks = length >> 3

        # Body: consume the input eight bytes at a time.
        for block in range(nblocks):
            hash_value ^= Murmur3._mix_k1(Murmur3._read_le(data, block << 3, 8))
            hash_value = Murmur3._rotl64(hash_value, Murmur3.R2)
            hash_value = (hash_value * Murmur3.M + Murmur3.N1) & mask

        # Tail: the 1-7 trailing bytes are mixed in only when present.
        tail_start = nblocks << 3
        remaining = length - tail_start
        if remaining > 0:
            hash_value ^= Murmur3._mix_k1(
                Murmur3._read_le(data, tail_start, remaining)
            )

        # Finalization
        hash_value = (hash_value ^ length) & mask
        return Murmur3.fmix64(hash_value)

    @staticmethod
    def hash128(data):
        """Return the 128-bit hash of all of ``data`` as ``[h1, h2]``."""
        return Murmur3.hash128_with_seed(data, len(data), Murmur3.DEFAULT_SEED)

    @staticmethod
    def hash128_with_seed(data, length, seed):
        """Return the 128-bit hash of the first ``length`` elements.

        Args:
            data: Indexable sequence of byte values.
            length: Number of leading elements of ``data`` to hash.
            seed: Initial state for both lanes; reduced to 64 bits.

        Returns:
            A two-element list ``[h1, h2]`` of ints in ``range(2**64)``.
        """
        mask = Murmur3._MASK_64
        h1 = seed & mask
        h2 = seed & mask
        nblocks = length >> 4

        # Body: each 16-byte block feeds one 8-byte word into each lane.
        for block in range(nblocks):
            base = block << 4

            h1 ^= Murmur3._mix_k1(Murmur3._read_le(data, base, 8))
            h1 = Murmur3._rotl64(h1, Murmur3.R2)
            h1 = (h1 + h2) & mask
            h1 = (h1 * Murmur3.M + Murmur3.N1) & mask

            h2 ^= Murmur3._mix_k2(Murmur3._read_le(data, base + 8, 8))
            h2 = Murmur3._rotl64(h2, Murmur3.R1)
            h2 = (h2 + h1) & mask
            h2 = (h2 * Murmur3.M + Murmur3.N2) & mask

        # Tail: bytes 9-15 (if any) go to lane 2, bytes 1-8 to lane 1.
        tail_start = nblocks << 4
        remaining = length - tail_start
        if remaining > 8:
            h2 ^= Murmur3._mix_k2(
                Murmur3._read_le(data, tail_start + 8, remaining - 8)
            )
        if remaining > 0:
            h1 ^= Murmur3._mix_k1(
                Murmur3._read_le(data, tail_start, min(remaining, 8))
            )

        # Finalization: cross-mix the lanes around the avalanche step.
        h1 = (h1 ^ length) & mask
        h2 = (h2 ^ length) & mask

        h1 = (h1 + h2) & mask
        h2 = (h2 + h1) & mask

        h1 = Murmur3.fmix64(h1)
        h2 = Murmur3.fmix64(h2)

        h1 = (h1 + h2) & mask
        h2 = (h2 + h1) & mask

        return [h1, h2]

    @staticmethod
    def fmix64(h):
        """Apply the 64-bit finalization mix to ``h`` and return the result.

        ``h`` is reduced to 64 bits first; the result is in ``range(2**64)``.
        """
        mask = Murmur3._MASK_64
        h &= mask
        h ^= h >> 33
        h = (h * 0xff51afd7ed558ccd) & mask
        h ^= h >> 33
        h = (h * 0xc4ceb9fe1a85ec53) & mask
        h ^= h >> 33
        return h



In [1]:
import math
import struct


class CountMinSketch:
    """Count-Min sketch: a fixed-size table of counters for frequency estimates.

    The sketch holds ``depth`` rows of ``width`` counters.  Every key selects
    one counter per row; updates add to all of them and a query returns the
    smallest, so an estimate is never below the amount actually added for
    that key (given non-negative increments).

    Keys are hashed with ``Murmur3.hash64``; the typed ``*_string``,
    ``*_int``, ... helpers only convert their argument to bytes first.
    """

    DEFAULT_DELTA = 0.01
    DEFAULT_EPSILON = 0.01

    # struct layouts shared by serialize(), deserialize() and
    # get_size_in_bytes(): a (width, depth) header followed by one counter
    # per cell in row-major order.
    _HEADER_FORMAT = "ii"
    _COUNTER_FORMAT = "q"

    def __init__(self, delta=None, epsilon=None, width=None, depth=None, multiset=None):
        """Create a sketch from error bounds or from explicit dimensions.

        Args:
            delta: Failure probability used to derive ``depth`` when ``depth``
                is not given; defaults to ``DEFAULT_DELTA``.
            epsilon: Relative error used to derive ``width`` when ``width``
                is not given; defaults to ``DEFAULT_EPSILON``.
            width: Explicit number of counters per row.
            depth: Explicit number of rows.
            multiset: Existing ``depth`` x ``width`` counter table to adopt;
                a zero-filled table is created when omitted.
        """
        # Each missing parameter falls back independently, so supplying only
        # one of (delta, epsilon) or one of (width, depth) is valid.
        if delta is None:
            delta = CountMinSketch.DEFAULT_DELTA
        if epsilon is None:
            epsilon = CountMinSketch.DEFAULT_EPSILON

        if width is None:
            width = math.ceil(math.exp(1.0) / epsilon)
        if depth is None:
            depth = math.ceil(math.log(1.0 / delta))

        if multiset is None:
            multiset = [[0] * width for _ in range(depth)]

        self.w = width
        self.d = depth
        self.multiset = multiset

    def get_width(self):
        """Return the number of counters per row."""
        return self.w

    def get_depth(self):
        """Return the number of rows."""
        return self.d

    def get_size_in_bytes(self):
        """Return the number of bytes ``serialize`` produces for this sketch."""
        header_size = struct.calcsize(CountMinSketch._HEADER_FORMAT)
        counter_size = struct.calcsize(CountMinSketch._COUNTER_FORMAT)
        return header_size + self.w * self.d * counter_size

    def _positions(self, key):
        """Yield ``(row, column)`` for the counter ``key`` selects in each row.

        The 64-bit hash is split into two halves and row ``i`` (1-based) uses
        ``hash1 + i * hash2``, so a single hash evaluation serves every row.
        """
        hash64 = Murmur3.hash64(key)
        hash1 = hash64 & 0xFFFFFFFF
        hash2 = hash64 >> 32

        for row in range(self.d):
            combined_hash = hash1 + ((row + 1) * hash2)
            # Flip the bits of a negative value so the index stays the same
            # as it would be for the corresponding non-negative hash.
            if combined_hash < 0:
                combined_hash = ~combined_hash
            yield row, combined_hash % self.w

    def set(self, key, inc):
        """Add ``inc`` to the counters selected by the bytes-like ``key``."""
        for row, pos in self._positions(key):
            self.multiset[row][pos] += inc

    def set_string(self, val, inc):
        """Add ``inc`` to the count of the string ``val`` (encoded with ``str.encode``)."""
        self.set(val.encode(), inc)

    @staticmethod
    def int_to_byte_array_le(val):
        """Return ``val`` as 4 little-endian signed bytes."""
        return val.to_bytes(4, "little", signed=True)

    @staticmethod
    def long_to_byte_array_le(val):
        """Return ``val`` as 8 little-endian signed bytes."""
        return val.to_bytes(8, "little", signed=True)

    def get_estimated_count(self, key):
        """Return the smallest counter selected by the bytes-like ``key``.

        Returns ``float("inf")`` when the sketch has no rows.
        """
        return min(
            (self.multiset[row][pos] for row, pos in self._positions(key)),
            default=float("inf"),
        )

    def get_estimated_count_string(self, val):
        """Return the estimated count of the string ``val``."""
        return self.get_estimated_count(val.encode())

    def get_estimated_count_byte(self, val):
        """Return the estimated count of the single byte value ``val``."""
        return self.get_estimated_count(bytes([val]))

    def get_estimated_count_int(self, val):
        """Return the estimated count of ``val`` keyed as a 4-byte integer."""
        return self.get_estimated_count(CountMinSketch.int_to_byte_array_le(val))

    def get_estimated_count_long(self, val):
        """Return the estimated count of ``val`` keyed as an 8-byte integer."""
        return self.get_estimated_count(CountMinSketch.long_to_byte_array_le(val))

    def get_estimated_count_float(self, val):
        """Return the estimated count of ``val`` keyed by its 32-bit float bit pattern."""
        return self.get_estimated_count_int(struct.unpack("i", struct.pack("f", val))[0])

    def get_estimated_count_double(self, val):
        """Return the estimated count of ``val`` keyed by its 64-bit float bit pattern."""
        return self.get_estimated_count_long(struct.unpack("q", struct.pack("d", val))[0])

    def merge(self, other):
        """Add every counter of ``other`` into this sketch, in place.

        ``other`` may be ``None``, in which case nothing happens.

        Raises:
            RuntimeError: If the two sketches differ in width or depth.
        """
        if other is None:
            return

        if self.w != other.w:
            raise RuntimeError("Merge failed! Width of CountMinSketch does not match!")
        if self.d != other.d:
            raise RuntimeError("Merge failed! Depth of CountMinSketch does not match!")

        for i in range(self.d):
            own_row = self.multiset[i]
            other_row = other.multiset[i]
            for j in range(self.w):
                own_row[j] += other_row[j]

    @staticmethod
    def serialize(cms):
        """Return ``cms`` encoded as bytes.

        Layout: width and depth packed as ``"ii"``, then every counter packed
        as ``"q"`` in row-major order (native byte order in both cases).
        """
        width = cms.get_width()
        depth = cms.get_depth()
        pack_counter = struct.Struct(CountMinSketch._COUNTER_FORMAT).pack

        bb = bytearray(struct.pack(CountMinSketch._HEADER_FORMAT, width, depth))
        for i in range(depth):
            row = cms.multiset[i]
            for j in range(width):
                bb.extend(pack_counter(row[j]))
        return bytes(bb)

    @staticmethod
    def deserialize(serialized):
        """Rebuild a sketch from bytes produced by ``serialize``."""
        width, depth = struct.unpack_from(CountMinSketch._HEADER_FORMAT, serialized)
        offset = struct.calcsize(CountMinSketch._HEADER_FORMAT)
        counter = struct.Struct(CountMinSketch._COUNTER_FORMAT)

        multiset = [[0] * width for _ in range(depth)]
        for i in range(depth):
            for j in range(width):
                multiset[i][j], = counter.unpack_from(serialized, offset)
                offset += counter.size
        return CountMinSketch(width=width, depth=depth, multiset=multiset)

    

In [2]:
# Fixed-size sketch: 5 rows of 100 counters each.
cms = CountMinSketch(depth=5, width=100)


In [35]:
# Record each fruit together with the number of occurrences to add.
for fruit, occurrences in (("apple", 1), ("banana", 5), ("orange", 8)):
    cms.set_string(fruit, occurrences)

# Query the sketch for the same keys, in the same order.
count1, count2, count3 = (
    cms.get_estimated_count_string(name)
    for name in ("apple", "banana", "orange")
)

# One estimate per line: "apple", then "banana", then "orange".
print(count1, count2, count3, sep="\n")

1
5
8


10
5
8
