# Current GEMstone bloom filter implementation

In [26]:
"""Bloom filter for genetic sequences. Stored as unsigned character array."""

import sys
import numpy as np
import pybloom_live
from BitVector import BitVector
from array import array
from collections import defaultdict
from phe import paillier
import time

In [27]:
#BLOOM FILTER DEFAULTS
K = 16
H = hash
HASH_MAX = sys.maxsize + 1
SIZE = 12000

def encode(gene, size=SIZE, k=K, h=H):
    """Creates a bloom filter. Used to encode a genetic sequence.
    Args:
        gene: A string holding all or part of a DNA sequence.
        size: The size of the bloom filter. Set to the default size if
            no size is given.
        k: The size of the k-mer in the filter (how many nucleotides
            [characters] are encoded at once). Set to the default k if no
            k is given.
        h: The hash used to encode each k-mer entered in the bloom filter.
            Set to the default hash if no hash is given.
    Returns:
        The corresponding bloom filter. An array where each each entry a hashed
        k-mer maps to is a one and all other entries are zero.
        The number of of unique k-mers in the gene.
    """
    bf = initialize_bloom_filter(size)
    gene = gene.upper()                         # Make gene all uppercase.

    # Loop through all k-mers for gene.
    for n in range(0, len(gene)-k + 1):
        # Get k-mer of length k and hash it.
        k_mer = gene[n:n + k]                   # TODO: ignore case, 'N's?
        k_hash = (h(k_mer) + HASH_MAX) % size   # Make hash positive and within
                                                # size of filter.

        # Set entry the bloom filter corresponding to the hashed k-mer to one.
        bf[k_hash] = 1

    return bf

def initialize_bloom_filter(size=SIZE):
    """ Creates empty bloom filter.
    Args:
        size: The size of the bloom filter. Set to the default size if
            no size is given.
    Returns:
        An unsigned character array of the given size where each entry is 0.
        The empty bloom filter.
    """
    bf = array('b',[0])
    bf = bf * size
    return bf

def tostring(bf):
    """ Prints bloom filter on one line.
    Args:
        bf: The bloom filter.
    """
    for x in bf:
        print(x, end='')
    print()
    
def dotproduct(v1, v2):
    """Finds the dot product of two vectors (arrays). The first vector must
    be binary.

    Args:
        v1: A binary vector (array).
        v2: A vector (array).

    Returns:
        The dot product of the two vectors.
    """

    dot = 0
    for i in range(0, len(v1)):
        if v1[i] == 1:
            dot += v2[i]

    return dot

In [28]:
start = time.time()
query = 'CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCATAGCCCATATATGGAGTTCCGCGTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACGGTAAACTGCCCACTTGGCAGTACATCAAGTGTATCATATGCCAAGTACGCCCCCTATTGACGTCAATGACGGTAAATGGCCCGCCTGGCATTATGCCCAGTACATGACCTTATGGGACTTTCCTACTTGGCAGTACATCTACGTATTAGTCATCGCTATTACCATGGTGATGCGGTTTTGGCAGTACATCAATGGGCGTGGATAGCGGTTTGACTCACGGGGATTTCCAAGTCTCCACCCCATTGACGTCAATGGGAGTTTGTTTTGGCACCAAAATCAACGGGACTTTCCAAAATGTCGTAACAACTCCGCCCCATTGACGCAAATGGGCGGTAGGCGTGTACGGTGGGAGGTCTATATAAGCAGAGCTGGTTTAGTGAACCGTCAGATCCGCTAGCGCTACCGGTCGCCACCATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTCCGGACTCAGATCTCGAGCTCAAGCTTCGAATTCTGCAGTCGACTCATTCGGGAGCTGGATGGCTTGGGACATGTGCAGCCAAGACTCTGTATGGAGTGACATAGAGTGTGCTGCTCTGGTTGGTGAGGACCAGCCTCTTTGCCCAGATCTTCCTGAACTTGACCTTTCTGAACTTGATGTGAATGACTTGGATACAGACAGCTTTCTGGGTGGATTGAAGTGGTGTAGCGACCAATCGGAAATCATATCCAACCAGTACAACAATGAGCCTGCGAACATATTTGAGAAGATAGATGAAGAGAATGAGGCAAACTTGCTAGCGGTCCTCACAGAGACACTGGACAGTCTCCCCGTGGATGAAGACGGATTGCCCTCATTTGATGCACTGACAGATGGAGCCGTGACCACTGACAACGAGGCCAGTCCTTCCTCCATGCCTGACGGCACCCCTCCCCCTCAGGAGGCAGAAGAGCCGTCTCTACTTAAGAAGCTCTTACTGGCACCAGCCAACACTCAGCTCAGCTACAATGAATGCAGCGGTCTTAGCACTCAGAACCATGCAGCAAACCACACCCACAGGATCAGAACAAACCCTGCCATTGTTAAGACCGAGAATTCATGGAGCAATAAAGCGAAGAGCATTTGTCAACAGCAAAAGCCACAAAGACGTCCCTGCTCAGAGCTTCTCAAGTATCTGACCACAAACGATGACCCTCCTCACACCAAACCCACAGAAAACAGGAACAGCAGCAGAGACAAATGTGCTTCCAAAAAGAAGTCCCATACACAACCGCAGTCGCAACATGCTCAAGCCAAACCAACAACTTTATCTCTTCCTCTGACCCCAGAGTCACCAAATGACCCCAAGGGTTCCCCATTTGAGAACAAGACTATTGAGCGAACCTTAAGTGTGGAACTCTCTGGAACTGCAGGCCTAACTCCTCCCACAACTCCTCCTCATAAAGCCAACCAAGATAACCCTTTCAAGGCTTCGCCAAAGCTGAAGCCCTCTTGCAAGACCGTGGTGCCACCGCCAACCAAGAGGGCCCGGTACAGTGAGTGTTCTGGTACCCAAGGCAGCCACTCCACCAAGAAAGGGCCCGAGCAATCTGAGTTGTACGCACAACTCAGCAAGTCCTCAGGGCTCAGCCGAGGACACGAGGAAAGGAAGACTAAACGGCCCAGTCTCCGGCTGTTTGGTGACCATGACTACTGTCAGTCACTCAATTCCAAAACGGATATACTCATTAACATATCACAGGAGCTCCAAGACTCTAGACAACTAGACTTCAAAGATGCCTCCTGTGACTGGCAGGGGCACATCTGTTCTTCCACAGATTCAGGCCAGTGCTACCTGAGAGAGACTTTGGAGGCCAGCAAGCAGGTCTCTCCTTGCAGCACCAGAAAACAGCTCCAAGACCAGGAAATCCGAGCGGAGCTGAACAAGCACTTCGGTCATCCCTGTCAAGCTGTGTTTGACGACAAATCAGACAAGACCAGTGAACTAAGGGATGGCGACTTCAGTAATGAACAATTCTCCAAACTACCTGTGTTTATAAATTCAGGACTAGCCATGGATGGCCTATTTGATGACAGTGAAGATGAAAGTGATAAACTGAGCTACCCTTGGGATGGCACGCAGCCCTATTCATTGTTCGATGTGTCGCCTTCTTGCTCTTCCTTTAACTCTCCGTGTCGAGACTCAGTGTCACCACCGAAATCCTTATTTTCTCAAAGACCCCAAAGGATGCGCTCTCGTTCAAGATCCTTTTCTCGACACAGGTCGTGTTCCCGATCACCATATTCCAGGTCAAGATCAAGGTCCCCAGGCAGTAGATCCTCTTCAAGATCCTGTTACTACTATGAATCAAGCCACTACAGACACCGCACACACCGCAATTCTCCCTTGTATGTGAGATCACGTTCAAGGTCACCCTACAGCCGTAGGCCCAGGTACGACAGCTATGAAGCCTATGAGCACGAAAGGCTCAAGAGGGATGAATACCGCAAAGAGCACGAGAAGCGGGAGTCTGAAAGGGCCAAACAGAGAGAGAGGCAGAAGCAGAAAGCAATTGAAGAGCGCCGTGTGATTTACGTTGGTAAAATCAGACCTGACACAACGCGGACAGAATTGAGAGACCGCTTTGAAGTTTTTGGTGAAATTGAGGAATGCACCGTAAATCTGCGGGATGATGGAGACAGCTATGGTTTCATCACCTACCGTTACACCTGTGACGCTTTCGCTGCTCTTGAGAATGGATATACTTTACGCAGGTCGAACGAAACTGACTTCGAGCTGTACTTTTGTGGACGGAAGCAATTTTTCAAGTCTAACTATGCAGACCTAGATACCAACTCAGACGATTTTGACCCTGCTTCCACCAAGAGCAAGTATGACTCTCTGGATTTTGATAGTTTACTGAAGGAAGCTCAGAGAAGCTTGCGCAGGTAACGTGTTCCCAGGCTGAGGGATGACAGGGATCCACCGGATCTAGATAACTGATCATAATCAGCCATACCACATTTGTAGAGGTTTTACTTGCTTTAAAAAACCTCCCACACCTCCCCCTGAACCTGAAACATAAAATGAATGCAATTGTTGTTGTTAACTTGTTTATTGCAGCTTATAATGGTTACAAATAAAGCAATAGCATCACAAATTTCACAAATAAAGCATTTTTTTCACTGCATTCTAGTTGTGGTTTGTCCAAACTCATCAATGTATCTTAACGCGTAAATTGTAAGCGTTAATATTTTGTTAAAATTCGCGTTAAATTTTTGTTAAATCAGCTCATTTTTTAACCAATAGGCCGAAATCGGCAAAATCCCTTATAAATCAAAAGAATAGACCGAGATAGGGTTGAGTGTTGTTCCAGTTTGGAACAAGAGTCCACTATTAAAGAACGTGGACTCCAACGTCAAAGGGCGAAAAACCGTCTATCAGGGCGATGGCCCACTACGTGAACCATCACCCTAATCAAGTTTTTTGGGGTCGAGGTGCCGTAAAGCACTAAATCGGAACCCTAAAGGGAGCCCCCGATTTAGAGCTTGACGGGGAAAGCCGGCGAACGTGGCGAGAAAGGAAGGGAAGAAAGCGAAAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAATGCGCCGCTACAGGGCGCGTCAGGTGGCACTTTTCGGGGAAATGTGCGCGGAACCCCTATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGAGACAATAACCCTGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTCCTGAGGCGGAAAGAACCAGCTGTGGAATGTGTGTCAGTTAGGGTGTGGAAAGTCCCCAGGCTCCCCAGCAGGCAGAAGTATGCAAAGCATGCATCTCAATTAGTCAGCAACCAGGTGTGGAAAGTCCCCAGGCTCCCCAGCAGGCAGAAGTATGCAAAGCATGCATCTCAATTAGTCAGCAACCATAGTCCCGCCCCTAACTCCGCCCATCCCGCCCCTAACTCCGCCCAGTTCCGCCCATTCTCCGCCCCATGGCTGACTAATTTTTTTTATTTATGCAGAGGCCGAGGCCGCCTCGGCCTCTGAGCTATTCCAGAAGTAGTGAGGAGGCTTTTTTGGAGGCCTAGGCTTTTGCAAAGATCGATCAAGAGACAGGATGAGGATCGTTTCGCATGATTGAACAAGATGGATTGCACGCAGGTTCTCCGGCCGCTTGGGTGGAGAGGCTATTCGGCTATGACTGGGCACAACAGACAATCGGCTGCTCTGATGCCGCCGTGTTCCGGCTGTCAGCGCAGGGGCGCCCGGTTCTTTTTGTCAAGACCGACCTGTCCGGTGCCCTGAATGAACTGCAAGACGAGGCAGCGCGGCTATCGTGGCTGGCCACGACGGGCGTTCCTTGCGCAGCTGTGCTCGACGTTGTCACTGAAGCGGGAAGGGACTGGCTGCTATTGGGCGAAGTGCCGGGGCAGGATCTCCTGTCATCTCACCTTGCTCCTGCCGAGAAAGTATCCATCATGGCTGATGCAATGCGGCGGCTGCATACGCTTGATCCGGCTACCTGCCCATTCGACCACCAAGCGAAACATCGCATCGAGCGAGCACGTACTCGGATGGAAGCCGGTCTTGTCGATCAGGATGATCTGGACGAAGAGCATCAGGGGCTCGCGCCAGCCGAACTGTTCGCCAGGCTCAAGGCGAGCATGCCCGACGGCGAGGATCTCGTCGTGACCCATGGCGATGCCTGCTTGCCGAATATCATGGTGGAAAATGGCCGCTTTTCTGGATTCATCGACTGTGGCCGGCTGGGTGTGGCGGACCGCTATCAGGACATAGCGTTGGCTACCCGTGATATTGCTGAAGAGCTTGGCGGCGAATGGGCTGACCGCTTCCTCGTGCTTTACGGTATCGCCGCTCCCGATTCGCAGCGCATCGCCTTCTATCGCCTTCTTGACGAGTTCTTCTGAGCGGGACTCTGGGGTTCGAAATGACCGACCAAGCGACGCCCAACCTGCCATCACGAGATTTCGATTCCACCGCCGCCTTCTATGAAAGGTTGGGCTTCGGAATCGTTTTCCGGGACGCCGGCTGGATGATCCTCCAGCGCGGGGATCTCATGCTGGAGTTCTTCGCCCACCCTAGGGGGAGGCTAACTGAAACACGGAAGGAGACAATACCGGAAGGAACCCGCGCTATGACGGCAATAAAAAGACAGAATAAAACGCACGGTGTTGGGTCGTTTGTTCATAAACGCGGGGTTCGGTCCCAGGGCTGGCACTCTGTCGATACCCCACCGAGACCCCATTGGGGCCAATACGCCCGCGTTTCTTCCTTTTCCCCACCCCACCCCCCAAGTTCGGGTGAAGGCCCAGGGCTCGCAGCCAACGTCGGGGCGGCAGGCCCTGCCATAGCCTCAGGTTACTCATATATACTTTAGATTGATTTAAAACTTCATTTTTAATTTAAAAGGATCTAGGTGAAGATCCTTTTTGATAATCTCATGACCAAAATCCCTTAACGTGAGTTTTCGTTCCACTGAGCGTCAGACCCCGTAGAAAAGATCAAAGGATCTTCTTGAGATCCTTTTTTTCTGCGCGTAATCTGCTGCTTGCAAACAAAAAAACCACCGCTACCAGCGGTGGTTTGTTTGCCGGATCAAGAGCTACCAACTCTTTTTCCGAAGGTAACTGGCTTCAGCAGAGCGCAGATACCAAATACTGTTCTTCTAGTGTAGCCGTAGTTAGGCCACCACTTCAAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACCGAACTGAGATACCTACAGCGTGAGCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGCGGCAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCAGGGGGAAACGCCTGGTATCTTTATAGTCCTGTCGGGTTTCGCCACCTCTGACTTGAGCGTCGATTTTTGTGATGCTCGTCAGGGGGGCGGAGCCTATGGAAAAACGCCAGCAACGCGGCCTTTTTACGGTTCCTGGCCTTTTGCTGGCCTTTTGCTCACATGTTCTTTCCTGCGTTATCCCCTGATTCTGTGGATAACCGTATTACCGC'
ref = 'catgcattagttattaatagtaatcaattacggggtcattagttcatagcccATATATGGAGTTCCGCGTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACGGTAAACTGCCCACTTGGCAGTACATCAAGTGTATCATATGCCAAGTACGCCCCCTATTGACGTCAATGACGGTAAATGGCCCGCCTGGCATTATGCCCAGTACATGACCTTATGGGACTTTCCTACTTGGCAGTACATCTACGTATTAGTCATCGCTATTACCATGGTGATGCGGTTTTGGCAGTACATCAATGGGCGTGGATAGCGGTTTGACTCACGGGGATTTCCAAGTCTCCACCCCATTGACGTCAATGGGAGTTTGTTTTGGCACCAAAATCAACGGGACTTTCCAAAATGTCGTAACAACTCCGCCCCATTGACGCAAATGGGCGGTAGGCGTGTACGGTGGGAGGTCTATATAAGCAGAGCTGGTTTAGTGAACCGTCAGATCCGCTAGCGCTACCGGTCGCCACCATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTCCGGACTCAGATCTCGAGCTCAAGCTTCGAATTCTGCAGTCGACTCATTCGGGAGCTGGATGGCTTGGGACATGTGCAGCCAAGACTCTGTATGGAGTGACATAGAGTGTGCTGCTCTGGTTGGTGAGGACCAGCCTCTTTGCCCAGATCTTCCTGAACTTGACCTTTCTGAACTTGATGTGAATGACTTGGATACAGACAGCTTTCTGGGTGGATTGAAGTGGTGTAGCGACCAATCGGAAATCATATCCAACCAGTACAACAATGAGCCTGCGAACATATTTGAGAAGATAGATGAAGAGAATGAGGCAAACTTGCTAGCGGTCCTCACAGAGACACTGGACAGTCTCCCCGTGGATGAAGACGGATTGCCCTCATTTGATGCACTGACAGATGGAGCCGTGACCACTGACAACGAGGCCAGTCCTTCCTCCATGCCTGACGGCACCCCTCCCCCTCAGGAGGCAGAAGAGCCGTCTCTACTTAAGAAGCTCTTACTGGCACCAGCCAACACTCAGCTCAGCTACAATGAATGCAGCGGTCTTAGCACTCAGAACCATGCAGCAAACCACACCCACAGGATCAGAACAAACCCTGCCATTGTTAAGACCGAGAATTCATGGAGCAATAAAGCGAAGAGCATTTGTCAACAGCAAAAGCCACAAAGACGTCCCTGCTCAGAGCTTCTCAAGTATCTGACCACAAACGATGACCCTCCTCACACCAAACCCACAGAAAACAGGAACAGCAGCAGAGACAAATGTGCTTCCAAAAAGAAGTCCCATACACAACCGCAGTCGCAACATGCTCAAGCCAAACCAACAACTTTATCTCTTCCTCTGACCCCAGAGTCACCAAATGACCCCAAGGGTTCCCCATTTGAGAACAAGACTATTGAGCGAACCTTAAGTGTGGAACTCTCTGGAACTGCAGGCCTAACTCCTCCCACAACTCCTCCTCATAAAGCCAACCAAGATAACCCTTTCAAGGCTTCGCCAAAGCTGAAGCCCTCTTGCAAGACCGTGGTGCCACCGCCAACCAAGAGGGCCCGGTACAGTGAGTGTTCTGGTACCCAAGGCAGCCACTCCACCAAGAAAGGGCCCGAGCAATCTGAGTTGTACGCACAACTCAGCAAGTCCTCAGGGCTCAGCCGAGGACACGAGGAAAGGAAGACTAAACGGCCCAGTCTCCGGCTGTTTGGTGACCATGACTACTGTCAGTCACTCAATTCCAAAACGGATATACTCATTAACATATCACAGGAGCTCCAAGACTCTAGACAACTAGACTTCAAAGATGCCTCCTGTGACTGGCAGGGGCACATCTGTTCTTCCACAGATTCAGGCCAGTGCTACCTGAGAGAGACTTTGGAGGCCAGCAAGCAGGTCTCTCCTTGCAGCACCAGAAAACAGCTCCAAGACCAGGAAATCCGAGCGGAGCTGAACAAGCACTTCGGTCATCCCTGTCAAGCTGTGTTTGACGACAAATCAGACAAGACCAGTGAACTAAGGGATGGCGACTTCAGTAATGAACAATTCTCCAAACTACCTGTGTTTATAAATTCAGGACTAGCCATGGATGGCCTATTTGATGACAGTGAAGATGAAAGTGATAAACTGAGCTACCCTTGGGATGGCACGCAGCCCTATTCATTGTTCGATGTGTCGCCTTCTTGCTCTTCCTTTAACTCTCCGTGTCGAGACTCAGTGTCACCACCGAAATCCTTATTTTCTCAAAGACCCCAAAGGATGCGCTCTCGTTCAAGATCCTTTTCTCGACACAGGTCGTGTTCCCGATCACCATATTCCAGGTCAAGATCAAGGTCCCCAGGCAGTAGATCCTCTTCAAGATCCTGTTACTACTATGAATCAAGCCACTACAGACACCGCACACACCGCAATTCTCCCTTGTATGTGAGATCACGTTCAAGGTCACCCTACAGCCGTAGGCCCAGGTACGACAGCTATGAAGCCTATGAGCACGAAAGGCTCAAGAGGGATGAATACCGCAAAGAGCACGAGAAGCGGGAGTCTGAAAGGGCCAAACAGAGAGAGAGGCAGAAGCAGAAAGCAATTGAAGAGCGCCGTGTGATTTACGTTGGTAAAATCAGACCTGACACAACGCGGACAGAATTGAGAGACCGCTTTGAAGTTTTTGGTGAAATTGAGGAATGCACCGTAAATCTGCGGGATGATGGAGACAGCTATGGTTTCATCACCTACCGTTACACCTGTGACGCTTTCGCTGCTCTTGAGAATGGATATACTTTACGCAGGTCGAACGAAACTGACTTCGAGCTGTACTTTTGTGGACGGAAGCAATTTTTCAAGTCTAACTATGCAGACCTAGATACCAACTCAGACGATTTTGACCCTGCTTCCACCAAGAGCAAGTATGACTCTCTGGATTTTGATAGTTTACTGAAGGAAGCTCAGAGAAGCTTGCGCAGGTAACGTGTTCCCAGGCTGAGGGATGACAGGGATCCACCGGATCTAGATAACTGATCATAATCAGCCATACCACATTTGTAGAGGTTTTACTTGCTTTAAAAAACCTCCCACACCTCCCCCTGAACCTGAAACATAAAATGAATGCAATTGTTGTTGTTAACTTGTTTATTGCAGCTTATAATGGTTACAAATAAAGCAATAGCATCACAAATTTCACAAATAAAGCATTTTTTTCACTGCATTCTAGTTGTGGTTTGTCCAAACTCATCAATGTATCTTAACGCGTAAATTGTAAGCGTTAATATTTTGTTAAAATTCGCGTTAAATTTTTGTTAAATCAGCTCATTTTTTAACCAATAGGCCGAAATCGGCAAAATCCCTTATAAATCAAAAGAATAGACCGAGATAGGGTTGAGTGTTGTTCCAGTTTGGAACAAGAGTCCACTATTAAAGAACGTGGACTCCAACGTCAAAGGGCGAAAAACCGTCTATCAGGGCGATGGCCCACTACGTGAACCATCACCCTAATCAAGTTTTTTGGGGTCGAGGTGCCGTAAAGCACTAAATCGGAACCCTAAAGGGAGCCCCCGATTTAGAGCTTGACGGGGAAAGCCGGCGAACGTGGCGAGAAAGGAAGGGAAGAAAGCGAAAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAATGCGCCGCTACAGGGCGCGTCAGGTGGCACTTTTCGGGGAAATGTGCGCGGAACCCCTATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGAGACAATAACCCTGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTCCTGAGGCGGAAAGAACCAGCTGTGGAATGTGTGTCAGTTAGGGTGTGGAAAGTCCCCAGGCTCCCCAGCAGGCAGAAGTATGCAAAGCATGCATCTCAATTAGTCAGCAACCAGGTGTGGAAAGTCCCCAGGCTCCCCAGCAGGCAGAAGTATGCAAAGCATGCATCTCAATTAGTCAGCAACCATAGTCCCGCCCCTAACTCCGCCCATCCCGCCCCTAACTCCGCCCAGTTCCGCCCATTCTCCGCCCCATGGCTGACTAATTTTTTTTATTTATGCAGAGGCCGAGGCCGCCTCGGCCTCTGAGCTATTCCAGAAGTAGTGAGGAGGCTTTTTTGGAGGCCTAGGCTTTTGCAAAGATCGATCAAGAGACAGGATGAGGATCGTTTCGCATGATTGAACAAGATGGATTGCACGCAGGTTCTCCGGCCGCTTGGGTGGAGAGGCTATTCGGCTATGACTGGGCACAACAGACAATCGGCTGCTCTGATGCCGCCGTGTTCCGGCTGTCAGCGCAGGGGCGCCCGGTTCTTTTTGTCAAGACCGACCTGTCCGGTGCCCTGAATGAACTGCAAGACGAGGCAGCGCGGCTATCGTGGCTGGCCACGACGGGCGTTCCTTGCGCAGCTGTGCTCGACGTTGTCACTGAAGCGGGAAGGGACTGGCTGCTATTGGGCGAAGTGCCGGGGCAGGATCTCCTGTCATCTCACCTTGCTCCTGCCGAGAAAGTATCCATCATGGCTGATGCAATGCGGCGGCTGCATACGCTTGATCCGGCTACCTGCCCATTCGACCACCAAGCGAAACATCGCATCGAGCGAGCACGTACTCGGATGGAAGCCGGTCTTGTCGATCAGGATGATCTGGACGAAGAGCATCAGGGGCTCGCGCCAGCCGAACTGTTCGCCAGGCTCAAGGCGAGCATGCCCGACGGCGAGGATCTCGTCGTGACCCATGGCGATGCCTGCTTGCCGAATATCATGGTGGAAAATGGCCGCTTTTCTGGATTCATCGACTGTGGCCGGCTGGGTGTGGCGGACCGCTATCAGGACATAGCGTTGGCTACCCGTGATATTGCTGAAGAGCTTGGCGGCGAATGGGCTGACCGCTTCCTCGTGCTTTACGGTATCGCCGCTCCCGATTCGCAGCGCATCGCCTTCTATCGCCTTCTTGACGAGTTCTTCTGAGCGGGACTCTGGGGTTCGAAATGACCGACCAAGCGACGCCCAACCTGCCATCACGAGATTTCGATTCCACCGCCGCCTTCTATGAAAGGTTGGGCTTCGGAATCGTTTTCCGGGACGCCGGCTGGATGATCCTCCAGCGCGGGGATCTCATGCTGGAGTTCTTCGCCCACCCTAGGGGGAGGCTAACTGAAACACGGAAGGAGACAATACCGGAAGGAACCCGCGCTATGACGGCAATAAAAAGACAGAATAAAACGCACGGTGTTGGGTCGTTTGTTCATAAACGCGGGGTTCGGTCCCAGGGCTGGCACTCTGTCGATACCCCACCGAGACCCCATTGGGGCCAATACGCCCGCGTTTCTTCCTTTTCCCCACCCCACCCCCCAAGTTCGGGTGAAGGCCCAGGGCTCGCAGCCAACGTCGGGGCGGCAGGCCCTGCCATAGCCTCAGGTTACTCATATATACTTTAGATTGATTTAAAACTTCATTTTTAATTTAAAAGGATCTAGGTGAAGATCCTTTTTGATAATCTCATGACCAAAATCCCTTAACGTGAGTTTTCGTTCCACTGAGCGTCAGACCCCGTAGAAAAGATCAAAGGATCTTCTTGAGATCCTTTTTTTCTGCGCGTAATCTGCTGCTTGCAAACAAAAAAACCACCGCTACCAGCGGTGGTTTGTTTGCCGGATCAAGAGCTACCAACTCTTTTTCCGAAGGTAACTGGCTTCAGCAGAGCGCAGATACCAAATACTGTTCTTCTAGTGTAGCCGTAGTTAGGCCACCACTTCAAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACCGAACTGAGATACCTACAGCGTGAGCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGCGGCAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCAGGGGGAAACGCCTGGTATCTTTATAGTCCTGTCGGGTTTCGCCACCTCTGACTTGAGCGTCGATTTTTGTGATGCTCGTCAGGGGGGCGGAGCCTATGGAAAAACGCCAGCAACGCGGCCTTTTTACGGTTCCTGGCCTTTTGCTGGCCTTTTGCTCACATGTTCTTTCCTGCGTTATCCCCTGATTCTGTGGATAACCGTATTACCGC'

bf1 = encode(query)
bf2 = encode(ref)
end = time.time()
print(end - start)
bf1[:10]

0.006459712982177734


array('b', [0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

# New bloom filter functions

In [31]:
gene = query                         
gene2 = ref

K = K # defined above

def fill_bloom(bf, seq, k = K):    
    # Loop through all k-mers for gene.
    seq = seq.upper()
    for n in range(0, len(seq)-k + 1):
        # Get k-mer of length k and hash it.
        k_mer = seq[n:n + k]                   # TODO: ignore case, 'N's?

        # Set entry the bloom filter corresponding to the hashed k-mer to one.
        _ = bf.add(k_mer)

start = time.time()
bf1 = pybloom_live.BloomFilter(capacity = SIZE, error_rate=0.001)
bf2 = pybloom_live.BloomFilter(capacity = SIZE, error_rate=0.001)

fill_bloom(bf1, gene)
fill_bloom(bf2, gene2)
end = time.time()

print(end - start)

print(len(bf1), len(bf2))

bitstring1 = [1 if bit else 0 for bit in bf1.bitarray]
bitstring2 = [1 if bit else 0 for bit in bf2.bitarray]
type(bitstring1), bitstring1[:10]

0.10915589332580566
7065 7065


(list, [1, 0, 0, 0, 1, 0, 1, 1, 0, 1])

In [32]:
ba1 = BitVector( bitlist = bitstring1 )
ba2 = BitVector( bitlist = bitstring2 )
ba1.jaccard_similarity(ba2)

1.0

In [14]:
print(sum([1 for i,x in enumerate(query) if query[i] == ref[i]]))
print(len(ref), len(query), len('catgcattagttattaatagtaatcaattacggggtcattagttcatagccc'))
(7099)/7151

7099
7151 7151 52


0.9927282897496854

In [33]:
from paillier import *
import phe

In [34]:
q2 = 'CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCATAGCCCATATATGGAGTTCCGCGTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACGGTAAACTGCCCACTTGGCAGTACATCAAGTGTATCATATGCCAAGTACGCCCCCTATTGACGTCAATGACGGTAAATGGCCCGCCTGGCATTATGCCCAGTACATGACCTTATGGGACTTTCCTACTTGGCAGTACATCTACGTATTAGTCATCGCTATTACCATGGTGATGCGGTTTTGGCAGTACATCAATGGGCGTGGATAGCGGTTTGACTCACGGGGATTTCCAAGTCTCCACCCCATTGACGTCAATGGGAGTTTGTTTTGGCACCAAAATCAACGGGACTTTCCAAAATGTCGTAACAACTCCGCCCCATTGACGCAAATGGGCGGTAGGCGTGTACGGTGGGAGGTCTATATAAGCAGAGCTGGTTTAGTGAACCGTCAGATCCGCTAGCGCTACCGGTCGCCACCATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGC'
q3 = 'CTAAATTGTAAGCGTTAATATTTTGTTAAAATTCGCGTTAAATTTTTGTTAAATCAGCTCATTTTTTAACCAATAGGCCGAAATCGGCAAAATCCCTTATAAATCAAAAGAATAGACCGAGATAGGGTTGAGTGGCCGCTACAGGGCGCTCCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGTTTCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGAGCGCGACGTAATACGACTCACTATAGGGCGAATTGGCGGAAGGCCGTCAAGGCCACGTGTCTTGTCCAGAGCTCGGATCCGAATTGGCCTCCACGGCCTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCATAGCCCATATATGGAGTTCCGCGTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACGGTAAACTGCCCACTTGGCAGTACATCAAGTGTATCATATGCCAAGTACGCCCCCTATTGACGTCAATGACGGTAAATGGCCCGCCTGGCATTATGCCCAGTACATGACCTTATGGGACTTTCCTACTTGGCAGTACATCTACGTATTAGTCATCGCTATTACCATGGTGATGCGGTTTTGGCAGTACATCAATGGGCGTGGATAGCGGTTTGACTCACGGGGATTTCCAAGTCTCCACCCCATTGACGTCAATGGGAGTTTGTTTTGGCACCAAAATCAACGGGACTTTCCAAAATGTCGTAACAACTCCGCCCCATTGACGCAAATGGGCGGTAGGCGTGTACGGTGGGAGGTCTATATAAGCAGAGCTGGTTTAGTGAACCGTCAGATCCGCTAGCGCTACCGGACTCAGATCTCGAGCTCAAGCTTCGAATTCTGCAGTCGACGGTACCGCGGGCCCGGGATCCATCGCCACCATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAAAGCGGCCGCGACTCTAGATCATAATCAGCCATACCACATTTGTAGAGGTTTTACTTGCTTTAAAAAACCTCCCACACCTCCCCCTGAACCTGAAACATAAAATGAATGCAATTGTTGTTGTTAACTTGTTTATTGCAGCTTATAATGGTTACAAATAAAGCAATAGCATCACAAATTTCACAAATAAAGCATTTTTTTCACTGCATTCTAGTTGTGGTTTGTCCAAACTCATCAATGTATCTTAAGGCGTACTATATCCCTGGAGACGGGCGCCGCTACAGGGCGCGTCCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGAGCGCGCGTAATACGACTCACTATAGGGCGAATTGGGTACCGGGCCCCCCCTCGAGGTCCTCCAGCTTTTGTTCCCTTTAGTGAGGGTTAATTGCGCGCTTGGCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCACCGGTCGTCTCCCTATTGGCCTCCACGGCCTTGTACAACGCGTGGTACCTGGAGCACAAGACTGGCCTCACGGGCCTTCCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAACATGGTCATAGCTGTTTCCTTGCGTATTGGGCGCTCTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGGTAAAGCCTGGGGTGCCTAATGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGAACCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCAC'
len(q2), len(q3)

(838, 4539)

In [35]:
bf1 = pybloom_live.BloomFilter(capacity = SIZE, error_rate=0.001)
bf2 = pybloom_live.BloomFilter(capacity = SIZE, error_rate=0.001)

fill_bloom(bf1, q2)
fill_bloom(bf2, q3)

bitstring1 = [1 if bit else 0 for bit in bf1.bitarray]
bitstring2 = [1 if bit else 0 for bit in bf2.bitarray]

In [36]:
p = Paillier(1024)
start = time.time()
#q1 = [p.Enc(x) for x in bitstring1]
#q2 = [p.Enc(x) for x in bitstring2]
end = time.time()
duration = end - start
print(duration)

2.574920654296875e-05


In [11]:
a = 111
pubkey, privkey = Paillier.generateKeys(1024)
A = Paillier.encrypt(a, pubkey)
b = Paillier.decrypt(A, privkey)
A == b

False

In [12]:
score_enc = np.dot(q2, q1)

score_dec = p.Dec(score_enc)

query_mag = np.sum(bitstring1)
result_mag = np.sum(bitstring2)

print(score_dec, query_mag, result_mag)

union = (query_mag + result_mag) - score_dec

print(score_dec/union)

-2.68027859873373859603e308 7981 38357
-1.0


In [None]:
ba1 = BitVector( bitlist = bitstring1 )
ba2 = BitVector( bitlist = bitstring2 )
ba1.jaccard_similarity(ba2)

0.2029282728901119

In [38]:
dotproduct(bitstring1, bitstring2), dotproduct(ba1, ba2)

(7817, 57917)

In [39]:
np.dot(bitstring1, bitstring2), np.dot(ba1, ba2)

(7817, 57917)