# Set up the environment

In [None]:
"""This is the primary code document for the execution of this workflow.
Queries the database for specific genes."""

# Load our packages for the environment
import sys, os
import time
from joblib import Parallel, delayed
from phe import paillier
from p_bloom_filter import encode
from p_database import search, magnitude
from optimize_invert import invert
from Bio import SeqIO
import random
from random import randint
import numpy as np

# Replace phe's paillier.invert with the implementation imported from optimize_invert
paillier.invert = invert
num_cores = 48 # Number of cores for parallel processing (used as joblib's n_jobs in query())
query_len = 100 # Query length in characters: compare() truncates to it, aug_query() mutates the first half of it

# Locate the data

In [None]:
# Root folder of the genome data; only referenced by the commented-out line below
base = '/SEAL/local_data/atitus/data/genomes/' 
# Presumably a prepared query file (name suggests 100 bases) - note that `f` is
# reassigned in the file-selection cell, so this value is not the one that gets read
f = '/SEAL/local_data/atitus/data/genomes/ecoli_queries/query100.fasta'
#d = os.path.join(base, 'ecoli/')
# Directory whose files are listed for query selection and which is handed to
# compare() as the database location
d = '/SEAL/local_data/atitus/data/bacterial_genomes/bacteria'

In [12]:
# Fix the seed so the same file index is drawn on every run.
# NOTE: os.listdir returns entries in arbitrary order, so the seed alone does
# not guarantee the same *file* on a different machine or filesystem.
random.seed(2)

files = os.listdir(d)
num_files = len(files)
print('Number of files: %s' % num_files)

if not files:
    raise FileNotFoundError('No files found in "%s"' % d)

# Draw from the first 101 directory entries (as before), but never index past
# the end of the listing when the directory holds fewer files than that.
query_file_ind = randint(0, min(100, num_files - 1))

f = files[query_file_ind]

print('Selected "' + f + '" from file number %s' % str(query_file_ind))

# From here on `f` is the full path of the selected file
f = os.path.join(d, f)
print(f)

Number of files: 75958
Selected "GCF_002407125.1_ASM240712v1_genomic.fasta" from file number 7
/SEAL/local_data/atitus/data/bacterial_genomes/bacteria/GCF_002407125.1_ASM240712v1_genomic.fasta


# Read the FASTA sequence file

In [13]:
# Concatenate the sequences of every record in the FASTA file into one string.
# A single join avoids rebuilding the growing string once per record.
with open(f, "r") as handle:
    seq = ''.join(str(record.seq) for record in SeqIO.parse(handle, "fasta"))

# Preview the first 1000 characters
print(seq[:1000])

AAATGCAGATTGTTGCGCAAAAAGAAAATTTAAGTAAAGATTTTTTGCTTGAAAATATTGCTTGCGGTAAAATTATTATCCCTGCAAATATTAATCATAAAAGTCTTGATCCAAATGGTATAGGTTTTGGACTGCGTACTAAGGTTAATGTAAATTTAGGAGTTTCAAATGATTGCGTTGATTATAGCGAGGAAATGAAAAAAGTAGAGCTTGCTCATAAATTTGGCATAGAAGCGATTATGGACTTAAGCAATTATGGCAAAACAAGTCGTTTTAGAGACGAGCTTGTTAATGTTTCAAAAGCTATGATAGGAACAGTTCCTGTATATGATGCGGTAGGATTTTTAGAAAAAGATTTAAAGCAAATTAATGCCAAAGACTTTTTAGATGTTGTATATCATCATGCTAAAAGTGGGGTTGATTTTATGACAATTCATGCAGGTATTAATTCTCGCGCAGCGCATATTTTTAAACAAAGTAAAAGACTTACAAATATAGTTTCAAGAGGAGGCTCTGTACTTTATGCTTGGATGATGATGAAAGATGCTGAAAATCCTTTTTTTGAGTATTATGATGATTTGCTTGACATTTGTTTAAAATATGATGTAACTTTATCTTTGGGCGATGCTTTGCGTCCTGGCTCTACTCACGATGCAAGTGATGGGGCACAAATTTCAGAACTTATAGAATTATCACTCCTTACTCAAAGGGCTTGGGATGTTGGGATTCAAGTAATGATAGAAGGTCCAGGGCATATGGCTATTAATGAAATAGAGGCAAATATGCAATTAGAAAAGCGTTTATGTAAAGGAGCGCCTTTTTATGTCTTAGGACCTTTGGTAATAGATATTGGCGCAGGGTATGATCATATTAGTGGTGCTATTGGAGGAGCTGTGGCTGCAGCTAGCGGTGCTGATATGCTGTGTTATGTAACACCTGCTGAACACTTAAGACTTCCAAATTTAGAAGATGTTAGAGAGGGTATAGTTGCGACTAAAAT

In [14]:
# Display the first 100 characters of the loaded sequence
seq[:100]

'AAATGCAGATTGTTGCGCAAAAAGAAAATTTAAGTAAAGATTTTTTGCTTGAAAATATTGCTTGCGGTAAAATTATTATCCCTGCAAATATTAATCATAA'

# Vary the % of sequence mutation and calculate the max IoU
- The mutation probability is swept from 0% to 100% in 5% steps (only the first half of the query is eligible for mutation)
- K = 50
- SIZE = 100,000
- SeqSize = 20,000

In [None]:
# Sweep the mutation probability from 0.0 to 1.0 in steps of 0.05 and record
# the best database match for each mutated query.
# Each entry of `out` is compare()'s result list with the probability appended.
out = []
for prob in np.arange(0.0, 1.04, 0.05):
    print(prob)
    target_seq = aug_query(seq, prob)
    result = compare(target_seq, d)
    result.append(prob)
    out.append(result)
print('Done')

# Pull out the IoUs and whether the max prediction matches the query

In [None]:
# 1 when the best-matching sequence equals the start of the original sequence, else 0
correct = [1 if row[4] == seq[:1000] else 0 for row in out]
# Best IoU found for each mutation probability
IoUs = [row[0] for row in out]
# Per run: match flag, the three overlap scores, and the mutation probability
for flag, row in zip(correct, out):
    print(flag, row[:3], row[5])

# Run query on full dataset

In [None]:
# Time one end-to-end query of the loaded sequence against the database in `d`
start = time.time()
result = compare(seq, d)
end = time.time() - start  # despite the name, this holds the elapsed time in seconds
print(end/60)  # elapsed time in minutes
print(result)
# result[3] is the query after compare() truncated it to query_len; result[4] is
# the best-matching sequence cut to 1000 characters.
# NOTE(review): the mutation analysis compares the best match against seq[:1000]
# instead - confirm which comparison is intended here.
print(result[3] == result[4])


Found 75958 entries in database

Using 50 entries from database



# Code for analysis

In [15]:
####################
# Augment query to test mutation effect on performance
####################
def aug_query(input_seq, mut_prob):
    """Randomly mask positions in the leading part of a sequence with 'X'.

    One uniform random number in [0, 1] is drawn for each of the first
    ``int(query_len / 2)`` positions. A position keeps its character when its
    draw is strictly greater than ``mut_prob`` and is replaced by 'X'
    otherwise. Everything after those positions is appended unchanged.

    Args:
        input_seq: The sequence (string) to mutate.
        mut_prob: Per-position mutation probability; 0 leaves the sequence
            (almost surely) untouched, 1 masks every eligible position.

    Returns:
        The mutated sequence, upper-cased, with the same length as
        ``input_seq``.
    """
    n_candidates = int(query_len / 2)

    # Always draw the same number of values so the random stream does not
    # depend on the length of the input.
    draws = [random.uniform(0, 1) for _ in range(n_candidates)]

    # zip stops at the shorter operand, so an input shorter than the eligible
    # window is handled instead of raising IndexError.
    mutated = ''.join(
        base if draw > mut_prob else 'X'
        for base, draw in zip(input_seq, draws)
    )

    return (mutated + input_seq[n_candidates:]).upper()


####################
# Main function to run pipeline
####################
def compare(seq, d, dev = False):
    """Run one encrypted query against the database and summarise the best hit.

    A new Paillier key pair is generated for every call. The sequence is cut
    to the module-level ``query_len`` before it is handed to ``query``.

    Args:
        seq: The sequence (string) to search for.
        d: Database location, forwarded to ``query`` as ``data_dir``.
        dev: Forwarded unchanged to ``query``.

    Returns:
        A list ``[max_iou, max_ioLquery, max_ioLresult, query_seq, best_seq]``
        where ``query_seq`` is the truncated query and ``best_seq`` the best
        match, each limited to 1000 characters.
    """
    # Fresh key pair for this comparison
    public_key, private_key = paillier.generate_paillier_keypair()

    truncated = seq[:query_len]

    # The magnitude of the best match is returned by query() but not reported
    best_iou, best_ioLquery, best_ioLresult, best_seq, _ = query(
        truncated, public_key, private_key, dev = dev, data_dir = d)

    return [best_iou, best_ioLquery, best_ioLresult, truncated[:1000], best_seq[:1000]]


        
####################
# Query a database with a query and public key and decrypt using a private key
####################
def query(query, public_key, private_key, dev, data_dir):
    """Encode and encrypt a query, search the database, and pick the best match.

    The query is encoded with ``encode``, its magnitude is taken with
    ``magnitude``, and every element of the encoding is encrypted with the
    public key in parallel. ``search`` is then run on the encrypted query and
    each returned entry is decrypted and scored by ``calc_iou`` in parallel.

    Args:
        query: A genetic sequence (string) to be searched for.
        public_key: The public key for the paillier encryption.
        private_key: The private key for the paillier encryption.
        dev: Not used by this function; accepted for caller compatibility.
        data_dir: Database location, forwarded to ``search``.

    Returns:
        A tuple ``(max_iou, max_ioLquery, max_ioLresult, best_seq, result_mag)``
        describing the highest-IoU entry: its IoU, the intersection divided by
        the query magnitude, the intersection divided by the entry magnitude,
        the entry's sequence, and the entry's magnitude. When the search
        returns no entries the tuple is ``(0, 0, 0, '', 0)``.
    """
    query = encode(query)
    query_mag = magnitude(query)

    query = Parallel(n_jobs=num_cores)(delayed(public_key.encrypt)(x) for x in query)

    scores = search(query, data_dir = data_dir)

    result_scores = Parallel(n_jobs=num_cores)(delayed(calc_iou)(id_, private_key, query_mag) for id_ in scores)

    # Defaults for every returned value, so an empty result set yields a
    # well-defined answer instead of an UnboundLocalError at the return.
    max_iou = 0
    max_ioLquery = 0
    max_ioLresult = 0
    best_seq = ''
    result_mag = 0

    for score_set in result_scores:
        # ">=" means that among equal IoUs the last entry wins
        if score_set[0] >= max_iou:
            max_iou, max_ioLquery, max_ioLresult, best_seq, result_mag = score_set

    return max_iou, max_ioLquery, max_ioLresult, best_seq, result_mag


####################
# Decrypt one search result and calculate its Intersection over Union
####################
def calc_iou(id_, private_key, query_mag):
    """Decrypt one search result and score it against the query.

    Args:
        id_: A search result, indexed as ``id_[0]`` (encrypted intersection),
            ``id_[1]`` (magnitude of the database entry) and ``id_[2]``
            (value returned as the matching sequence).
        private_key: Private key used to decrypt the intersection.
        query_mag: Magnitude of the encoded query.

    Returns:
        A tuple ``(iou, intersection/query_mag, intersection/data_mag,
        id_[2], id_[1])``.
    """
    overlap = private_key.decrypt(id_[0])
    data_mag = id_[1]
    score, query_cover, result_cover = iou(overlap, data_mag, query_mag)

    return (score, query_cover, result_cover, id_[2], data_mag)


####################
# Calculate the Intersection over Union
####################
def iou(intersection, data_mag, query_mag):
    """Compute overlap ratios from an intersection and two magnitudes.

    Args:
        intersection: The size of the intersection of the two genes.
        data_mag: The magnitude of the gene being compared to.
        query_mag: The magnitude of the gene being searched for.

    Returns:
        A tuple of three ratios:
            - intersection over union, with
              ``union = data_mag + query_mag - intersection``
            - intersection over ``query_mag``
            - intersection over ``data_mag``
        A ratio whose denominator is zero is reported as ``0.0`` instead of
        raising ZeroDivisionError.
    """
    union = (data_mag + query_mag) - intersection

    def _ratio(denominator):
        # An empty denominator means there is nothing to overlap with
        return intersection / denominator if denominator else 0.0

    return _ratio(union), _ratio(query_mag), _ratio(data_mag)