# Shared Code

In [2]:
from __future__ import print_function
from __future__ import division

In [3]:
MARGIN = 200

In [4]:
import requests
def get_seqs(chrom, pos, ref, alt, margin):
    """Obtain reference and alternate sequences 
    from Ensembl.
    
    Returns (ref_seq, alt_seq) tuple
    """
    # Calculate start and end positions
    start = pos - margin
    end = pos + margin
    # Construct the URL for the REST query
    server = "http://grch37.rest.ensembl.org/"
    ext = "/sequence/region/human/{}:{}..{}:1?".format(chrom, start, end)
    # Send the HTTP request
    r = requests.get(server+ext, headers={ "Content-Type" : "text/plain"})
    # Extract reference sequence
    ref_seq = str(r.text)
    # Strip away any gaps when calculating length
    ref_len = len(ref.strip("-"))
    alt_len = len(alt.strip("-"))
    # Categorize the variant
    if ref_len < alt_len:  # Insertion
        prefix = ref_seq[:margin+1]
        suffix = ref_seq[margin+1:]
        alt_seq = prefix + alt + suffix
    elif ref_len > alt_len:  # Deletion
        prefix = ref_seq[:margin]
        suffix = ref_seq[margin+len(ref):]
        alt_seq = prefix + suffix
    else:  # SNP
        prefix = ref_seq[:margin]
        suffix = ref_seq[margin+1:]
        alt_seq = prefix + alt + suffix
    return ref_seq, alt_seq

# K-mer Approach

In [5]:
# Some constants
K = 10
IVAL = 2
MAX_ID = 2

In [6]:
def rev_comp(seq):
    """Return reverse complement"""
    cbases = {"A": "T",
              "T": "A",
              "G": "C",
              "C": "G",
              "N": "N"}
    comp = ""
    for base in seq[::-1]:
        comp += cbases[base]
    return comp

In [7]:
def kmer_iter(text, k, step, ival):
    """Iterate over k-mers using the same 
    subsequence pattern.
    
    Returns generator.
    """
    num_kmers = (len(text) - k * ival)//step + 1
    for i in range(num_kmers):
        kmer = text[i*step:i*step+k*ival:ival]
        yield kmer

In [8]:
def get_kmer_set(text, k, step, ival):
    """Generate set of k-mers from a given text
    and its reverse complement.
    
    Returns set.
    """
    kmers = set()
    for kmer in kmer_iter(text, k, step, ival):
        kmers.add(kmer)
    comp = rev_comp(text)
    for kmer in kmer_iter(comp, k, step, ival):
        kmers.add(kmer)
    return kmers

In [9]:
def calc_score(text, kmer_set, k, step, ival):
    """Returns score for k-mers present
    in the given k-mer set.
    
    Returns the count/score.
    """
    kmer_count = 0
    num_kmers = (len(text) - k)//step + 1
    for kmer in kmer_iter(text, k, step, ival):
        if kmer in kmer_set:
            kmer_count += 1
    return kmer_count

In [10]:
indels = {}
headers = ["id", "chrom", "start", "end", "ref", "alt", "ref_count", "alt_count", "vaf"]
with open("indels.txt") as infile:
    for line in infile:
        # Parse line
        indel = dict(zip(headers, line.rstrip("\n").split("\t")))
        id_num = int(indel["id"])
        # Obtain sequences
        ref_seq, alt_seq = get_seqs(indel["chrom"], int(indel["start"]), indel["ref"], indel["alt"], margin=MARGIN)
        indel["ref_seq"], indel["alt_seq"] = ref_seq, alt_seq
        # Generate k-mers from sequences
        indel["ref_kmers"] = get_kmer_set(ref_seq, k=K, step=1, ival=IVAL)
        indel["alt_kmers"] = get_kmer_set(alt_seq, k=K, step=1, ival=IVAL)
        # Store them for later
        indels[id_num] = indel
        # Limit number of indels for now
        if id_num >= MAX_ID:
            break

In [21]:
for i in range(1, 2000):
    # Iterate over reads
    temp = "reads/reads_{}.txt"
    with open(temp.format(i)) as reads:
        ref_count = 0
        alt_count = 0
        amb_count = 0
        indel = indels[i]
        ref_kmers, alt_kmers = indel["ref_kmers"], indel["alt_kmers"]
        for read in reads:
            read = read.rstrip("\n")
            ref_score = calc_score(read, ref_kmers, k=K, step=2, ival=IVAL)
            alt_score = calc_score(read, alt_kmers, k=K, step=2, ival=IVAL)
            if ref_score > alt_score:
                ref_count += 1
            elif ref_score < alt_score:
                alt_count += 1
            else:
                amb_count += 1
        vaf = round(alt_count/(alt_count + ref_count), 2)
        output = """
        ref_count    before \t{}
                     after  \t{}
        alt_count    before \t{}
                     after  \t{}
        vaf          before \t{}
                     after  \t{}
        amb_count    before \tN/A
                     after  \t{}
        """.format(indel["ref_count"], ref_count, indel["alt_count"], alt_count, indel["vaf"], vaf, amb_count)
        print(output)
    if i >= MAX_ID:
            break


        ref_count    before 	13
                     after  	15
        alt_count    before 	7
                     after  	15
        vaf          before 	0.35
                     after  	0.5
        amb_count    before 	N/A
                     after  	13
        

        ref_count    before 	36
                     after  	33
        alt_count    before 	0
                     after  	0
        vaf          before 	0
                     after  	0.0
        amb_count    before 	N/A
                     after  	9
        


# Local Alignment Approach

In [22]:
alphabet = ['A', 'C', 'G', 'T']
score = [[0, 4, 2, 4, 8],
         [4, 0, 4, 2, 8],
         [2, 4, 0, 4, 8],
         [4, 2, 4, 0, 8],
         [8, 8, 8, 8, 8]]

In [23]:
import numpy as np
def local_aln_score(t, p):

    # Create distance matrix
    D = np.zeros((len(p)+1,len(t)+1), dtype=np.int)
    
    # Initialize first row
    for i in range(1, len(t)+1):
        D[0,i] = 0
    
    # Initialize first column
    for i in range(1, len(p)+1):
        D[i,0] = D[i-1,0] + score[alphabet.index(p[i-1])][-1]
        
    # Fill rest of the matrix
    for i in range(1, len(p)+1):
        for j in range(1, len(t)+1):
            distHor = D[i,j-1] + score[-1][alphabet.index(t[j-1])]
            distVer = D[i-1,j] + score[alphabet.index(p[i-1])][-1]
            distDiag = D[i-1,j-1] + score[alphabet.index(p[i-1])][alphabet.index(t[j-1])]
            D[i][j] = min(distHor, distVer, distDiag)
    
    # Return min of bottom row
    return min(D[-1])

In [24]:
for i in range(1, 2000):
    # Iterate over reads
    temp = "reads/reads_{}.txt"
    with open(temp.format(i)) as reads:
        ref_count = 0
        alt_count = 0
        amb_count = 0
        indel = indels[i]
        ref_kmers, alt_kmers = indel["ref_kmers"], indel["alt_kmers"]
        for read in reads:
            read = read.rstrip("\n")
            ref_score = local_aln_score(ref_seq, read)
            alt_score = local_aln_score(alt_seq, read)
            if ref_score > alt_score:
                ref_count += 1
            elif ref_score < alt_score:
                alt_count += 1
            else:
                amb_count += 1
        vaf = round(alt_count/(alt_count + ref_count), 2)
        output = """
        ref_count    before \t{}
                     after  \t{}
        alt_count    before \t{}
                     after  \t{}
        vaf          before \t{}
                     after  \t{}
        amb_count    before \tN/A
                     after  \t{}
        """.format(indel["ref_count"], ref_count, indel["alt_count"], alt_count, indel["vaf"], vaf, amb_count)
        print(output)
    if i >= MAX_ID:
            break


        ref_count    before 	13
                     after  	11
        alt_count    before 	7
                     after  	5
        vaf          before 	0.35
                     after  	0.31
        amb_count    before 	N/A
                     after  	27
        

        ref_count    before 	36
                     after  	0
        alt_count    before 	0
                     after  	33
        vaf          before 	0
                     after  	1.0
        amb_count    before 	N/A
                     after  	9
        


# Hybrid Approach

In [25]:
from collections import defaultdict
def create_kmer_idx(text, k, step, ival):
    """Generate a k-mer index from a given text
    and its reverse complement.
    
    Returns index.
    """
    kmer_idx = defaultdict(set)
    for offset, kmer in enumerate(kmer_iter(text, k, step, ival)):
        kmer_idx[kmer].add(offset)
    return kmer_idx

In [26]:
def find_offset(p, kmer_idx, k, step, ival, min_support=2):
    """Find offset of pattern p in k-mer index.
    
    Returns offset as int.
    """
    offset_support = defaultdict(int)
    for kmer in kmer_iter(p, k, step, ival):
        offsets = kmer_idx[kmer] + kmer_idx[rev_comp(kmer)]
        for offset in offsets:
            offset_support[offset] += 1
        print(offset_support)
        if any(map(lambda x: x > min_support, offset_support.values())):
            max_support = max(offset_support.values())
            if offset_support.values().count(max_support) > 1:
                continue
            else:
                idx = offset_support.values().index(max_support)
                return offset_support[idx]

In [None]:
for i in range(1, 2000):
    # Iterate over reads
    temp = "reads/reads_{}.txt"
    with open(temp.format(i)) as reads:
        ref_count = 0
        alt_count = 0
        amb_count = 0
        indel = indels[i]
        ref_kmers, alt_kmers = indel["ref_kmers"], indel["alt_kmers"]
        for read in reads:
            read = read.rstrip("\n")
            ref_score = calc_score(read, ref_kmers, k=K, step=2, ival=IVAL)
            alt_score = calc_score(read, alt_kmers, k=K, step=2, ival=IVAL)
            if ref_score > alt_score:
                ref_count += 1
            elif ref_score < alt_score:
                alt_count += 1
            else:
                amb_count += 1
        vaf = round(alt_count/(alt_count + ref_count), 2)
        output = """
        ref_count    before \t{}
                     after  \t{}
        alt_count    before \t{}
                     after  \t{}
        vaf          before \t{}
                     after  \t{}
        amb_count    before \tN/A
                     after  \t{}
        """.format(indel["ref_count"], ref_count, indel["alt_count"], alt_count, indel["vaf"], vaf, amb_count)
        print(output)
    if i >= MAX_ID:
            break