# Setting some variables

**According to Strelka**

```
ref_count = 25
alt_count = 8
vaf = 8 / (8 + 25) = 0.24
```

In [258]:
from __future__ import print_function
from __future__ import division

In [259]:
ref_seq = "CTAAGCTTCATATATGAAAGAAAGATACAGTATTTTTCAGAAAAACAAATGCTGAAACAATTCACACTATCAAGCCACCACTACAAGAATTGCTAAAAGGAGTTCTAAATCTTGAAAAAAATCCTGGAAACACATCAAAACAGATCCTCTTTAAGCATAAATCTTACAGGACCTATAAAACAAAAATACAATTACAAAACAAAAAATCAAGGTATACAGCCAACAAATAGCATGATGAATGAAATGGTACGTCACATCTCAATACCAGTGTTGAATGTAAATGGTATAAATGCTCCACTTAAAAGACACAGAATTGTAGAATGGATAAGAATTCGCCAAACAACTATCTGCTGCCTTCAAGAGATTCACCTAACACATAAGGACTCACACAAACTTAAGGTAAAGGGGTAGAAAGAGACATTTCATGCAAATGGACACCAAAACCAAGTAGGAGTAGCTATCCTTATATCAGACAAAACAACCTTTAAAGCAAAAACAGTTAAAAATACAAAAAGGAAAATTATATGATGATAAAAGGTATTATCCAACAGGAAAATATCACAATCCTAAACATATATGCACCTATAACGCTGGGGCTCACAAATTTATAAAACAATTACTATTAGATCTAAGAAATGAAAGAGACAGCAACACAATAATAGTGGGGGACTTCAGTACTACACTAAGAGCACAAGACAGGTCATCAAGACAGAAAGTCAACAAAGAAACAATGAATTTAAACTATAGCCTGAAACAAATGGACTTAATGTATATGCAAAGAACATTCCATCCAACAACCACAGAATATACATGCTATTCAACAGCACATGAAACTTTCTCCAAGCTAGACCATATGATAAACCACAAAAGAAGCCACAGTAAAATTAAGAAAATTGAAATTATATCAAGCACTCTCTCAGACCACAGTGGAATAAAACTGCAAATCAACTCTAAAAGGAACCCTCAAAATCATGCAAATACAGGATAAATAACCTGCTCCT"
alt_seq = "CTAAGCTTCATATATGAAAGAAAGATACAGTATTTTTCAGAAAAACAAATGCTGAAACAATTCACACTATCAAGCCACCACTACAAGAATTGCTAAAAGGAGTTCTAAATCTTGAAAAAAATCCTGGAAACACATCAAAACAGATCCTCTTTAAGCATAAATCTTACAGGACCTATAAAACAAAAATACAATTACAAAACAAAAAATCAAGGTATACAGCCAACAAATAGCATGATGAATGAAATGGTACGTCACATCTCAATACCAGTGTTGAATGTAAATGGTATAAATGCTCCACTTAAAAGACACAGAATTGTAGAATGGATAAGAATTCGCCAAACAACTATCTGCTGCCTTCAAGAGATTCACCTAACACATAAGGACTCACACAAACTTAAGGTAAAGGGGTAGAAAGAGACATTTCATGCAAATGGACACCAAAACCAAGTAGGAGTAGCTATCCTTATATCAGACAAAACAACCTTTAAAGCAAAAACAGTTAAAAAATACAAAAAGGAAAATTATATGATGATAAAAGGTATTATCCAACAGGAAAATATCACAATCCTAAACATATATGCACCTATAACGCTGGGGCTCACAAATTTATAAAACAATTACTATTAGATCTAAGAAATGAAAGAGACAGCAACACAATAATAGTGGGGGACTTCAGTACTACACTAAGAGCACAAGACAGGTCATCAAGACAGAAAGTCAACAAAGAAACAATGAATTTAAACTATAGCCTGAAACAAATGGACTTAATGTATATGCAAAGAACATTCCATCCAACAACCACAGAATATACATGCTATTCAACAGCACATGAAACTTTCTCCAAGCTAGACCATATGATAAACCACAAAAGAAGCCACAGTAAAATTAAGAAAATTGAAATTATATCAAGCACTCTCTCAGACCACAGTGGAATAAAACTGCAAATCAACTCTAAAAGGAACCCTCAAAATCATGCAAATACAGGATAAATAACCTGCTCCT"

In [260]:
infile = "reads.txt"

In [30]:
import requests
def get_seqs(chrom, pos, ref, alt, margin):
    """Obtain reference and alternate sequences 
    from Ensembl
    """
    # Calculate start and end positions
    start = pos - margin
    end = pos + margin
    server = "http://grch37.rest.ensembl.org/"
    ext = "/sequence/region/human/{}:{}..{}:1?".format(chrom, start, end)
    r = requests.get(server+ext, headers={ "Content-Type" : "text/plain"})
    ref_seq = r.text
    alt_seq = ""
    ref_len = len(ref.strip("-"))
    alt_len = len(alt.strip("-"))
    # Insertion
    if ref_len < alt_len:
        prefix = ref_seq[:margin+1]
        suffix = ref_seq[margin+1:]
        alt_seq = prefix + alt + suffix
    # Deletion
    elif ref_len > alt_len:
        prefix = ref_seq[:margin]
        suffix = ref_seq[margin+len(ref):]
        alt_seq = prefix + suffix
    # SNP
    else:
        prefix = ref_seq[:margin]
        suffix = ref_seq[margin+1:]
        alt_seq = prefix + alt + suffix
    return ref_seq, alt_seq

# K-mer approach

In [261]:
K = 10
IVAL = 2

In [262]:
def kmer_iter(text, k, step, ival):
    num_kmers = (len(text) - k * ival)//step + 1
    for i in range(num_kmers):
        kmer = text[i*step:i*step+k*ival:ival]
        yield kmer

In [263]:
def get_kmer_set(text, k, step, ival):
    kmers = set()
    for kmer in kmer_iter(text, k, step, ival):
        kmers.add(kmer)
    return kmers

In [264]:
ref_kmers = get_kmer_set(ref_seq, k=K, step=1, ival=IVAL)
alt_kmers = get_kmer_set(alt_seq, k=K, step=1, ival=IVAL)

In [265]:
from collections import defaultdict
def kmer_count(text, k, step, ival):
    kmer_counts = defaultdict(int)
    num_kmers = (len(text) - k)//step + 1
    for kmer in kmer_iter(text, k, step, ival):
        kmer_counts[kmer] += 1
    return kmer_counts

In [266]:
def calc_score(kmer_counts, kmer_set):
    common = set(kmer_counts.keys()).intersection(kmer_set)
    counts = [kmer_counts[kmer] for kmer in common]
    score = sum(counts)
    return score

In [271]:
%%time
with open(infile) as reads:
    ref_count = 0
    alt_count = 0
    amb_count = 0  # Ambiguous count
    for read in reads:
        kmer_counts = kmer_count(read, k=K, step=1, ival=IVAL)
        score_ref = calc_score(kmer_counts, ref_kmers)
        score_alt = calc_score(kmer_counts, alt_kmers)
        if score_ref > score_alt:
            ref_count += 1
        elif score_ref < score_alt:
            alt_count += 1
        else:
            amb_count += 1
vaf = round(alt_count/(alt_count + ref_count), 2)
print(ref_count, alt_count, amb_count, vaf)

29 8 4 0.22
CPU times: user 5.18 ms, sys: 2.25 ms, total: 7.43 ms
Wall time: 5.58 ms


# Local alignment approach

In [272]:
alphabet = ['A', 'C', 'G', 'T']
score = [[0, 4, 2, 4, 8],
         [4, 0, 4, 2, 8],
         [2, 4, 0, 4, 8],
         [4, 2, 4, 0, 8],
         [8, 8, 8, 8, 8]]

In [273]:
import numpy as np
def local_aln_score(t, p):

    # Create distance matrix
    D = np.zeros((len(p)+1,len(t)+1), dtype=np.int)
    
    # Initialize first row
    for i in range(1, len(t)+1):
        D[0,i] = 0
    
    # Initialize first column
    for i in range(1, len(p)+1):
        D[i,0] = D[i-1,0] + score[alphabet.index(p[i-1])][-1]
        
    # Fill rest of the matrix
    for i in range(1, len(p)+1):
        for j in range(1, len(t)+1):
            distHor = D[i,j-1] + score[-1][alphabet.index(t[j-1])]
            distVer = D[i-1,j] + score[alphabet.index(p[i-1])][-1]
            distDiag = D[i-1,j-1] + score[alphabet.index(p[i-1])][alphabet.index(t[j-1])]
            D[i][j] = min(distHor, distVer, distDiag)
    
    # Return min of bottom row
    return min(D[-1])

In [275]:
%%time
with open(infile) as reads:
    ref_count = 0
    alt_count = 0
    amb_count = 0  # Ambiguous count
    for read in reads:
        read = read.rstrip()
        score_ref = local_aln_score(ref_seq, read)
        score_alt = local_aln_score(alt_seq, read)
        if score_ref > score_alt:
            ref_count += 1
        elif score_ref < score_alt:
            alt_count += 1
        else:
            amb_count += 1
vaf = round(alt_count/(alt_count + ref_count), 2)
print(ref_count, alt_count, amb_count, vaf)

9 29 3 0.76
CPU times: user 31.2 s, sys: 854 ms, total: 32.1 s
Wall time: 31.4 s
