# Setting some variables

In [193]:
from __future__ import print_function
from __future__ import division

In [194]:
ref_seq = "CTTACAATAGAGCTAAGGTAGTACTTTACCAAAAATGGTTTTGATTTAAGGTAAAATCATACCTCATCAACTTGAGTTTTATTTACCTAAGGACCAAAAGAAGAAAAAGCCTGGTTACAAATAGTTTCTTCTATACCCAGGTCAGTGGTTCTTAAAAGTGGTTTTAGTACCCACAGCATCAGCATAATTTGTGAATGTGTTAGGAAGTCAAATTCGTGGGTCCCACCCCGACCTAAACGGAATCAGAATCTCTGGGGATAGGGCCCAGAAACGAGTTCTCTGTTGATTGTCATGAACTTTAAAGGTTGTGAACCACTGACTTAAACAATTGAGGAACATTAAGGAACTTAACAGTATATTCTCTCAATAAAATGTTTGATTATTCTTACTAGCCCCTTAGGATAGTTACCAAAACAAAACTACTAATTTATACTCTTTTAAAACTACAAGTTGCCTGTGTTATGTAGGAGAGATATAAGATTGTGAAATTTTCCAAAAATACCTGCCACACCCCTAACAGACAGGAGATGATAAGAGTAAAAGAGAGAATTGGGAGGCTTTAGAAGCCCTATTGTAATATTTCTTAAAACACATTATTTATTGCTAAATATATTTAGTGTGAGCCATGTAAAAATTGCCATTTTTGGAGGTCAAAAACCGTTGAATATCAGCAATTTAGTATGGTTCAATTTATGTGGTTTCTAATCAAATGTACAAAGTACAATAATTTGGTTTCATACTTGATTTTTTATTTCTTAGGCTGAGTCATTAAAAGTTGATCAGTGATATATGGCAAAATTATTTAAAGTGCTTGAGGAATTGGGGAGACTTGCTGCTTATGTGATCAGATATACAAAAATAAAGTTTTTCATTTCATTAAATATTTGATTAAAACTGCTAAGTTTGAAAATGTTTACCAAGAAATAGTTTTCAGAAATTTTGCTAACTTGATAAAGGCTTTTGAGCAATATCTTTCAGACTTTTTACTAATTAAATAGGGA"
alt_seq = "CTTACAATAGAGCTAAGGTAGTACTTTACCAAAAATGGTTTTGATTTAAGGTAAAATCATACCTCATCAACTTGAGTTTTATTTACCTAAGGACCAAAAGAAGAAAAAGCCTGGTTACAAATAGTTTCTTCTATACCCAGGTCAGTGGTTCTTAAAAGTGGTTTTAGTACCCACAGCATCAGCATAATTTGTGAATGTGTTAGGAAGTCAAATTCGTGGGTCCCACCCCGACCTAAACGGAATCAGAATCTCTGGGGATAGGGCCCAGAAACGAGTTCTCTGTTGATTGTCATGAACTTTAAAGGTTGTGAACCACTGACTTAAACAATTGAGGAACATTAAGGAACTTAACAGTATATTCTCTCAATAAAATGTTTGATTATTCTTACTAGCCCCTTAGGATAGTTACCAAAACAAAACTACTAATTTATACTCTTTTAAAACTACAAGTTGCCTGTGTTATGTAGGAGAGATATAAGATTGTGAAATTTTCCAAAAATACCCCTGCCACACCCCTAACAGACAGGAGATGATAAGAGTAAAAGAGAGAATTGGGAGGCTTTAGAAGCCCTATTGTAATATTTCTTAAAACACATTATTTATTGCTAAATATATTTAGTGTGAGCCATGTAAAAATTGCCATTTTTGGAGGTCAAAAACCGTTGAATATCAGCAATTTAGTATGGTTCAATTTATGTGGTTTCTAATCAAATGTACAAAGTACAATAATTTGGTTTCATACTTGATTTTTTATTTCTTAGGCTGAGTCATTAAAAGTTGATCAGTGATATATGGCAAAATTATTTAAAGTGCTTGAGGAATTGGGGAGACTTGCTGCTTATGTGATCAGATATACAAAAATAAAGTTTTTCATTTCATTAAATATTTGATTAAAACTGCTAAGTTTGAAAATGTTTACCAAGAAATAGTTTTCAGAAATTTTGCTAACTTGATAAAGGCTTTTGAGCAATATCTTTCAGACTTTTTACTAATTAAATAGGGA"

In [195]:
infile = "reads.txt"

# K-mer approach

In [232]:
K = 10
IVAL = 2

In [233]:
def kmer_iter(text, k, step, ival):
    num_kmers = (len(text) - k * ival)//step + 1
    for i in range(num_kmers):
        kmer = text[i*step:i*step+k*ival:ival]
        yield kmer

In [234]:
def get_kmer_set(text, k, step, ival):
    kmers = set()
    for kmer in kmer_iter(text, k, step, ival):
        kmers.add(kmer)
    return kmers

In [235]:
ref_kmers = get_kmer_set(ref_seq, k=K, step=1, ival=IVAL)
alt_kmers = get_kmer_set(alt_seq, k=K, step=1, ival=IVAL)

In [245]:
from collections import defaultdict
def kmer_count(text, k, step, ival):
    kmer_counts = defaultdict(int)
    num_kmers = (len(text) - k)//step + 1
    for kmer in kmer_iter(text, k, step, ival):
        kmer_counts[kmer] += 1
    return kmer_counts

In [246]:
def calc_score(kmer_counts, kmer_set):
    common = set(kmer_counts.keys()).intersection(kmer_set)
    counts = [kmer_counts[kmer] for kmer in common]
    score = sum(counts)
    return score

In [253]:
%%time
with open(infile) as reads:
    ref_count = 0
    alt_count = 0
    amb_count = 0  # Ambiguous count
    for read in reads:
        kmer_counts = kmer_count(read, k=K, step=1, ival=IVAL)
        score_ref = calc_score(kmer_counts, ref_kmers)
        score_alt = calc_score(kmer_counts, alt_kmers)
        if score_ref > score_alt:
            ref_count += 1
        elif score_ref < score_alt:
            alt_count += 1
        else:
            amb_count += 1
print(ref_count, alt_count, amb_count, alt_count/(alt_count + ref_count))

21 36 2 0.631578947368
CPU times: user 7.45 ms, sys: 3.2 ms, total: 10.7 ms
Wall time: 8.01 ms


# Local alignment approach

In [94]:
alphabet = ['A', 'C', 'G', 'T']
score = [[0, 4, 2, 4, 8],
         [4, 0, 4, 2, 8],
         [2, 4, 0, 4, 8],
         [4, 2, 4, 0, 8],
         [8, 8, 8, 8, 8]]

In [150]:
import numpy as np
def local_aln_score(t, p):

    # Create distance matrix
    D = np.zeros((len(p)+1,len(t)+1), dtype=np.int)
    
    # Initialize first row
    for i in range(1, len(t)+1):
        D[0,i] = 0
    
    # Initialize first column
    for i in range(1, len(p)+1):
        D[i,0] = D[i-1,0] + score[alphabet.index(p[i-1])][-1]
        
    # Fill rest of the matrix
    for i in range(1, len(p)+1):
        for j in range(1, len(t)+1):
            distHor = D[i,j-1] + score[-1][alphabet.index(t[j-1])]
            distVer = D[i-1,j] + score[alphabet.index(p[i-1])][-1]
            distDiag = D[i-1,j-1] + score[alphabet.index(p[i-1])][alphabet.index(t[j-1])]
            D[i][j] = min(distHor, distVer, distDiag)
    
    # Return min of bottom row
    return min(D[-1])

In [154]:
%%time
with open(infile) as reads:
    ref_count = 0
    alt_count = 0
    amb_count = 0  # Ambiguous count
    for read in reads:
        read = read.rstrip()
        score_ref = local_aln_score(ref_seq, read)
        score_alt = local_aln_score(alt_seq, read)
        if score_ref > score_alt:
            ref_count += 1
        elif score_ref < score_alt:
            alt_count += 1
        else:
            amb_count += 1
print(ref_count, alt_count, amb_count, alt_count/(alt_count + ref_count))

22 36 1 0.620689655172
CPU times: user 45.9 s, sys: 1.12 s, total: 47 s
Wall time: 46.2 s
