In [3]:
# Calculate edit distance use recursive approach
# very slow
def editDistRecursive(x, y):
    
    # let algorithm know when to stop
    if len(x)==0:
        return len(y)
    elif len(y)==0:
        return len(x)
    else:
        distHor = editDistRecursive(x[:-1], y) + 1
        distVer = editDistRecursive(x, y[:-1]) + 1
        if x[-1] == y[-1]:
            distDiag = editDistRecursive(x[:-1], y[:-1]) # NOTE: x[:-1] not x
        else:
            distDiag = editDistRecursive(x[:-1], y[:-1]) + 1
        
        return min(distHor, distVer, distDiag)

In [1]:
# Use dynamic programming: calculate edit distance cell by cell for a matrix
# Very fast!

def editDistance(x, y):
    D = [] # initiate empty matrix
    for i in range(len(x)+1):
        D.append([0]*(len(y)+1))
    for i in range(len(x)+1): # Initiate first column numbers
        D[i][0] = i
    for i in range(len(y)+1): # Initiate first row numbers
        D[0][i] = i
    
    for i in range(len(x)+1):
        for j in range(len(y)+1):
            distHor = D[i][j-1] + 1
            distVer = D[i-1][j] + 1
        if x[i-1] == y[j-1]:
            distDiag = D[i][j]
        else:
            distDiag = D[i][j] + 1
        D[i][j] = min(distHor, distVer, distDiag)

    return D[-1][-1] # Edit distance is the value in the bottom right corner of the matrix  

In [None]:
%%time
x = 'shake spea'
y = 'Shakespearerer'
editDistRecursive(x, y)

In [8]:
%%time
x = 'shake spea'
y = 'Shakespear'
editDistance(x,y)

CPU times: user 83 µs, sys: 1e+03 ns, total: 84 µs
Wall time: 88.2 µs


1

In [5]:
x = [1,2,3,4]
y=[1,2,3,4]
D = []
for i in range(len(x)+1):
    D.append([0]*(len(y)+1))
    print(D)

[[0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]


In [6]:
D

[[0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0]]

### 3.02_GlobalAlignment

In [12]:
# Define penalty weights
# transitions (A-G or C-T): 2
# transversions: 4
# deletion2/insertions: 8
alphabet = ['A','C','G','T']
score = [[0, 4, 2, 4, 8],
        [4, 0, 4, 2, 8],
        [2, 4, 0, 4, 8],
        [4, 2, 4, 0, 8],
        [8, 8, 8, 8, 8]]

In [13]:
# converts from character to its offset in list alphabet
alphabet.index('A')

0

In [15]:
# penalty associated with A (from X) mismatching with T (from Y)
score[alphabet.index('A')][alphabet.index('T')]

4

In [16]:
# penalty associated with C (from X) being deleted in Y
score[alphabet.index('C')][-1]

8

In [24]:
# Build on editDistance() function
def globalAlignment(x, y):
    D = [] # initiate empty matrix
    for i in range(len(x)+1):
        D.append([0]*(len(y)+1))
        
    # if last row and column numbers are not equal (e.g.,not all 8s)
    # add specific codes rather than 8 (see notes)
    for i in range(1, len(x)+1): # Initiate first column numbers
        D[i][0] = D[i-1][0] + 8 # or: + score[alphabet.index(x[i-1])][-1]
    for j in range(1, len(y)+1): # Initiate first row numbers
        D[0][j] = D[0][j-1] + 8 # or: + score[-1][alphabet.index(x[i-1])]
    print(D)
    # Fill the rest
    for i in range(1, len(x)+1):
        for j in range(1, len(y)+1):
            distHor = D[i][j-1] + 8 #  or: + score[-1][alphabet.index(x[i-1])]
            distVer = D[i-1][j] + 8 #  or: + score[alphabet.index(x[i-1])][-1]
            distDiag = D[i-1][j-1] + score[alphabet.index(x[i-1])][alphabet.index(y[j-1])]
            D[i][j] = min(distHor, distVer, distDiag)

    return D[-1][-1] # Edit distance is the value in the bottom right corner of the matrix  

In [25]:
x = 'TATGTCATGC'
y = 'TATGGCATGC'
print(globalAlignment(x,y))

[[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80], [8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [56, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
4


### 3.03_FindingOverlaps

In [3]:
def overlap(a, b, min_length=3):
    """Find the longest overlap between suffix of 'a' and prefix of 'b'
    if no overlap return 0"""
    start = 0 # start searching from the left
    while True: # Search the next occurrence of prefix of b in a
        start = a.find(b[:min_length], start) # find the first match from left. Returns -1 if no match
        #print(start)
        if start == -1:
            return 0
        if b.startswith(a[start:]): # Check if rest of 'a' (to the right) matches with prefix of 'b'
                                    # a[start:] can be longer than  b[:min_length]
            return len(a)-start #length of the overlap
        start += 1
        

In [4]:
overlap('TTACGTCGTGT', 'CGTGTGC')

5

In [28]:
from itertools import permutations

list(permutations([1,2,3],2))

[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]

In [32]:
def naive_overlap_map(reads, k):
    """Find overlaps among every pair of reads"""
    overlaps = {} 
    for a, b in permutations(reads,2):
        overlap_length = overlap(a, b, min_length=k)
        if overlap_length > 0:
            overlaps[(a, b)] = overlap_length
    return overlaps

In [33]:
reads = ['ACGGATC', 'GATCAAGT', 'TTCACGGA']
print(naive_overlap_map(reads, 3))

3
-1
-1
-1
3
-1
{('ACGGATC', 'GATCAAGT'): 4, ('TTCACGGA', 'ACGGATC'): 5}


### 4.01_ShortestCommonSuperstring

In [1]:
# brute force method using permutations
# If ss contains n strings, n!(n factorial) orderings possible
from itertools import permutations

def scs(ss):
    """Return the shortest common superstring of a set of strings (ss)
    Assuming no string is a substring of another"""
    shortest = None
    for perms in permutations(ss): # loop through all permutations of the substrings
        sup = perms[0] # initiate scs with the first substring of the permutation
        
        for i in range(len(ss)-1): # for each permutation, find overlaps of every two substrings and concatenate in order
            overlap_length = overlap(perms[i], perms[i+1], min_length=1)
            sup += perms[i+1][overlap_length:] # concatenate prev substring and the non-overlapping part (should not concatenate the overlapping part) of the next substring
        if shortest is None or len(shortest) > len(sup): # Note the indentation here. Should be same as the for loop above
            shortest = sup
    return shortest
        

In [5]:
scs(['ACGGTACGAGC', 'GAGCTTCGGA', 'GACACGG'])

'GACACGGTACGAGCTTCGGA'

In [7]:
scs(['CCT','CTT','TGC','TGG','GAT','ATT'])

'CCTTGGATTGC'

In [11]:
len(scs(['ATT','CCT','CTT','TGC','TGG','GAT']))

11

### 4.02_GreedySCS

In [49]:
# write a helper function first to find max suffix-prefix overlap
def pick_maximal_overlap(reads, k):
    """ Return a pair of reads from the list with a
    maximal suffix/prefix overlap >= k.  Returns
    overlap length 0 if there are no such overlaps."""
    reada, readb = None,None
    best_olen = 0
    for a, b in permutations(reads,2): # note permutation size of 2 instead of 1 in the scs function
        olen = overlap(a, b, min_length=k)
        if olen > best_olen:
            reada, readb = a, b
            best_olen = olen
    return reada, readb, best_olen

In [50]:
# Greedy SCS
def greedy_scs(reads, k):
    """ Greedy shortest-common-superstring merge.
    Repeat until no edges (overlaps of length >= k)
    remain. """
    reada, readb, best_olen = pick_maximal_overlap(reads, k)
    
    while best_olen > 0:
        reads.remove(reada)
        reads.remove(readb)
        reads.append(reada + readb[best_olen:]) # merge reads and remove reada, readb from list
        reada, readb, best_olen = pick_maximal_overlap(reads, k)
    return ''.join(reads) # concatenate if there are multiple 'nodes' left at the end (not overlaped)

In [51]:
greedy_scs(['ABC', 'BCA', 'CAB'], 2)

'CABCA'

In [52]:
# Example when greedy scs does not return the shortest common substring
# because it merged ABCD and BCDA first due to largest overlap, 
# then have to concatenate with CDBC
greedy_scs(['ABCD', 'CDBC', 'BCDA'], 1)

'CDBCABCDA'

In [53]:
# Compare with brute force:
scs(['ABCD', 'CDBC', 'BCDA'])

'ABCDBCDA'

### 4.03_DeBruijn

In [56]:
# function to create nodes and edges for the De Bruijn graph
def de_bruijn_ize(st, k): # input: string and size of kmer
    edges = []
    nodes = set()
    for i in range(len(st) - k + 1): # NOTE: range(len(st) - k + 1)
        edges.append((st[i:i+k-1],st[i+1:i+k])) # append the two (k-1)-mers for the first k mer
        nodes.add(st[i:i+k-1]) # NOTE: use 'add' not 'append' for set object
        nodes.add(st[i+1:i+k])
    return nodes, edges
    

In [57]:
nodes, edges = de_bruijn_ize("ACGCGTCG", 3)

In [58]:
nodes

{'AC', 'CG', 'GC', 'GT', 'TC'}

In [59]:
edges

[('AC', 'CG'),
 ('CG', 'GC'),
 ('GC', 'CG'),
 ('CG', 'GT'),
 ('GT', 'TC'),
 ('TC', 'CG')]

 #### Visualize De Bruijn graph

In [60]:
# function to pass nodes and edges to dot

def visualize_de_bruijn(st, k):
    """ Visualize a directed multigraph using graphviz """
    nodes, edges = de_bruijn_ize(st, k)
    dot_str = 'digraph "DeBruijn graph" {\n'
    for node in nodes:
        dot_str += '  %s [label="%s"] ;\n' % (node, node)
    for src, dst in edges:
        dot_str += '  %s -> %s ;\n' % (src, dst)
    return dot_str + '}\n'

In [62]:
# might have to do this first:
%install_ext https://raw.github.com/cjdrake/ipython-magic/master/gvmagic.py
%load_ext gvmagic

# Errors. How to fix this??

UsageError: Line magic function `%install_ext` not found.


In [68]:
%dotstr visualize_de_bruijn("ACGCGTCG", 3)

UsageError: Line magic function `%dotstr` not found.
