# BA2B Find a Median String

In [1]:
def hamming_distance(s,t):
    cnt = 0
    for i,j in zip(s,t):
        cnt += (i!=j)
    return cnt

def minimum(kmer,dna,k):
    score = 0
    for seq in dna:
        mini = float('inf')
        for i in range(len(seq)-k+1):
            mini = min(mini,hamming_distance(kmer,seq[i:i+k]))
        score += mini
    return score

def ba2b(dna,k):
    dna = dna.split('\n')
    m = {}
    for seq in dna:
        for i in range(len(dna)-k+1):
            m[seq[i:i+k]] = minimum(seq[i:i+k],dna,k)
    ans = ''
    mini = float('inf')
    for k,v in zip(m.keys(),m.values()):
        if mini>v:
            mini = v
            ans = k
    return ans


k = 6

dna = '''GTGAATCTATATGGAGTACTCTACGTGGGGAACCGGTGGTGC
CTAAATCAGATCGGACGGATCGTTATGTCAGACTCAACCTGT
CTAGCAGCGCATCTAAATGCCACACCTACCATAATCTTTACA
AAGGCCCATTAGCTATATTTGTCTTGGTATAGGCGCTAGAAT
CGCAACCTATATCCTCACGATTTTAAAATTACCGATACCGCG
TATGATCGCAGCCTATATATATTCCAGCGTGCCTTCAATAGG
CTAAATTTCTCTTAGCAATGGCACTTCGAGCACGCTTCGTTG
ACACCACGGTGGAATCGTTCGCGTCTAAATTTCAGACGATAG
CATTCCCCATCCACCCCCCTAGATCCAAAAGCCAGTACTCTA
GTAACCCTAAATTTTCAGTAAATCAGCGTGGGACAATGACGG'''

print(ba2b(dna,k))

CTAAAT


# BA2C Find a Profile-most Probable k-mer in a String

In [2]:
def probability(kmer,pm):
    pro = 1
    for i in range(len(kmer)):
        pro *= pm[kmer[i]][i]
    return pro

def ba2c(dna,k,profile_matrix):
    best_pro = -1*float('inf')
    for i in range(len(dna)-k+1):
        pro = probability(dna[i:i+k],profile_matrix)
        if pro>best_pro:
            best_pro = pro
            ans = dna[i:i+k]
    return ans

dna = 'ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT'
k = 5
m = '''0.2 0.2 0.3 0.2 0.3
0.4 0.3 0.1 0.5 0.1
0.3 0.3 0.5 0.2 0.4
0.1 0.2 0.1 0.1 0.2'''

m = m.split('\n')
profile_matrix={
    'A':[float(i) for i in m[0].split(' ')],
    'C':[float(i) for i in m[1].split(' ')],
    'G':[float(i) for i in m[2].split(' ')],
    'T':[float(i) for i in m[3].split(' ')],
}

print(ba2c(dna,k,profile_matrix))

CCGAG


# BA2F Implement RandomizedMotifSearch

In [3]:
import random

def probability(kmer,pm,k):
    pro = 1
    for i in range(len(kmer)):
        pro *= pm[kmer[i]][i]
    return pro

def update_kmers(kmers,dna,k,pm):
    update = []
    for seq in dna:
        max_pro = -1*float('inf')
        ans = ''
        for i in range(len(seq)-k+1):
            pro = probability(seq[i:i+k],pm,k)
            if pro>max_pro:
                max_pro = pro
                ans = seq[i:i+k]
        update.append(ans)
    return update

def profile_matrix(kmers,k):
    pm = {
        'A':[1/4]*k,
        'C':[1/4]*k,
        'G':[1/4]*k,
        'T':[1/4]*k
    }
    for kmer in kmers:
        for i in range(len(kmer)):
            pm[kmer[i]][i] += 1/len(kmers)
    return pm

def score(kmers,k):
    score = 0
    for i in range(k):
        l = [j[i] for j in kmers]
        score += len(l)-max(l.count('A'),l.count('C'),l.count('G'),l.count('T'))
    return score

def ba2f(dna,k,n):
    final_list = []
    dna = dna.split('\n')
    for i in range(1001):
        kmers = []
        for seq in dna:
            # print(seq)
            i = random.choice(range(k))
            kmers.append(seq[i:i+k])
        min_score = float('inf')
        res = ''
        while True:
            pm = profile_matrix(kmers,k)
            kmers = update_kmers(kmers,dna,k,pm)
            sc = score(kmers,k)
            if sc<min_score:
                min_score=sc
                res = kmers
            else:
                final_list.append((res,min_score))
                break
    min_v = float('inf')
    answer = ''
    for i in final_list:
        if i[1]<min_v:
            min_v = i[1]
            answer = i[0]
    return answer

k = 8
n = 5
dna = '''CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA'''

for i in ba2f(dna,k,n):
    print(i)

TCTCGGGG
CCAAGGTG
TACAGGCG
TTCAGGTG
TCCACGTG


# BA2H Implement DistanceBetweenPatternAndStrings

In [4]:
def hamming_distance(s,t):
    cnt = 0
    for i,j in zip(s,t):
        cnt += (i!=j)
    return cnt

def ba2h(kmer,dna):
    score = 0
    dna = dna.split(' ')
    k = len(kmer)
    for seq in dna:
        mini = float('inf')
        for i in range(len(seq)-k+1):
            # print(seq[i:i+k])
            mini = min(mini,hamming_distance(kmer,seq[i:i+k]))
        score += mini
    return score


kmer = 'AAA'
dna = 'TTACCTTAAC GATATCTGTC ACGGCGTTCG CCCTAAAGAG CGTCAGAGGT'

ba2h(kmer,dna)

5

# BA3E Construct the De Bruijn Graph of a Collection of k-mers

In [5]:
def ba3e(kmers):
    kmers = kmers.split('\n')
    m = {}
    for s in kmers:
        if s[:-1] not in m:
            m[s[:-1]] = []
        m[s[:-1]].append(s[1:])
    for k,v in zip(m.keys(),m.values()):
        print(k,end=' -> ')
        for i in v[:-1]:
            print(i,end=',')
        print(v[-1])

kmers = '''GAGG
CAGG
GGGG
GGGA
CAGG
AGGG
GGAG'''

ba3e(kmers)

GAG -> AGG
CAG -> AGG,AGG
GGG -> GGG,GGA
AGG -> GGG
GGA -> GAG


# BA3G Find an Eulerian Path in a Graph

In [6]:
from collections import defaultdict

def ba3g(graph):
    in_deg = defaultdict(int)
    out_deg = defaultdict(int)
    for node in graph:
        out_deg[node] = len(graph[node])
    # print(out_deg)
    for node in graph:
        for neighbour in graph[node]:
            in_deg[neighbour]+=1
    # print(in_deg)
    start_node = graph[0]
    for node in graph:
        if in_deg[node]+1==out_deg[node]:
            start_node=node
    # print(start_node)
    def visit(node):
        while(graph[node]):
            visit(graph[node].pop())
        path.append(node)
    path = []
    visit(start_node)
    path = path[::-1]
    # print(path)
    for i in path[:-1]:
        print(i,end='->')
    print(path[-1])
    
graph = {
    0:[2],
    1:[3],
    2:[1],
    3:[0,4],
    4:[],
    6:[3,7],
    7:[8],
    8:[9],
    9:[6]
}

ba3g(graph)

6->7->8->9->6->3->0->2->1->3->4


# BA4B Find Substrings of a Genome Encoding a Given Amino Acid String

In [7]:

from Bio import Seq

def ba4b(dna,peptide):
    k = len(peptide)*3
    for i in range(len(dna)-k+1):
        s = Seq.Seq(dna[i:i+k]).transcribe().translate()
        t = Seq.Seq(dna[i:i+k]).reverse_complement().transcribe().translate()
        if s==peptide or t==peptide:
            print(dna[i:i+k])

dna = 'ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA'
peptide = 'MA'

ba4b(dna,peptide)

ATGGCC
GGCCAT
ATGGCC


# BA4C Generate the Theoretical Spectrum of a Cyclic Peptide

In [8]:
def ba4c(peptide):
    ans = []
    apss = []
    pep = peptide+peptide
    for k in range(1,len(peptide)):
        for i in range(len(peptide)):
            apss.append(pep[i:i+k])
    apss.append(peptide)
    m = {
        'G':57,'A':71,'S':81,'P':97,'V':99,'T':101,'C':103,'I':113,'L':113,'N':114,'D':115,
        'K':128,'Q':128,'E':129,'M':131,'H':137,'F':147,'R':156,'Y':163,'W':186
    }
    # print(m.values())
    for s in apss:
        cnt = 0
        for i in s:
            cnt+=m[i]
        ans.append(cnt)
    ans.append(0)
    return sorted(ans)

peptide = 'LEQN'
for i in ba4c(peptide):
    print(i,end=' ')

0 113 114 128 129 227 242 242 257 355 356 370 371 484 

# BA4D Compute the Number of Peptides of Given Total Mass

In [2]:
mass = [57, 71, 81, 97, 99, 101, 103, 113, 114, 115, 128, 129, 131, 137, 147, 156, 163, 186]

def ba4d(total,m):
    if total==0:
        return 1
    if m[total]!=-1:
        return m[total]
    sum = 0
    for i in mass:
        if i<= total:
            sum += ba4d(total-i,m)
    m[total] = sum
    return m[total]
        
    
total = 1024
m = [-1]*(total+1)

ba4d(total,m)

18921682231

# BA4E Find a Cyclic Peptide with Theoretical Spectrum Matching an Ideal Spectrum

# BA4F Compute the Score of a Cyclic Peptide Against a Spectrum

In [10]:
def mass(kmer):
    m = {
        'G':57,'A':71,'S':81,'P':97,'V':99,'T':101,'C':103,'I':113,'L':113,'N':114,'D':115,
        'K':128,'Q':128,'E':129,'M':131,'H':137,'F':147,'R':156,'Y':163,'W':186
    }
    cnt = 0
    for c in kmer:
        cnt+=m[c]
    return cnt

def ba4f(peptide,spec):
    ans = []
    spec = [int(i) for i in  spec.split(' ')]
    pep = peptide+peptide
    for k in range(1,len(peptide)):
        for i in range(len(pep)):
            ans.append(mass(pep[i:i+k]))
    ans.append(0)
    ans.append(mass(peptide))
    ans = sorted(ans)
    # print(ans)
    # print(spec)
    res = list(set(ans+spec))
    score = 0
    for i in res:
        score += min(ans.count(i),spec.count(i))
    return score



peptide = 'NQEL'
spec = '0 99 113 114 128 227 257 299 355 356 370 371 484'

ba4f(peptide,spec)

11

# BA4H Generate the Convolution of a Spectrum

In [11]:
def ba4h(spec):
    spec = [int(i) for i in spec.split(' ')]
    spec = sorted(spec)
    ans = []
    for i in range(1,len(spec)):
        # print(spec[i:])
        for j in spec[i:]:
                ans.append(j-spec[i-1])
    m = {}
    for i in ans:
        if i not in m:
            m[i]=0
        m[i]+=1
    m = dict(sorted(m.items(), key=lambda item: item[1],reverse=True))
    for k,v in zip(m.keys(),m.values()):
        for i in range(v):
                if k>0:
                    print(k,end=' ')

spec='0 137 186 323'
ba4h(spec)

137 137 186 186 323 49 