In [2]:
# IUPAC nucleotide ambiguity codes: each code letter maps to the list of
# concrete bases (A/C/G/T) it can stand for.
IUPAC = {
    # Codes covering two bases.
    "W": ["A", "T"],
    "R": ["A", "G"],
    "K": ["G", "T"],
    "S": ["G", "C"],
    "Y": ["T", "C"],
    "M": ["A", "C"],
    # Codes covering three bases.
    "B": ["G", "C", "T"],
    "D": ["G", "A", "T"],
    "H": ["C", "A", "T"],
    "V": ["C", "A", "G"],
    # Any base.
    "N": ["C", "A", "G", "T"],
}

# NOTE(review): presumably the uniform background probability of a single
# base (1/4); it is not referenced by the cells visible here -- confirm.
b_k = 0.25

In [3]:
def BuildProfile(motif):
	"""Build a 4 x k profile matrix of per-column base frequencies.

	Every column starts with a count of 1 for each base (Laplace
	pseudocounts), so no frequency is ever exactly zero or one.

	Args:
		motif: non-empty sequence of strings. The length of the first
			string sets the number of columns k; every other string must
			be at least that long.

	Returns:
		A list of four rows, for A, C, G and T in that order, each a list
		of k floats giving that base's frequency in each column.
	"""
	bases = "ACGT"
	width = len(motif[0])
	profile = [[0 for _ in range(width)] for _ in bases]
	for col in range(width):
		# Start each tally at 1: the Laplace pseudocount.
		tally = dict.fromkeys(bases, 1)
		for kmer in motif:
			symbol = kmer[col]
			# Symbols other than A/C/G/T are not counted.
			if symbol in tally:
				tally[symbol] += 1
		total = sum(tally.values())
		for row, base in enumerate(bases):
			profile[row][col] = float(tally[base]) / total
	return profile

In [4]:
def singleReplacementMotif(motifs, dna_i):
  """Randomly draw one k-mer from dna_i, weighted by the profile of motifs.

  A profile is built from `motifs` with BuildProfile. Every k-mer of
  `dna_i` gets a weight equal to the product of the profile frequencies of
  its bases, and one k-mer is sampled with probability proportional to its
  weight.

  Args:
    motifs: non-empty list of k-mers; k is the length of the first one.
    dna_i: string to draw the replacement k-mer from.

  Returns:
    The sampled k-mer, a length-k substring of dna_i.

  Raises:
    ValueError: if dna_i is shorter than k and so contains no k-mer.
  """
  k = len(motifs[0])
  numKmers = len(dna_i) - k + 1
  if numKmers < 1:
    # Without this guard every loop below is empty and the final slice
    # fails with an UnboundLocalError on the never-assigned index.
    raise ValueError(
      "dna_i (length %d) is shorter than the motif length k=%d"
      % (len(dna_i), k))

  profile = BuildProfile(motifs)
  rowOfBase = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

  # Calculate the (unnormalized) probability of each k-mer in dna_i.
  kmerDensities = []
  for start in range(numKmers):
    prob = 1
    for j in range(k):
      row = rowOfBase.get(dna_i[start + j])
      # Symbols other than A/C/G/T leave the product unchanged.
      if row is not None:
        prob *= profile[row][j]
    kmerDensities.append(prob)

  # Normalize, then build the prefix sums used for the lookup.
  normalizationTot = sum(kmerDensities)
  cumulative = list(accumulate(
    [density / normalizationTot for density in kmerDensities]))

  # Randomly select a k-mer: the first one whose cumulative probability
  # exceeds the draw. Float rounding can leave the last prefix sum just
  # below 1.0, so the last k-mer is the fallback.
  randVal = random.random()
  chosen = numKmers - 1
  for start, upperBound in enumerate(cumulative):
    if randVal < upperBound:
      chosen = start
      break

  return dna_i[chosen:chosen + k]

In [5]:
def accumulate(iterable, func=operator.add):
	"""Yield the running results of folding func over iterable.

	The first item is yielded unchanged; every later value is
	func(previous_result, item). An empty iterable yields nothing.

	Examples:
		accumulate([1,2,3,4,5]) --> 1 3 6 10 15
		accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
	"""
	running = None
	started = False
	for item in iterable:
		if started:
			running = func(running, item)
		else:
			# The first item seeds the running value.
			running = item
			started = True
		yield running

In [6]:
def hammingDistance(str1, str2):
  """Count the positions at which str1 and str2 hold different symbols.

  Positions are paired with zip, so when the lengths differ only the
  overlapping prefix is compared.
  """
  return sum(1 for left, right in zip(str1, str2) if left != right)

In [7]:
def score(motifs):
	"""Score motifs as their total Hamming distance to the consensus string.

	The consensus holds, for each column, the most frequent of A, C, G
	and T among the motifs (ties resolved in that order). Lower scores
	mean the motifs agree more closely.

	Args:
		motifs: non-empty list of strings; the length of the first one
			sets the number of columns.

	Returns:
		The sum over all motifs of hammingDistance(motif, consensus).
	"""
	width = len(motifs[0])
	consensus = []
	for col in range(width):
		tally = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
		for kmer in motifs:
			symbol = kmer[col]
			# Symbols other than A/C/G/T never enter the consensus.
			if symbol in tally:
				tally[symbol] += 1
		# max() returns the first maximal item, so ties resolve in the
		# order A, C, G, T.
		consensus.append(max("ACGT", key=tally.get))
	consensus = "".join(consensus)

	return sum(hammingDistance(kmer, consensus) for kmer in motifs)

In [8]:
def GibbsSampler(dna, k, N):
  """Search for one k-mer motif per strand using Gibbs sampling.

  Starts from a random k-mer in every strand. On each iteration one strand
  is picked at random and its k-mer is resampled by singleReplacementMotif
  from the profile of the other strands' motifs. The lowest-scoring motif
  set seen (as measured by score) is remembered.

  Args:
    dna: list of strings, each at least k long.
    k: motif length.
    N: number of resampling iterations to perform.

  Returns:
    A two-element list [bestMotifs, bestMotifsScore].
  """
  t = len(dna)

  # Initial motifs: one random k-mer per strand.
  Motifs = []
  for strand in dna:
    start = random.randrange(len(strand) - k + 1)
    Motifs.append(strand[start:start + k])

  # Take a snapshot rather than an alias: Motifs is mutated in place below,
  # and an alias would let the "best" motifs drift away from the score
  # recorded for them whenever no iteration improves on the initial set.
  bestMotifs = list(Motifs)
  bestMotifsScore = score(bestMotifs)

  # range(N) performs exactly N iterations (range(1, N) ran only N - 1).
  for _ in range(N):
    i = random.randrange(t)
    subsetMotifs = Motifs[:i] + Motifs[i + 1:]
    Motifs[i] = singleReplacementMotif(subsetMotifs, dna[i])

    # Score the candidate once and reuse the value.
    currentScore = score(Motifs)
    if currentScore < bestMotifsScore:
      bestMotifs = list(Motifs)
      bestMotifsScore = currentScore
  return [bestMotifs, bestMotifsScore]

In [9]:
# Search parameters: motif length, and the N argument handed to GibbsSampler
# on every restart.
kmer_length, N = 8, 200
# Input strands searched for a shared motif.
dna = ['CGCCCCTCTCGGGGGTGTTCAGTAACCGGCCA', 'GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG', 'TAGTACCGAGACCGAAAGAAGTATACAGGCGT', 'TAGATCAAGTTTCAGGTGCACGTCGGTGAACC','AATCCACCAGCTCCACGTGCAATGTTGGCCTA']
# Best result so far as [motifs, score]; the infinite score guarantees the
# first run replaces this placeholder.
best_motifs = [None, float('inf')]

# Repeat the Gibbs sampler search 20 times and keep the lowest-scoring run.
# NOTE(review): no random seed is set in the cells visible here, so the
# printed result can differ between executions -- confirm whether an
# earlier cell seeds `random`.
for repeat in range(20):
  current_motifs = GibbsSampler(dna, kmer_length, N)
  # Index 1 of a GibbsSampler result is the score; lower is better.
  if current_motifs[1] < best_motifs[1]:
      best_motifs = current_motifs
# Print the best result as [motifs, score].
print(best_motifs)

[['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG'], 9]


- TCTCGGGG
- CCAAGGTG
- TACAGGCG
- TTCAGGTG
- TCCACGTG