# Project for Algorithms in Molecular Biology

In [1]:
import numpy as np
import random
import math
from random import randint, uniform    
import operator

In [None]:
# IUPAC ambiguity codes: each one-letter code maps to the list of concrete
# nucleotides it may stand for.
IUPAC = {
    "W": ["A", "T"],  # weak
    "R": ["A", "G"],  # purine
    "K": ["G", "T"],  # keto
    "S": ["G", "C"],  # strong
    "Y": ["T", "C"],  # pyrimidine
    "M": ["A", "C"],  # amino
    "B": ["G", "C", "T"],  # not A
    "D": ["G", "A", "T"],  # not C
    "H": ["C", "A", "T"],  # not G
    "V": ["C", "A", "G"],  # not T
    "N": ["C", "A", "G", "T"],  # any base
}

# Background probability of a single nucleotide; BuildPWM divides every
# profile entry by it before taking log2. 0.25 means four equally likely bases.
b_k = 0.25

In [None]:
def hammingDistance(str1, str2):
    """Return the number of positions at which str1 and str2 differ.

    The two sequences are compared pairwise with zip(), so when their
    lengths differ only the common prefix is compared and the surplus
    characters of the longer one are ignored.
    """
    return sum(1 for left, right in zip(str1, str2) if left != right)

In [None]:
def BuildProfile(motif):
    """Build a 4 x k nucleotide frequency profile from a list of k-mers.

    Rows are ordered A, C, G, T and k is the length of the first k-mer.
    Every count starts at 1 (Laplace pseudocount), so no entry of the
    profile is ever exactly zero or one. Characters other than A, C, G, T
    are not counted.

    Returns:
        A list of four lists of floats; each column sums to 1.
    """
    bases = "ACGT"
    width = len(motif[0])
    profile = [[0] * width for _ in bases]

    for pos in range(width):
        column = [seq[pos] for seq in motif]
        # One pseudocount per base on top of the observed occurrences.
        tallies = [column.count(base) + 1 for base in bases]
        total = sum(tallies)
        for row, tally in enumerate(tallies):
            profile[row][pos] = float(tally) / total

    return profile

In [None]:
def BuildPWM(profile, background=None):
    """Convert a probability profile into a log-odds position weight matrix.

    Every entry p of the profile is replaced by log2(p / background).

    Args:
        profile: rows of per-position probabilities, e.g. the 4 x k matrix
            returned by BuildProfile.
        background: background probability of a nucleotide. When omitted,
            the module-level constant b_k is used.

    Returns:
        A list of lists of floats with the same shape as profile. A
        probability of exactly 0 is mapped to -inf (the limit of the
        log-odds) instead of raising.

    Raises:
        ValueError: if background is not positive, or if a probability is
            negative (raised by math.log2).
    """
    if background is None:
        background = b_k
    if background <= 0:
        raise ValueError("background probability must be positive")

    neg_inf = float("-inf")
    return [
        [neg_inf if p == 0 else math.log2(p / background) for p in row]
        for row in profile
    ]

In [None]:
def score(motifs):
    """Return the total Hamming distance of the motifs to their consensus.

    The consensus string takes, at every position, the most frequent of
    A, C, G, T among the motifs; ties are resolved in the order A, C, G, T.
    The result is the sum of hammingDistance(motif, consensus) over all
    motifs, so lower values mean more similar motifs.
    """
    width = len(motifs[0])

    consensus_chars = []
    for pos in range(width):
        column = [seq[pos] for seq in motifs]
        # max() keeps the first maximal element, which reproduces the
        # A > C > G > T preference on ties.
        consensus_chars.append(max("ACGT", key=column.count))
    consensus = "".join(consensus_chars)

    return sum(hammingDistance(seq, consensus) for seq in motifs)

In [None]:
def selection(motifs, dna):
    """Draw one k-mer from every strand, favouring windows close to its motif.

    For strand i, every window of length k = len(motifs[0]) is compared with
    motifs[i] by Hamming distance and receives the roulette-wheel weight
    ``k + 1 - distance``. A window identical to the motif is therefore the
    most likely pick, while even a complete mismatch keeps a small, non-zero
    chance, so the total weight can never be zero.

    Args:
        motifs: current motifs, one per strand, all of the same length k.
        dna: the DNA strands; dna[i] is paired with motifs[i].

    Returns:
        A list with exactly one k-mer (str) per strand, in strand order.

    Raises:
        ValueError: if a strand is shorter than k.
    """
    k = len(motifs[0])

    newMotifs = []
    for index, dna_string in enumerate(dna):
        windows = [dna_string[i:i + k] for i in range(len(dna_string) - k + 1)]
        if not windows:
            raise ValueError(
                "strand %d is shorter than the motif length %d" % (index, k)
            )

        # Lower distance -> larger weight; the +1 keeps every weight positive.
        weights = [
            k + 1 - hammingDistance(window, motifs[index]) for window in windows
        ]

        # random.choices does the cumulative-weight lookup itself, so a
        # strand can no longer be skipped because of float round-off.
        newMotifs.append(random.choices(windows, weights=weights)[0])

    return newMotifs

In [None]:
def crossover(p1, p2):
    """Recombine two parent strings at one random cut point.

    The cut point pt is drawn uniformly from 1 .. len(p1) - 2, so both sides
    of the cut are non-empty. The first child is the head of p1 followed by
    the tail of p2, the second child the head of p2 followed by the tail of
    p1.

    Args:
        p1: first parent sequence (anything sliceable and concatenable,
            normally a str).
        p2: second parent sequence.

    Returns:
        A two-element list [c1, c2]. Parents shorter than three characters
        have no cut point in that range and are returned unchanged.
    """
    if len(p1) < 3:
        return [p1, p2]

    pt = randint(1, len(p1) - 2)
    c1 = p1[:pt] + p2[pt:]
    c2 = p2[:pt] + p1[pt:]
    return [c1, c2]

In [None]:
def mutation(child):
    """Return a copy of child with one randomly chosen base substituted.

    A position is drawn uniformly and the character there is replaced by a
    different one of A, C, G, T, so the result always differs from child in
    exactly one position. Strings are immutable, so the mutated sequence is
    returned rather than modified in place.

    Args:
        child: nucleotide string to mutate.

    Returns:
        The mutated string; an empty child is returned unchanged.
    """
    if len(child) == 0:
        return child

    ind = randint(0, len(child) - 1)
    # Exclude the current base so the mutation is never silent.
    alternatives = [base for base in "ACGT" if base != child[ind]]
    return child[:ind] + random.choice(alternatives) + child[ind + 1:]

In [None]:
def GSGA(dna, k, N):
    """Run one generation of the genetic motif search and report the best motifs.

    A random k-mer is drawn from every strand to form the initial motif set.
    selection() then draws a second k-mer per strand, every (motif, selected
    k-mer) pair is recombined with crossover(), and each child is passed
    through mutation().

    NOTE(review): the search is unfinished. The children are collected but
    not yet scored or fed back into the population, and N is accepted but not
    used, so the motifs reported are the randomly drawn initial ones.

    Args:
        dna: list of DNA strings, each at least k characters long.
        k: motif length.
        N: intended number of iterations (currently unused).

    Returns:
        [bestMotifs, bestMotifsScore]: the best motif set found (one k-mer
        per strand) and its score() value; lower scores are better.

    Raises:
        ValueError: if dna is empty or k does not fit into every strand.
    """
    t = len(dna)
    if t == 0:
        raise ValueError("dna must contain at least one strand")
    if k < 1 or any(len(strand) < k for strand in dna):
        raise ValueError(
            "k must be between 1 and the length of the shortest strand"
        )

    # Initial population: one uniformly chosen k-mer per strand.
    Motifs = []
    for strand in dna:
        start = random.randrange(len(strand) - k + 1)
        Motifs.append(strand[start:start + k])

    # Copy, so later changes to Motifs cannot alter the recorded best set.
    bestMotifs = list(Motifs)
    bestMotifsScore = score(bestMotifs)

    # Mating pool: entries [0, t) are the current motifs, entries [t, 2t)
    # the k-mers selected from the same strands.
    motifsSelection = np.concatenate((Motifs, selection(Motifs, dna)))

    # Create the next generation.
    children = []
    for i in range(t):
        p1, p2 = motifsSelection[i], motifsSelection[i + t]
        for c in crossover(p1, p2):
            children.append(mutation(c))

    # TODO: score the children, replace the population with them and repeat
    # for N generations, updating bestMotifs whenever the score improves.

    return [bestMotifs, bestMotifsScore]

In [None]:
# Search parameters: motif length and the iteration budget handed to GSGA.
kmer_length = 8
N = 200

# Input strands to search for a shared motif.
dna = [
    'CGCCCCTCTCGGGGGTGTTCAGTAACCGGCCA',
    'GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG',
    'TAGTACCGAGACCGAAAGAAGTATACAGGCGT',
    'TAGATCAAGTTTCAGGTGCACGTCGGTGAACC',
    'AATCCACCAGCTCCACGTGCAATGTTGGCCTA',
]

# [motifs, score] placeholder; it is initialised here but not updated by the
# call below.
best_motifs = [None, float('inf')]

GSGA(dna, kmer_length, N)