## Derek Robinson

## Question 1

In [9]:
import numpy as np

def print_distance_matrix(D, seq_names):
    """
    Pretty-prints a distance matrix together with its row and column labels.

    The first printed line is a "-" placeholder followed by every name in
    seq_names. Each following line is one row of D prefixed by its name.
    Every printed item is followed by a single space.

    Parameters
    ----------
    D : list-like
        the 2d distance matrix to print; every row is read up to the width
        of the first row
    seq_names : list-like
        the names of the sequences, seq_names[i] labels row i and column i of D
    """
    header = "".join(str(name) + " " for name in seq_names)
    print("- " + header)
    row_count = len(D)
    if row_count == 0:
        return
    # the width of the first row decides how many cells are read from each row
    column_count = len(D[0])
    for row in range(row_count):
        label = str(seq_names[row]) + " "
        cells = "".join(str(D[row][col]) + " " for col in range(column_count))
        print(label + cells)

def find_minimum_coords(D):
    """
    Finds the coordinates of the smallest off-diagonal value in a 2d matrix.

    Diagonal cells are skipped by position (row == column) rather than by
    value, so a distance of 0 between two different rows is still a valid
    minimum. When several cells share the minimum, the first one met while
    scanning row by row is returned.

    Parameters
    ----------
    D : np.ndarray or list of lists
        the 2d matrix we are going to find the coordinates of the minimum for;
        every row is read up to the width of the first row
    Returns
    -------
    min_coord : tuple
        the coordinates of the minimum in D, sorted in ascending order;
        (0, 0) when D has no off-diagonal cells
    """
    current_min = float('inf')  # don't know the min yet so set it to infinity
    min_coord = (0, 0)
    column_count = len(D[0]) if len(D) else 0
    for i in range(len(D)):
        for j in range(column_count):
            if i == j:
                continue  # the distance of a cluster to itself is never a merge candidate
            if D[i][j] < current_min:
                current_min = D[i][j]
                min_coord = (i, j)
    # callers expect the smaller index first
    return tuple(sorted(min_coord))

def hamming_distance(string1, string2):
    """
    A helper function which returns the pairwise distance between two strings, also known as the hamming distance.

    The distance is the number of positions at which the two strings differ.
    It is only defined for strings of the same length, so unequal lengths are
    rejected instead of being silently truncated.

    References
    ----------
    https://en.wikipedia.org/wiki/Distance_matrices_in_phylogeny
    https://en.wikipedia.org/wiki/Hamming_distance
    Parameters
    ----------
    string1, string2 : str
        the strings we are going to calculate the hamming distance for
    Return
    ------
    dist_counter : int
        the distance between string1 and string2
    Raises
    ------
    ValueError
        if string1 and string2 do not have the same length
    """
    if len(string1) != len(string2):
        raise ValueError("Strings must be of equal length.")
    # each mismatching position contributes True (== 1) to the sum
    return sum(char1 != char2 for char1, char2 in zip(string1, string2))

def create_new_cluster(clusters, min_coord, D):
    """
    Merges the two clusters selected by min_coord into one entry of clusters.

    The cluster at min_coord[0] is renamed to its own name followed by the name
    of the cluster at min_coord[1] with every "S" removed (e.g. "S1" and "S3"
    become "S13"). The cluster at min_coord[1] is then deleted by position.
    The list is modified in place and also returned.

    Parameters
    ----------
    clusters : list
        a list of the current clusters
    min_coord : tuple
        a tuple that gives the coordinates in D for the minimum value
    D : np.ndarray
        A 2d array which holds the distance matrix for the UPGMA alg
        (not used here, kept so existing callers keep working)
    Returns
    -------
    clusters : list
        a list of the clusters with the newest one added
    """
    keep, drop = min_coord[0], min_coord[1]
    clusters[keep] = clusters[keep] + clusters[drop].replace("S", "")
    # delete by index: removing by value would hit the first equal name instead
    del clusters[drop]
    return clusters

def build_tree_nodes(nodes, min_coord, D, clusters):
    """
    Records the merge of the two clusters selected by min_coord as a tree node.

    The node is the tuple (first cluster name, second cluster name, height),
    where height is half of the distance stored in D at min_coord. The node
    is appended to nodes in place and the same list is returned.

    Parameters
    ----------
    nodes : list
        a list of tuples containing each of the nodes and their height in the tree
    min_coord : tuple
        a tuple that gives the coordinates in D for the minimum value
    D : np.ndarray
        A 2d array which holds the distance matrix for the UPGMA alg
    clusters : list
        a list of the current clusters
    Return
    ------
    nodes : list
        the list of nodes with the newly added node
    """
    first, second = min_coord[0], min_coord[1]
    # the new node sits halfway between the two clusters it joins
    node_height = D[first][second] / 2
    new_node = (clusters[first], clusters[second], node_height)
    nodes.append(new_node)
    return nodes

def build_final_tree(nodes, clusters):
    """
    Assembles the printable tree string from the recorded nodes and the final cluster.

    Nodes are processed in the order they were recorded. Each node wraps the
    string built so far inside its left branch and appends its right branch:
    "(left:length<previous tree>)(right:height)". For every node after the
    first, the left branch length is the node height minus the previous node's
    height; the right branch always shows the node height itself. Numbers are
    rendered by cutting their string form down to the first three characters.

    Parameters
    ----------
    nodes : list
        a list of tuples containing each of the nodes and their height in the tree
    clusters : list
        a list of the current clusters, when this function is called clusters only contains the final cluster
    Return
    ------
    tree : string
        the final phylogenetic tree, prefixed with the name of the final cluster
    """
    subtree = ""
    previous_height = 0
    for position, node in enumerate(nodes):
        left_name, right_name, node_height = node[0], node[1], node[2]
        # only nodes after the first subtract the height of the node below them
        left_length = node_height - previous_height if position >= 1 else node_height
        left_branch = "(" + left_name + ":" + str(left_length)[0:3] + subtree + ")"
        right_branch = "(" + right_name + ":" + str(node_height)[0:3] + ")"
        subtree = left_branch + right_branch
        previous_height = node_height
    return clusters[0] + subtree

def recalculate_distances(D, clusters, min_coord):
    """
    Recalculates the distance matrix D after a new cluster is created.

    The returned matrix has one row/column per entry of clusters, in the same
    order as clusters: the merged cluster stays at index min_coord[0], and the
    row/column at min_coord[1] is dropped. The distance from the merged cluster
    to any other cluster is the plain mean of the distances the two merged
    clusters had to it. All other distances are copied over, and the diagonal
    is set to 0. The input matrix is not modified.

    NOTE(review): the plain mean gives both merged clusters equal weight no
    matter how many sequences each one holds (a WPGMA-style update). A
    size-weighted UPGMA update needs the cluster sizes, which are not available
    through this interface.

    Parameters
    ----------
    D : np.ndarray
        A 2d array which holds the distance matrix for the UPGMA alg
    clusters : list
        a list of the current clusters, i.e. after the merge has been applied
    min_coord : tuple
        a tuple that gives the coordinates in D for the minimum value, i.e. the
        indices (in the old matrix) of the two clusters that were merged
    Return
    ------
    D : np.ndarray
        The distance matrix with recalculated values
    """
    old_D = np.asarray(D, dtype=float)
    merged, removed = min_coord[0], min_coord[1]
    size = len(clusters)
    # old row/column index of every surviving cluster: everything at or after
    # the removed position has shifted up by one
    old_index = np.array([k if k < removed else k + 1 for k in range(size)], dtype=int)
    # advanced indexing returns a fresh array, so old_D is left untouched
    new_D = old_D[np.ix_(old_index, old_index)]
    if size:
        merged_distances = (old_D[merged, old_index] + old_D[removed, old_index]) / 2
        new_D[merged, :] = merged_distances
        new_D[:, merged] = merged_distances
    np.fill_diagonal(new_D, 0)
    return new_D

def build_distance_matrix(seq_names, seqs):
    """
    Computes and returns the pairwise distance matrix for all sequences.

    Cell (i, j) holds hamming_distance(seqs[i], seqs[j]), stored as a float.
    Every cell is computed, including the diagonal and both triangles.

    Parameters
    ----------
    seq_names : list
        a list of the sequence names we are to build the phylogenetic tree for
        (not used by the computation)
    seqs : list
        a list of the sequences we are to build the phylogenetic tree for
    Returns
    -------
    D : numpy.ndarray
        the pairwise distance matrix, of shape (len(seqs), len(seqs))
    """
    count = len(seqs)
    # every cell is overwritten below, so an uninitialised array is enough
    distances = np.empty((count, count), dtype=float)
    for row, first_seq in enumerate(seqs):
        for col, second_seq in enumerate(seqs):
            distances[row][col] = hamming_distance(first_seq, second_seq)
    return distances

def UPGMA(D, seq_names):
    """
    Clusters the sequences step by step and returns the resulting tree string.

    Starting with one cluster per sequence name, each round finds the closest
    pair in the distance matrix, records it as a tree node, merges the pair
    into one cluster and recomputes the matrix. The rounds stop when a single
    cluster is left.

    Parameters
    ----------
    D : np.ndarray
        the pairwise distance matrix, with rows/columns in the order of seq_names
    seq_names : list
        the names of the sequences; the list itself is not modified
    Returns
    -------
    tree : string
        the tree produced by build_final_tree
    """
    active_clusters = list(seq_names)  # work on a copy of the sequence names
    recorded_nodes = []
    while len(active_clusters) != 1:
        closest_pair = find_minimum_coords(D)
        # record the node first: the merge below renames one of the two clusters
        recorded_nodes = build_tree_nodes(recorded_nodes, closest_pair, D, active_clusters)
        active_clusters = create_new_cluster(active_clusters, closest_pair, D)
        # calculate the distance from the new cluster to every other cluster
        D = recalculate_distances(D, active_clusters, closest_pair)
    return build_final_tree(recorded_nodes, active_clusters)

def is_there_multiple_trees(D):
    """
    Returns "YES" or "NO" depending on if there are multiple optimal trees.
    In other words, returns "YES" when the minimum distance value occurs more
    than once in D and "NO" otherwise.

    Only the cells below the diagonal are inspected, so each pair of sequences
    is counted once and the diagonal is never considered. Only the matrix
    passed in is checked; ties that appear after clusters are merged are not
    detected here.

    Parameters
    ----------
    D : list-like
        the 2d distance matrix to inspect
    Returns
    -------
    answer : str
        "YES" if the minimum is tied, "NO" if it is unique (or D has fewer than two rows)
    """
    current_min = float('inf')  # don't know the min yet so set it to infinity
    occurrences = 0  # how many cells hold the current minimum
    for i in range(len(D)):
        for j in range(i):
            value = D[i][j]
            if value < current_min:
                current_min = value
                occurrences = 1  # a new minimum invalidates ties counted for the old one
            elif value == current_min:
                occurrences += 1
    return "YES" if occurrences > 1 else "NO"

Next we handle file I/O

In [10]:

with open('3.in', 'r') as f:
    # drop the newline and the '>' marker from every line of the file
    lines = [raw.replace('\n', '').replace('>', '') for raw in f.readlines()]
    # the file alternates between a sequence name and the sequence itself
    seq_names = lines[0::2]  # even positions: the names of the sequences
    seqs = lines[1::2]  # odd positions: the sequences themselves

We are ready to answer now
## a)

In [11]:
# Build the pairwise distance matrix for the sequences read above, then print it with labels
D = build_distance_matrix(seq_names, seqs)
print_distance_matrix(D, seq_names)

- S1 S2 S3 S4 S5 
S1 0.0 6.0 4.0 12.0 11.0 
S2 6.0 0.0 5.0 13.0 11.0 
S3 4.0 5.0 0.0 12.0 10.0 
S4 12.0 13.0 12.0 0.0 11.0 
S5 11.0 11.0 10.0 11.0 0.0 


## b)

In [12]:
# Cluster the sequences from the distance matrix and print the resulting tree string
print(UPGMA(D, seq_names))

S13254(S1325:0.5(S132:2.6(S13:0.7(S1:2.0)(S3:2.0))(S2:2.7))(S5:5.3))(S4:5.8)


## c)

In [13]:
# Print "YES" or "NO" for whether the distance matrix allows more than one optimal tree
print(is_there_multiple_trees(D))

NO


## Question 2
## a)
This question involves performing Fitch's algorithm on two different trees. We will begin with the left tree.  
![](2a-left.png)  
The total cost (or length) of the left tree is 2

Next we will do the right tree.  
![](2a-right.png)  
Which also has total cost (or length) of 2

## b)

The final cost (or length) of this tree is 2  
![](2b.png)

## Question 3
## a) Left tree  
![](3a-1.png)
![](3a-2-1.png)
![](3a-2-2.png)

## a) Right tree  
![](3a-3-1.png)
![](3a-3-2.png)

## b)  

A stochastic local search approach could be used to find the maximum likelihood tree. Starting from a tree, we rearrange some of its internal nodes in order to produce a tree with a higher likelihood; this is the local search portion of the algorithm. Once the optimal arrangement for those nodes has been found, we move on to the next set of nodes and find an optimal arrangement of those. This cycle continues until all nodes have been optimally rearranged.
