Derek Robinson

In [1]:
import numpy as np
import itertools as it
import pandas as pd

In [2]:
"""
saves the sequence from the input file into a tuple
"""
with open("4.in", 'r') as f:
    lines = f.readlines()
# The sequence is taken from the second line of the input file (index 1);
# presumably the first line is a header -- confirm against the input format.
# readlines() keeps line terminators, so strip surrounding whitespace: otherwise
# a trailing "\n" becomes an extra observation symbol that the emission table
# has no entry for.
sequence = lines[1].strip()
sequence_tuple = tuple(sequence)

In [3]:
"""
This cell contains the HMM provided in the Assignment 4 description PDF
Adapted from: https://en.wikipedia.org/wiki/Viterbi_algorithm#Example
"""
# Observation symbols (one per sequence character) and the hidden state labels.
obs = sequence_tuple
states = ("H", "L")

# Transition probabilities, keyed as trans_p[from_state][to_state].
# "S" is the start state, so trans_p["S"] holds the initial state distribution.
trans_p = {
    "S": dict(H=0.5, L=0.5),
    "H": dict(H=0.6, L=0.4),
    "L": dict(H=0.5, L=0.5),
}

# Emission probabilities, keyed as emit_p[state][symbol].
emit_p = {
    "H": dict(A=0.2, C=0.3, G=0.3, T=0.2),
    "L": dict(A=0.3, C=0.2, G=0.2, T=0.3),
}

In [14]:
def log_viterbi(obs, states, trans_p, emit_p):
    """
    Fills the log2 Viterbi DP matrix for an HMM and summarises its columns.

    Returns the following:
        * V - the log Viterbi DP matrix, shape (len(states) + 1, len(obs) + 1),
              rounded to 3 decimals. Row 0 / column 0 belong to the start state;
              row i (i >= 1) belongs to states[i - 1].
        * path - for every column, the state with the highest score (the first
                 state in `states` wins a tie)
        * multiple_paths - "YES" if at least one column has a tie, else "NO"
        * num_ties - the number of columns in which a tie occurs
    Params
        * obs - the observed symbols, in order
        * states - the states of the HMM
        * trans_p - the HMM state transition probabilities; trans_p["S"] holds the
                    probabilities of leaving the start state
        * emit_p - the HMM state emission probabilities
    Notes:
        * NOTE(review): `path` is assembled from per-column maxima, not from a
          back-pointer traceback, so it can differ from the true Viterbi path.
        * An empty `obs` returns the initialised matrix, an empty path, "NO" and 0.
    References:
        * https://www.cis.upenn.edu/~cis262/notes/Example-Viterbi-DNA.pdf
    """
    n_states = len(states)
    n_obs = len(obs)
    V = np.zeros((n_states + 1, n_obs + 1), dtype=float)
    path = ""
    multiple_paths = "NO"
    num_ties = 0

    # log2(1) = 0 for "in the start state before any symbol"; every other cell of
    # the start row / start column is impossible, i.e. log2(0) = -inf.
    # (-np.inf is used because the np.NINF alias no longer exists in NumPy 2.)
    V[1:, 0] = -np.inf
    V[0, 1:] = -np.inf

    if n_obs == 0:
        return V, path, multiple_paths, num_ties

    # Initialisation: leave the start state and emit the first symbol.
    first_symbol = obs[0]
    for row, state in enumerate(states, start=1):
        V[row][1] = np.log2(trans_p["S"][state]) + np.log2(emit_p[state][first_symbol])
    # The first column is rounded before the recursion reads it.
    V[1:, 1] = np.round(V[1:, 1], 3)

    # Recursion: best predecessor score plus the emission score of the current symbol.
    for col in range(2, n_obs + 1):
        base = obs[col - 1]
        for row, state in enumerate(states, start=1):
            max_prob = max(
                V[prev_row][col - 1] + np.log2(trans_p[prev_state][state])
                for prev_row, prev_state in enumerate(states, start=1)
            )
            V[row][col] = np.log2(emit_p[state][base]) + max_prob

    # Rounding to 3 decimals is what makes the equality-based tie check below work.
    V[1:] = np.round(V[1:], 3)

    for col in range(1, n_obs + 1):
        column_scores = [V[row][col] for row in range(1, n_states + 1)]
        best_score = max(column_scores)
        winners = [idx for idx, score in enumerate(column_scores) if score == best_score]
        path += states[winners[0]]
        if len(winners) > 1:
            multiple_paths = "YES"
            num_ties += 1

    return V, path, multiple_paths, num_ties

def print_viterbi_table(V, obs, states):
    """
    Prints the Viterbi DP matrix in the layout requested by the assignment pdf.

    The first line is a header ("- 0" followed by the observed symbols); each
    following line is a row label (0 for the start row, otherwise the state name)
    followed by that row's values. Every printed item is followed by one space.
    """
    # part b
    n_cols = len(obs) + 1

    header_cells = "".join(str(symbol) + " " for symbol in obs)
    print("- 0 " + header_cells)

    for row_idx in range(len(states) + 1):
        label = 0 if row_idx == 0 else states[row_idx - 1]
        row_cells = "".join(str(V[row_idx][col_idx]) + " " for col_idx in range(n_cols))
        print(str(label) + " " + row_cells)

def probability_of_most_probable_path(V, obs, states):
    """
    Returns the probability of the most likely path through the HMM.

    The best log2 score in the last column of V is converted back to a plain
    probability and reduced to 3 significant digits.
    Params:
        * V - the log viterbi DP matrix
        * obs - the observed symbols (only the length is used)
        * states - the states of the HMM (only the count is used)
    """
    last_col = len(obs)
    final_scores = [V[row][last_col] for row in range(1, len(states) + 1)]
    best_log_score = np.max(final_scores)
    # undo the log2 transform, then keep 3 significant digits
    probability = np.power(2, best_log_score)
    return float('{:.3g}'.format(probability))

def argmax(a, b):
    """
    Two-value analogue of np.argmax() that also reports ties.

    Returns 0 when a is larger, 1 when b is larger, and -1 when neither is
    strictly larger than the other.
    """
    if a > b:
        return 0
    return 1 if b > a else -1

def all_optimal_paths(V, num_ties, states=("H", "L")):
    """
    Prints the number of best-scoring paths and then every such path.

    For each column of V (excluding column 0) the states whose score equals the
    column maximum are collected; the printed paths are all combinations of one
    such state per column. The count is printed first, followed by one path per
    line, each followed by a blank line.
    Params:
        * V - the viterbi DP matrix; row i (i >= 1) must belong to states[i - 1]
        * num_ties - the number of ties in V. Kept for backward compatibility; the
                     count is now derived from V itself, so this value is not used.
        * states - the state labels, in row order (defaults to the assignment's HMM)
    Notes:
        * NOTE(review): paths are combinations of per-column maxima, not the
          result of a back-pointer traceback -- confirm this matches the intended
          definition of "optimal path".
    References:
        * https://stackoverflow.com/questions/27974126/how-to-get-all-combinations-of-length-n-in-python
    """
    # Column 0 is the start column and carries no observation.
    n_cols = len(V[0]) - 1

    best_states = []
    for col in range(1, n_cols + 1):
        column_scores = [V[row][col] for row in range(1, len(states) + 1)]
        best_score = max(column_scores)
        best_states.append(
            [state for state, score in zip(states, column_scores) if score == best_score]
        )

    # One choice per column; tied columns contribute several choices each.
    all_paths = ["".join(choice) for choice in it.product(*best_states)]

    print(str(len(all_paths)) + "\n")
    for p in all_paths:
        print(p + "\n")

def forward(states, obs, trans_p, emit_p):
    """
    Computes the forward algorithm DP matrix (plain probabilities, not logs).

    Returns:
        * F - matrix of shape (len(states) + 1, len(obs) + 1). Row 0 / column 0
              belong to the start state (F[0][0] = 1); F[i][t] for i >= 1 is the
              forward value of states[i - 1] after the first t symbols.
    Params:
        * states - the states of the HMM
        * obs - the observed symbols, in order
        * trans_p - the HMM state transition probabilities; trans_p["S"] holds the
                    probabilities of leaving the start state
        * emit_p - the HMM state emission probabilities
    Notes:
        * Works for any number of states; an empty `obs` returns the initialised
          matrix.
    References:
        * https://www.cis.upenn.edu/~cis262/notes/Example-Viterbi-DNA.pdf
        * https://stackoverflow.com/questions/51933830/using-multiple-variables-in-a-for-loop-in-python
    """
    F = np.zeros((len(states) + 1, len(obs) + 1), dtype=float)
    F[0][0] = 1

    if len(obs) == 0:
        return F

    # Initialisation: leave the start state and emit the first symbol.
    for row, state in enumerate(states, start=1):
        F[row][1] = emit_p[state][obs[0]] * trans_p["S"][state]

    # Recursion: sum over every predecessor state, then emit the current symbol.
    for col in range(2, len(obs) + 1):
        base = obs[col - 1]
        for row, state in enumerate(states, start=1):
            incoming = sum(
                F[prev_row][col - 1] * trans_p[prev_state][state]
                for prev_row, prev_state in enumerate(states, start=1)
            )
            F[row][col] = emit_p[state][base] * incoming
    return F

def backward(states, obs, trans_p, emit_p):
    """
    Computes the backward algorithm DP matrix (plain probabilities, not logs).

    Returns:
        * B - matrix of shape (len(states), len(obs)). B[i][t] is the backward
              value of states[i] at 0-based position t; the last column is all 1.
              Unlike the forward matrix there is no start row / start column.
    Params:
        * states - the states of the HMM
        * obs - the observed symbols, in order
        * trans_p - the HMM state transition probabilities
        * emit_p - the HMM state emission probabilities
    Notes:
        * Works for any number of states; an empty `obs` returns an empty matrix.
    """
    B = np.zeros((len(states), len(obs)), dtype=float)

    if len(obs) == 0:
        return B

    # Termination: nothing is left to emit after the final symbol.
    B[:, -1] = 1

    # Recursion, right to left: move to each successor state and emit the next symbol.
    for col in range(len(obs) - 2, -1, -1):
        base = obs[col + 1]
        for row, state in enumerate(states):
            B[row][col] = sum(
                B[next_row][col + 1] * trans_p[state][next_state] * emit_p[next_state][base]
                for next_row, next_state in enumerate(states)
            )
    return B

def posterior_prob(obs, states, pos, trans_p, emit_p):
    """
    Computes the posterior probability of every state at one sequence position.

    Returns:
        * post_probs - list with one entry per state (in the order of `states`):
                       forward * backward / P(obs) at position `pos`, rounded to
                       3 decimals
    Params:
        * obs - the observed symbols, in order
        * states - the states of the HMM
        * pos - the 1-based position we want the posterior probabilities for
        * trans_p - the HMM state transition probabilities
        * emit_p - the HMM state emission probabilities
    Raises:
        * ValueError - if pos is outside 1..len(obs)
    """
    if not 1 <= pos <= len(obs):
        # Without this check pos <= 0 would silently wrap around to the end of
        # the backward matrix.
        raise ValueError("pos must be between 1 and len(obs)")

    # Both matrices are independent of the state being queried, so build them once.
    F = forward(states, obs, trans_p, emit_p)
    B = backward(states, obs, trans_p, emit_p)
    # P(obs): sum of the last forward column.
    p_x = F.sum(axis = 0)[len(obs)]

    post_probs = []
    for i in range(1, len(states) + 1):
        # F has an extra start row/column, B does not, hence the index shift.
        fki = F[i][pos]
        bki = B[i - 1][pos - 1]
        result = (fki*bki)/p_x
        post_probs.append(np.round(result, 3))
    return post_probs


## a)

In [15]:
# P(obs): sum the last column of the forward matrix (one entry per row) and
# round the total to 8 decimals.
F = forward(states, obs, trans_p, emit_p)
np.round(F.sum(axis = 0)[len(obs)], 8)

3.77e-06

The following code cell creates a tuple containing all return values from the `log_viterbi()` function.  
These values will be used to answer parts **b**, **c**, **d**, **e**, and **g**

In [6]:
# Tuple layout: (V, path, multiple_paths, num_ties), in the order log_viterbi() returns them.
viterbi_tuple = log_viterbi(obs, states, trans_p, emit_p)

## b)

In [7]:
# The Viterbi table is in log (base 2) format
print_viterbi_table(viterbi_tuple[0], obs, states)

- 0 G G C A C T G A A 
0 0.0 -inf -inf -inf -inf -inf -inf -inf -inf -inf 
H -inf -2.737 -5.211 -7.685 -10.744 -13.218 -16.277 -18.751 -21.809 -24.868 
L -inf -3.322 -6.381 -8.855 -10.744 -14.066 -16.277 -19.599 -21.809 -24.546 


## c)

In [13]:
# As there are multiple optimal paths through the Viterbi table, this answer is not exactly what the assignment pdf specifies.
# However, part g shows that the optimal path HHHHHHHLL is among the paths found.
# viterbi_tuple[1] is the path string returned by log_viterbi().
print(viterbi_tuple[1])

HHHHHHHHL


## d)

In [9]:
# viterbi_tuple[0] is the log2 Viterbi matrix; the helper converts the best
# final score back to a plain probability.
probability_of_most_probable_path(viterbi_tuple[0], obs, states)

4.08e-08

## e)

In [10]:
# viterbi_tuple[2] is the "YES"/"NO" multiple-paths flag from log_viterbi().
print(viterbi_tuple[2])

YES


## f)

In [11]:
# Posterior probabilities at position 4 (1-based); part_f[0] belongs to
# states[0] and part_f[1] to states[1].
part_f = posterior_prob(obs, states, 4, trans_p, emit_p)
print(np.format_float_scientific(part_f[0]))
print(np.format_float_scientific(part_f[1]))

4.74e-01
5.26e-01


## g)

In [12]:
# Arguments: the log Viterbi matrix and the tie count returned by log_viterbi().
all_optimal_paths(viterbi_tuple[0], viterbi_tuple[3])

8

HHHHHLHHL

HHHHHLHLL

HHHHHHHLL

HHHHHHHHL

HHHLHHHLL

HHHLHHHHL

HHHLHLHHL

HHHLHLHLL



note: part g) is not a general solution and will only work for the HMM and sequence specified in the assignment pdf