In [2]:
import numpy as np

def read_fasta_file(filename):
    """
    Reads the given FASTA file f and returns a dictionary of sequences.

    Lines starting with ';' in the FASTA file are ignored.
    """
    sequences_lines = {}
    current_sequence_lines = None
    with open(filename) as fp:
        for line in fp:
            line = line.strip()
            if line.startswith(';') or not line:
                continue
            if line.startswith('>'):
                sequence_name = line.lstrip('>')
                current_sequence_lines = []
                sequences_lines[sequence_name] = current_sequence_lines
            else:
                if current_sequence_lines is not None:
                    current_sequence_lines.append(line)
    sequences = {}
    for name, lines in sequences_lines.items():
        sequences[name] = ''.join(lines)
    return sequences

In [3]:
a=read_fasta_file("true-ann1.fa")

def map_truestate_to_3statemarkov(strn):
    out=[]
    for i in strn:
        if i == 'C':
            out.append(0)
        elif i == 'N':
            out.append(1)
        elif i == 'R':
            out.append(2)
        else:
            raise Exception("Input should be a either N, C or R")
    return out

def map_truestate_to_7statemarkov(strn):
    out=[None]*len(strn)
    for i in range(len(strn)):
        if strn[i] == 'C':
            if out[i-1]==0:
                out[i]=(1)
            elif out[i-1]==1:
                out[i]=(2)
            else:
                out[i]=(0)
        elif strn[i] == 'N':
            out[i]=(3)
        elif strn[i] == 'R':
            if out[i-1]==4:
                out[i]=(5)
            elif out[i-1]==5:
                out[i]=(6)
            else:
                out[i]=(4)
        else:
            raise Exception("Input should be a either N, C or R")
    return out


In [4]:
def translate_observations_to_indices(obs):
    mapping = {'a': 0, 'c': 1, 'g': 2, 't': 3}
    return [mapping[symbol.lower()] for symbol in obs]

def translate_indices_to_observations(indices):
    mapping = ['a', 'c', 'g', 't']
    return ''.join(mapping[idx] for idx in indices)

class hmm:
    def __init__(self, init_probs, trans_probs, emission_probs):
        self.init_probs = init_probs
        self.trans_probs = trans_probs
        self.emission_probs = emission_probs



init_probs_3_state = np.array(
    [0.00, 1.00, 0.00]
)

trans_probs_3_state = np.array([
    [0.90, 0.10, 0.00],
    [0.05, 0.90, 0.05],
    [0.00, 0.10, 0.90],
])

emission_probs_3_state = np.array([
    #   A     C     G     T
    [0.40, 0.15, 0.20, 0.25],
    [0.25, 0.25, 0.25, 0.25],
    [0.20, 0.40, 0.30, 0.10],
])

hmm_3_state = hmm(init_probs_3_state,
                  trans_probs_3_state,
                  emission_probs_3_state)

init_probs_7_state = np.array(
    [0.00, 0.00, 0.00, 1.00, 0.00, 0.00, 0.00]
)



trans_probs_7_state = np.array([
    [0.90, 0.00, 0.00, 0.10, 0.00, 0.00, 0.00],
    [0.00, 1.00, 0.00, 0.00, 0.00, 0.00, 0.00],
    [0.00, 0.00, 1.00, 0.00, 0.00, 0.00, 0.00],
    [0.05, 0.00, 0.00, 0.90, 0.05, 0.00, 0.00],
    [0.00, 0.00, 0.00, 0.10, 0.90, 0.00, 0.00],
    [0.00, 0.00, 0.00, 0.00, 0.00, 1.00, 0.00],
    [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 1.00],    
])

emission_probs_7_state = np.array([
    #   A     C     G     T
    [0.30, 0.25, 0.25, 0.20],
    [0.20, 0.35, 0.15, 0.30],
    [0.40, 0.15, 0.20, 0.25],
    [0.25, 0.25, 0.25, 0.25],
    [0.20, 0.40, 0.30, 0.10],
    [0.30, 0.20, 0.30, 0.20],
    [0.15, 0.30, 0.20, 0.35],
])

hmm_7_state = hmm(init_probs_7_state,
                  trans_probs_7_state,
                  emission_probs_7_state)


In [22]:
import math
def log(x):
    if x == 0:
        return float("-inf")
    else:
        return math.log(x)

#Fill out the blanks to get the Viterbi algorithm:

def viterbi(obs, hmm):
    X = translate_observations_to_indices(obs)
    N = len(X)
    K = len(hmm.init_probs)
    V = np.zeros((K,N))

    init_probs=np.log(hmm.init_probs)
    trans_probs=np.log(hmm.trans_probs)
    emission_probs=np.log(hmm.emission_probs)


    for i in range(K):
        V[i][0]=hmm.init_probs[i]
    for i in range(1, N):
        if i%1000000==0:
            print(i) 
        for n in range(K):
            simian = np.argmax([V[x][i-1] for x in range(K)])
            E = emission_probs[n][X[i-1]]
            T = trans_probs[simian][n]
            Vau = V[simian][i-1]
            #print(simian,Vau,[[V[x][i-1]] for x in range(K)])
            #print(E,T,Vau)
            V[n][i]= E + T + Vau
        # Implement the Viterbi algorithm
    return V


viterbi(read_fasta_file("genome1.fa")["genome1"],hmm_7_state)


  init_probs=np.log(hmm.init_probs)
  trans_probs=np.log(hmm.trans_probs)


1000000


array([[ 0.00000000e+00, -3.60517019e+00, -5.09682506e+00, ...,
        -2.76320029e+06, -2.76320138e+06, -2.76320305e+06],
       [ 0.00000000e+00,            -inf,            -inf, ...,
                   -inf,            -inf,            -inf],
       [ 0.00000000e+00,            -inf,            -inf, ...,
                   -inf,            -inf,            -inf],
       ...,
       [ 0.00000000e+00, -4.29831737e+00, -5.78997224e+00, ...,
        -2.76320098e+06, -2.76320178e+06, -2.76320287e+06],
       [ 0.00000000e+00,            -inf,            -inf, ...,
                   -inf,            -inf,            -inf],
       [ 0.00000000e+00,            -inf,            -inf, ...,
                   -inf,            -inf,            -inf]])