In [10]:
# Max. Likelihood Est.(Discrete-MultinomialHMM) for DNA
import numpy as np
from hmmlearn import hmm
from collections import Counter

In [11]:
neucleotide_mapping = {'A':0, 'C':1, 'G':2, 'T':3}
train_sequences = ["ACGTAGCT", "CGTAGCTA", "GATCGTAC"]
train_states = ["GGNNGGNN", "NNGGNNGG", "GGNNGGNN"]

In [12]:
observed_sequences = [np.array([neucleotide_mapping[nuc] for nuc in seq]).reshape(-1,1) for seq in train_sequences]
state_mapping = {'G':0, 'N':1}
state_sequences = [np.array([state_mapping[state] for state in states]) for states in train_states]

In [13]:
model = hmm.MultinomialHMM(n_components=2, n_iter=100)

MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


In [16]:
x_train = np.concatenate(observed_sequences)
lengths = [len(seq) for seq in observed_sequences]

model.fit(x_train, lengths)

test_sequence = "GTACGTA"
test_observed = np.array([neucleotide_mapping[nuc] for nuc in test_sequences]).reshape(-1,1)

predicted_states = model.predict(test_observed)

predicted_labels = ''.join(['G' if s == 0 else 'N' for s in predicted_states])

print("\nTest DNA Sequence: ", test_sequence)
print("Predicted Gene Regions: ", predicted_labels)

Even though the 'startprob_' attribute is set, it will be overwritten during initialization because 'init_params' contains 's'
Even though the 'transmat_' attribute is set, it will be overwritten during initialization because 'init_params' contains 't'



Test DNA Sequence:  GTACGTA
Predicted Gene Regions:  GNGNGNG
