In [1]:
'''
First, we import all required packages.
'''
import math

import numpy as np
from hmmlearn import hmm
# Seed NumPy's global random number generator so that any later random draws
# are reproducible from one run of the notebook to the next.
np.random.seed(11)

In [2]:
'''
Next, we read in our sequence data from a CSV file. Each line contains one sequence.
'''
# TODO[1]: Put the correct name of the file containing the data for part 1 of module 4 here
FILENAME = 'module4_data_1.csv'

# Each line of the file holds one comma-separated sequence of state labels.
# The `with` block guarantees the file handle is closed as soon as reading is
# finished, instead of staying open until the garbage collector reclaims it.
data = []
with open(FILENAME) as data_file:
    for line in data_file:
        # Drop the trailing newline, then split the line into its labels.
        data.append(line.strip().split(','))
print(data)

[['cold', 'hot', 'warm', 'cold', 'hot', 'very cold', 'hot', 'very cold', 'hot', 'warm', 'hot', 'warm', 'very cold', 'warm', 'cold', 'hot', 'cold', 'very cold', 'cold', 'warm', 'very cold'], ['warm', 'hot', 'hot', 'hot', 'very cold', 'cold', 'hot', 'cold', 'cold', 'cold', 'warm', 'warm', 'warm', 'warm', 'hot', 'warm', 'cold', 'hot', 'warm', 'cold', 'warm'], ['cold', 'warm', 'cold', 'hot', 'hot', 'very cold', 'warm', 'warm', 'warm', 'hot', 'hot', 'warm', 'hot', 'cold', 'very cold', 'very cold', 'cold', 'warm', 'warm', 'warm', 'cold'], ['warm', 'warm', 'cold', 'warm', 'cold', 'hot', 'warm', 'warm', 'cold', 'cold', 'warm', 'cold', 'warm', 'warm', 'very cold', 'warm', 'hot', 'warm', 'cold', 'warm', 'cold'], ['very cold', 'warm', 'very cold', 'hot', 'cold', 'very cold', 'cold', 'very cold', 'very cold', 'warm', 'warm', 'warm', 'warm', 'warm', 'warm', 'cold', 'cold', 'hot', 'very cold', 'cold', 'very cold'], ['very cold', 'cold', 'cold', 'warm', 'cold', 'warm', 'very cold', 'hot', 'warm', 'co

In [3]:
'''
We now use maximum likelihood estimation to obtain the parameters of our Markov chain.
First, we count occurrences of individual states and state tuples.
'''
# Unigram counts: how often each state occurs in a position that has a
# successor. The last element of every sequence is skipped on purpose.
unigram_counts = {}
for sequence in data:
    for label in sequence[:-1]:
        unigram_counts[label] = unigram_counts.get(label, 0) + 1

# Print all unigram counts. Do you notice anything?
print(unigram_counts)

# Bigram counts: how often each ordered pair of neighbouring states occurs.
# A pair is stored under the key '<previous>_<current>'.
bigram_counts = {}
for sequence in data:
    for previous, current in zip(sequence, sequence[1:]):
        pair = previous + '_' + current
        bigram_counts[pair] = bigram_counts.get(pair, 0) + 1

# Print all bigram counts.
print(bigram_counts)

{'cold': 4828, 'hot': 4754, 'warm': 4725, 'very cold': 4733}
{'cold_hot': 1237, 'hot_warm': 1194, 'warm_cold': 1202, 'hot_very cold': 1166, 'very cold_hot': 1239, 'warm_hot': 1112, 'warm_very cold': 1187, 'very cold_warm': 1146, 'hot_cold': 1218, 'cold_very cold': 1215, 'very cold_cold': 1191, 'cold_warm': 1188, 'hot_hot': 1176, 'cold_cold': 1188, 'warm_warm': 1224, 'very cold_very cold': 1157}


In [4]:
'''
Now, let's use the counts to obtain the transition probabilities of our Markov chain.
'''
# Transition table: markovChain[a][b] is the estimated probability of moving
# from state a to state b, computed as count(a followed by b) / count(a).
# Rows and columns are filled in the order hot, warm, cold, very cold.
markovChain = {}
for origin in ('hot', 'warm', 'cold', 'very cold'):
    row = {}
    for target in ('hot', 'warm', 'cold', 'very cold'):
        row[target] = float(bigram_counts[origin + '_' + target]) / unigram_counts[origin]
    markovChain[origin] = row
print(markovChain)

{'hot': {'hot': 0.24737063525452252, 'warm': 0.2511569204880101, 'cold': 0.2562053007993269, 'very cold': 0.24526714345814052}, 'warm': {'hot': 0.23534391534391536, 'warm': 0.259047619047619, 'cold': 0.2543915343915344, 'very cold': 0.2512169312169312}, 'cold': {'hot': 0.25621375310687655, 'warm': 0.24606462303231152, 'cold': 0.24606462303231152, 'very cold': 0.2516570008285004}, 'very cold': {'hot': 0.2617789985210226, 'warm': 0.24212972744559477, 'cold': 0.2516374392562857, 'very cold': 0.24445383477709698}}


In [5]:
'''
Time for a sanity check: the probabilities of all bigrams with the same initial state should sum to 1.
'''
for key, value in markovChain.items():
    # Sum the row once and reuse it for both the printout and the check.
    row_total = sum(value.values())
    print(key)
    print(row_total)
    # Compare with a tolerance rather than `== 1`: adding floats accumulates
    # rounding error, so a correct row may sum to 0.9999999999999999 and an
    # exact comparison would wrongly report a problem.
    if math.isclose(row_total, 1.0, abs_tol=1e-9):
        print('Everything seems great!')
    else:
        print('There seems to be a problem. Please go back to the last cell and try again!')
    print('')

hot
1.0
Everything seems great!

warm
1.0
Everything seems great!

cold
1.0
Everything seems great!

very cold
1.0
Everything seems great!



In [6]:
'''
Now, we only need to obtain the initial probabilities.
'''

pi = {}
states = ['hot', 'warm', 'cold', 'very cold']

# Count how often each state appears as the first element of a sequence.
initial_counts = {}
for sequence in data:
    first = sequence[0]
    initial_counts[first] = initial_counts.get(first, 0) + 1

print(initial_counts)

# Initial probabilities: the share of sequences that start in each state.
# The total is computed once outside the loop, and a state that never starts
# a sequence gets probability 0 instead of raising a KeyError.
total_sequences = sum(initial_counts.values())
for state in states:
    pi[state] = float(initial_counts.get(state, 0)) / total_sequences
print(pi)

{'cold': 265, 'warm': 219, 'very cold': 246, 'hot': 222}
{'hot': 0.23319327731092437, 'warm': 0.23004201680672268, 'cold': 0.27836134453781514, 'very cold': 0.25840336134453784}


In [7]:
'''
Time for another sanity check: the probabilities in pi should sum to 1.
'''
# Sum the initial probabilities once and reuse the value below.
pi_total = sum(pi.values())
print(pi_total)
# Compare with a tolerance rather than `== 1`: floating-point rounding can
# leave a correct distribution a hair away from exactly 1.0.
if math.isclose(pi_total, 1.0, abs_tol=1e-9):
    print('Everything seems great!')
else:
    print('There seems to be a problem. Try again!')

1.0
Everything seems great!


In [8]:
'''
Finally, we will compute the probability of a couple of sequences.
'''

# TODO: compute the probabilities of
seq1 = ['hot', 'hot', 'warm', 'cold', 'warm', 'very cold']
seq2 = ['warm', 'hot', 'warm', 'cold', 'very cold', 'cold', 'cold']
seq3 = ['warm', 'cold']

# The probability of a sequence is the initial probability of its first state
# multiplied by the transition probability of every consecutive pair.
for observed in (seq1, seq2, seq3):
    likelihood = pi[observed[0]]
    for previous, current in zip(observed, observed[1:]):
        likelihood *= markovChain[previous][current]
    print(likelihood)


0.00022782955399117263
5.390032673569209e-05
0.05852074162998532
