In [1]:
training_data = [
    [("The", "DET"), ("dog", "NOUN"), ("barks", "VERB")],
    [("The", "DET"), ("cat", "NOUN"), ("meows", "VERB")],
    [("A", "DET"), ("dog", "NOUN"), ("runs", "VERB")],
    [("The", "DET"), ("dog", "NOUN"), ("runs", "VERB")],
    [("A", "DET"), ("cat", "NOUN"), ("sleeps", "VERB")],
    [("The", "DET"), ("big", "ADJ"), ("dog", "NOUN"), ("barks", "VERB")],
    [("A", "DET"), ("small", "ADJ"), ("cat", "NOUN"), ("meows", "VERB")],
    [("The", "DET"), ("dog", "NOUN"), ("sleeps", "VERB")],
    [("The", "DET"), ("cat", "NOUN"), ("runs", "VERB")],
    [("A", "DET"), ("big", "ADJ"), ("dog", "NOUN"), ("sleeps", "VERB")],
    [("The", "DET"), ("small", "ADJ"), ("cat", "NOUN"), ("runs", "VERB")],
    [("Dogs", "NOUN"), ("bark", "VERB")],
    [("Cats", "NOUN"), ("meow", "VERB")],
    [("The", "DET"), ("dog", "NOUN"), ("in", "ADP"), ("the", "DET"), ("house", "NOUN"), ("barks", "VERB")],
    [("A", "DET"), ("cat", "NOUN"), ("on", "ADP"), ("the", "DET"), ("mat", "NOUN"), ("sleeps", "VERB")],
]

In [2]:
POS_list = []
for i in range(len(training_data)):
    for j in range(len(training_data[i])):
        if training_data[i][j][1] not in POS_list:
            POS_list.append(training_data[i][j][1])

print(POS_list)

['DET', 'NOUN', 'VERB', 'ADJ', 'ADP']


In [3]:
def initial_prob(training_data):
    prob = [1 for _ in range(len(POS_list))]
    initial_prob_dict = dict(zip(POS_list, prob))
    for i in range(len(training_data)):
        initial_prob_dict[training_data[i][0][1]] += 1
    for tag, prob in initial_prob_dict.items():
        initial_prob_dict[tag]= prob/(len(training_data) + len(POS_list))
    return initial_prob_dict

i_p = initial_prob(training_data)
print(i_p)

{'DET': 0.7, 'NOUN': 0.15, 'VERB': 0.05, 'ADJ': 0.05, 'ADP': 0.05}


In [4]:
def tag_count(training_data):
    tag_count_dict = {tag: 0 for tag in POS_list}
    for i in range(len(training_data)):
        for j in range(len(training_data[i])):
            if training_data[i][j][1] in POS_list:
                tag_count_dict[training_data[i][j][1]] += 1
    return tag_count_dict

tag_count_result = tag_count(training_data)
print(tag_count_result)

{'DET': 15, 'NOUN': 17, 'VERB': 15, 'ADJ': 4, 'ADP': 2}


In [5]:
def transition_prob(training_data, tag_count_dict):
    transition_prob_dict = {t1: {t2: 1 for t2 in POS_list} for t1 in POS_list}
    for i in range(len(training_data)):
        for j in range(1, len(training_data[i])):
            current = training_data[i][j][1]
            previous = training_data[i][j - 1][1]
            transition_prob_dict[previous][current] += 1
    for tag_p, tag_c in transition_prob_dict.items():
        for tag, prob in tag_c.items():
            tag_c[tag] = prob/(tag_count_dict[tag_p] + len(POS_list))
    return transition_prob_dict

transition_prob_result = transition_prob(training_data, tag_count(training_data))
for i, j in transition_prob_result.items():
    print(f"'{i}': {j}")

'DET': {'DET': 0.05, 'NOUN': 0.6, 'VERB': 0.05, 'ADJ': 0.25, 'ADP': 0.05}
'NOUN': {'DET': 0.045454545454545456, 'NOUN': 0.045454545454545456, 'VERB': 0.7272727272727273, 'ADJ': 0.045454545454545456, 'ADP': 0.13636363636363635}
'VERB': {'DET': 0.05, 'NOUN': 0.05, 'VERB': 0.05, 'ADJ': 0.05, 'ADP': 0.05}
'ADJ': {'DET': 0.1111111111111111, 'NOUN': 0.5555555555555556, 'VERB': 0.1111111111111111, 'ADJ': 0.1111111111111111, 'ADP': 0.1111111111111111}
'ADP': {'DET': 0.42857142857142855, 'NOUN': 0.14285714285714285, 'VERB': 0.14285714285714285, 'ADJ': 0.14285714285714285, 'ADP': 0.14285714285714285}


In [6]:
def unique_words(training_data):
    unique_words_list = []
    for i in range(len(training_data)):
        for j in range(len(training_data[i])):
            if training_data[i][j][0] not in unique_words_list:
                unique_words_list.append(training_data[i][j][0])
    return unique_words_list

vocab_list = unique_words(training_data)
print(vocab_list)

['The', 'dog', 'barks', 'cat', 'meows', 'A', 'runs', 'sleeps', 'big', 'small', 'Dogs', 'bark', 'Cats', 'meow', 'in', 'the', 'house', 'on', 'mat']


In [7]:
def emission_prob(training_data, tag_count_dict, unique_words_list):
    emission_prob_dict = {t1: {w: 1 for w in unique_words_list} for t1 in POS_list}
    for i in range(len(training_data)):
        for j in range(len(training_data[i])):
            word = training_data[i][j][0]
            tag = training_data[i][j][1]
            if word in emission_prob_dict[tag]:
                emission_prob_dict[tag][word] += 1
    for tag, word_probs in emission_prob_dict.items():
        for word, prob in word_probs.items():
            emission_prob_dict[tag][word] = prob/(tag_count_dict[tag] + len(unique_words_list))
    return emission_prob_dict

emission_prob_result = emission_prob(training_data, tag_count(training_data), vocab_list)
for i, j in emission_prob_result.items():
    print(f"'{i}': {j}")

'DET': {'The': 0.2647058823529412, 'dog': 0.029411764705882353, 'barks': 0.029411764705882353, 'cat': 0.029411764705882353, 'meows': 0.029411764705882353, 'A': 0.17647058823529413, 'runs': 0.029411764705882353, 'sleeps': 0.029411764705882353, 'big': 0.029411764705882353, 'small': 0.029411764705882353, 'Dogs': 0.029411764705882353, 'bark': 0.029411764705882353, 'Cats': 0.029411764705882353, 'meow': 0.029411764705882353, 'in': 0.029411764705882353, 'the': 0.08823529411764706, 'house': 0.029411764705882353, 'on': 0.029411764705882353, 'mat': 0.029411764705882353}
'NOUN': {'The': 0.027777777777777776, 'dog': 0.2222222222222222, 'barks': 0.027777777777777776, 'cat': 0.19444444444444445, 'meows': 0.027777777777777776, 'A': 0.027777777777777776, 'runs': 0.027777777777777776, 'sleeps': 0.027777777777777776, 'big': 0.027777777777777776, 'small': 0.027777777777777776, 'Dogs': 0.05555555555555555, 'bark': 0.027777777777777776, 'Cats': 0.05555555555555555, 'meow': 0.027777777777777776, 'in': 0.027

In [18]:
import math

class HMMPOSTagger:
    def __init__(self):
        self.POS =[]
        self.vocab = []
        self.initial_prob = {}
        self.transition_prob = {}
        self.emission_prob = {}
        self.tag_count = {}

    def train(self, training_data):
        self.POS = list({tag for sentence in training_data for _, tag in sentence})
        self.vocab = unique_words(training_data)
        self.tag_count = tag_count(training_data)
        self.initial_prob = initial_prob(training_data)
        self.transition_prob = transition_prob(training_data, self.tag_count)
        self.emission_prob = emission_prob(training_data, self.tag_count, self.vocab)

    def generate_tag_sequence(self, current_sequence, index, sentence, all_sequences):
        if index == len(sentence):
            all_sequences.append(current_sequence[:])
            return
        for tag in self.POS:
            current_sequence.append(tag)
            self.generate_tag_sequence(current_sequence, index + 1, sentence, all_sequences)
            current_sequence.pop()

    def brute_force_decode(self, sentence):
        all_sequences = []
        self.generate_tag_sequence([], 0, sentence, all_sequences)
        best_tags = None
        best_prob = 0
        for tags in all_sequences:
            prob = self.initial_prob[tags[0]] * self.emission_prob[tags[0]].get(sentence[0], 1e-6)
            for i in range(1, len(sentence)):
                prev_tag = tags[i - 1]
                curr_tag = tags[i]
                word = sentence[i]
                prob *= self.transition_prob[prev_tag][curr_tag] * self.emission_prob[curr_tag].get(word, 1e-6)
            if prob > best_prob:
                best_prob = prob
                best_tags = tags
        return list(best_tags)


    def viterbi_decode(self, sentence):
        V = []
        backpointer = []

        first_probs = {}
        first_back = {}

        for tag in self.POS:
            emission = self.emission_prob[tag].get(sentence[0], 1e-6)
            first_probs[tag] = math.log(self.initial_prob[tag]) + math.log(emission)
            first_back[tag] = None

        V.append(first_probs)
        backpointer.append(first_back)

        for i in range(1, len(sentence)):
            current_probs = {}
            current_back = {}
            word = sentence[i]

            for tag in self.POS:
                max_prob = float("-inf")
                best_prev_tag = None

                for prev_tag in self.POS:
                    transition = self.transition_prob[prev_tag][tag]
                    emission = self.emission_prob[tag].get(word, 1e-6)

                    prob = V[i - 1][prev_tag] + math.log(transition) + math.log(emission)

                    if prob > max_prob:
                        max_prob = prob
                        best_prev_tag = prev_tag

                current_probs[tag] = max_prob
                current_back[tag] = best_prev_tag

            V.append(current_probs)
            backpointer.append(current_back)

        best_last_tag = None
        best_last_prob = float("-inf")

        for tag in self.POS:
            if V[-1][tag] > best_last_prob:
                best_last_prob = V[-1][tag]
                best_last_tag = tag

        best_tags = [best_last_tag]
        for i in range(len(sentence) - 1, 0, -1):
            best_tags.append(backpointer[i][best_tags[-1]])


        best_tags.reverse()
        return best_tags


In [19]:
tagger = HMMPOSTagger()
tagger.train(training_data)

In [21]:
test_sentences = [
    ["The", "dog", "barks"],
    ["A", "cat", "sleeps"],
    ["The", "big", "dog", "runs"],
    ["Dogs", "bark"],
    # New sentences
    ["Little", "bird", "sings"],
    ["The", "little", "boy", "runs", "quickly"],
    ["The", "sun", "shines"],
    ["A", "tiny", "mouse", "eats", "cheese"],
    ["The", "happy", "dog", "plays", "outside"],
    ["Cats", "jump", "high"],
    ["The", "girl", "reads", "a", "book"],
    ["A", "beautiful", "flower", "blooms", "today"]
]

for sentence in test_sentences:
    print("Sentence:", sentence)
    print("Brute force:", tagger.brute_force_decode(sentence))
    print("Viterbi:", tagger.viterbi_decode(sentence))
    print()

Sentence: ['The', 'dog', 'barks']
Brute force: ['DET', 'NOUN', 'VERB']
Viterbi: ['DET', 'NOUN', 'VERB']

Sentence: ['A', 'cat', 'sleeps']
Brute force: ['DET', 'NOUN', 'VERB']
Viterbi: ['DET', 'NOUN', 'VERB']

Sentence: ['The', 'big', 'dog', 'runs']
Brute force: ['DET', 'ADJ', 'NOUN', 'VERB']
Viterbi: ['DET', 'ADJ', 'NOUN', 'VERB']

Sentence: ['Dogs', 'bark']
Brute force: ['NOUN', 'VERB']
Viterbi: ['NOUN', 'VERB']

Sentence: ['Little', 'bird', 'sings']
Brute force: ['DET', 'NOUN', 'VERB']
Viterbi: ['DET', 'NOUN', 'VERB']

Sentence: ['The', 'little', 'boy', 'runs', 'quickly']
Brute force: ['DET', 'ADJ', 'NOUN', 'VERB', 'ADJ']
Viterbi: ['DET', 'ADJ', 'NOUN', 'VERB', 'ADJ']

Sentence: ['The', 'sun', 'shines']
Brute force: ['DET', 'NOUN', 'VERB']
Viterbi: ['DET', 'NOUN', 'VERB']

Sentence: ['A', 'tiny', 'mouse', 'eats', 'cheese']
Brute force: ['DET', 'NOUN', 'ADP', 'DET', 'NOUN']
Viterbi: ['DET', 'NOUN', 'ADP', 'DET', 'NOUN']

Sentence: ['The', 'happy', 'dog', 'plays', 'outside']
Brute forc