In [4]:
import nltk

from nltk.corpus import brown
from collections import defaultdict, Counter

#Download the necessery resources 
nltk.download("brown")
nltk.download("universal_tagset")

#Load tagged senteences from the Brown Corpus
tagged_sentences = brown.tagged_sents(tagset = "universal")

#Split the data into training and testing sets

train_size = int(0.8 * len(tagged_sentences))
train_data = tagged_sentences[:train_size]
test_data = tagged_sentences[train_size:]


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\falou\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\falou\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


## Calculating Transition and Emission Probabilities

In [10]:
# Initialize counters for transitions and emissions

transition_counts = defaultdict(Counter)
emission_counts = defaultdict(Counter)
tag_counts = Counter()

# Count occurences for transitions and emissions
for sentence in train_data :
    prev_tag = "<START>"
    for word, tag in sentence :
        transition_counts[prev_tag][tag] += 1
        emission_counts[tag][word.lower()] += 1
        tag_counts[tag] += 1
        prev_tag = tag
    transition_counts[prev_tag]["<END>"] += 1


print(emission_counts)


# Calculate transitions probabilities

transition_prob = {
    tag: {next_tag: count / sum(transition_counts[tag].values())
          for next_tag, count in next_tags.items()}
    for tag, next_tags in transition_counts.items()
}

# Calculate emission probabilities
emission_prob = {
    tag: {word: count / sum(emission_counts[tag].values())
          for word, count in words.items()}
    for tag, words in emission_counts.items()
}





## Implementing the Viterbi Algorithm

In [8]:
def viterbi_algo(sentence, transition_prob, emission_prob, tags):
    V = [{}]
    backpointer = [{}]
    tags = list(tags)

    # Initialisation
    for tag in tags :
        V[0][tag] = transition_prob["<START>"].get(tag,0) * emission_prob[tag].get(sentence[0].lower(),0)
        backpointer[0][tag] = None
    
    # Recursion
    for t in range(1, len(sentence)):
        V.append({})
        backpointer.append({})
        for tag in tags :
            max_prob, best_prev_tag = max(
                ((V[t-1][prev_tag] * transition_prob[prev_tag].get(tag,0) *
                  emission_prob[tag].get(sentence[t].lower(), 0), prev_tag)
                for prev_tag in tags), default = (0,None)
            )
            V[t][tag] = max_prob
            backpointer[t][tag] = best_prev_tag
    
    # Termination
    max_prob, best_last_tag = max(
        ((V[len(sentence) -1][tag] * transition_prob[tag].get("<END>", 0), tag)
         for tag in tags), default=(0, None)
    )

    # Backtrace
    best_path = [best_last_tag]
    for t in range(len(sentence)-1, 0, -1):
        best_path.insert(0, backpointer[t][best_path[0]])
    
    return best_path


## Evaluating the model

In [12]:
def evaluate_model(test_data, transition_prob, emission_prob, tags):
    correct = total = 0
    for sentence in test_data:
        words, true_tags = zip(*sentence)
        predicted_tags = viterbi_algo(words, transition_prob, emission_prob, tags)
        correct += sum(p== t for p,t in zip(predicted_tags, true_tags))
        total += len(true_tags)
    return correct/total

tags = list(tag_counts.keys())

accuracy = evaluate_model(test_data, transition_prob, emission_prob, tags)
print(f"Model accuracy : {accuracy * 100:.2f}%")

Model accuracy : 64.14%


In [19]:
# Test the trained HMM on a custom sentence
sentence = ["Despite", "the", "rain", ",", "the", "children", "played", "outside"]

predicted_tags = viterbi_algo(sentence, transition_prob, emission_prob, tags)

print("Sentence:")
print(" ".join(sentence))
print("\nPredicted POS tags:")
print(" ".join(predicted_tags))

true_tags = ["ADP", "DET", "NOUN", ".", "DET", "NOUN", "VERB", "ADV"]
accuracy_sentence = sum(p == t for p, t in zip(predicted_tags, true_tags)) / len(sentence)
print(f"\nAccuracy for this sentence: {accuracy_sentence * 100:.2f}%")


Sentence:
Despite the rain , the children played outside

Predicted POS tags:
ADP DET NOUN . DET NOUN VERB ADV

Accuracy for this sentence: 100.00%
