In [18]:
import os
import pickle
import random
import nltk
from nltk.tag import hmm
from nltk.probability import LidstoneProbDist
import dill
import sys
from google.colab import files

In [22]:
# Directory holding the pickled POS datasets. Defaults to the Colab working
# directory used originally; set the POS_DATA_DIR environment variable to run
# the notebook elsewhere without editing this cell.
DATA_DIR = os.environ.get("POS_DATA_DIR", "/content")

# Pickled train/test splits; each is loaded later with pickle.
train_path = os.path.join(DATA_DIR, "train_pos_data.pkl")
test_path = os.path.join(DATA_DIR, "test_pos_data.pkl")

In [45]:
def load_pickle_file(path):
    """Read one pickled object from ``path`` and return it.

    Args:
        path: Location of the pickle file; opened in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        pickle.UnpicklingError: If the contents are not a valid pickle.
    """
    # NOTE: unpickling can execute arbitrary code, so only point this at
    # files from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)

In [32]:
# Load both splits up front; if either file cannot be read, report the
# problem and stop the cell instead of continuing with missing data.
try:
    train_data = load_pickle_file(train_path)
    test_data = load_pickle_file(test_path)
except Exception as load_error:
    print("Error loading data:", load_error)
    sys.exit(1)

# Quick sanity summary: size of each split, then a peek at the first
# five (word, tag) pairs of the first training sentence.
summary_lines = (
    f"Total Training sentences in the file:{len(train_data)}",
    f"Total testing senetences within the file:{len(test_data)}",
    "Small set of training data with their tags:",
)
for summary_line in summary_lines:
    print(summary_line)

first_sentence = train_data[0]
print(first_sentence[:5])

Total Training sentences in the file:3131
Total testing senetences within the file:783
Small set of training data with their tags:
[('In', 'IN'), ('talks', 'NNS'), ('with', 'IN'), ('Mr.', 'NNP'), ('yoshoda', 'NNP')]


In [36]:
def laplace_estimator(freq_dist, bins):
    """Build an add-one smoothed probability distribution.

    Wraps ``LidstoneProbDist`` with gamma fixed at 1.0, so every bin gets
    one extra pseudo-count.

    Args:
        freq_dist: Frequency distribution to smooth.
        bins: Number of bins passed through to ``LidstoneProbDist``.

    Returns:
        The resulting ``LidstoneProbDist`` instance.
    """
    add_one_gamma = 1.0
    return LidstoneProbDist(freq_dist, gamma=add_one_gamma, bins=bins)

In [38]:
# Fit the HMM tagger on the labelled training sentences, smoothing its
# probability estimates with the add-one estimator defined above.
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(
    labelled_sequences=train_data,
    estimator=laplace_estimator,
)

In [46]:
# Token-level accuracy of the trained tagger on the held-out test sentences.
ttags = 0  # total number of gold tags seen
ctags = 0  # number of predicted tags that match the gold tag
for sent in test_data:
    words = [pair[0] for pair in sent]
    gold_tags = [pair[1] for pair in sent]
    predicted = [tag for (word, tag) in tagger.tag(words)]
    ttags += len(gold_tags)
    # Compare gold and predicted tags position by position.
    ctags += sum(1 for gold, guess in zip(gold_tags, predicted) if gold == guess)

# Use the counter actually maintained above (`ttags`); the earlier code
# referenced an undefined `total_tags`, which fails on a fresh kernel.
# The guard avoids a division by zero when the test set is empty.
if ttags > 0:
    accuracy = (ctags / ttags) * 100
else:
    accuracy = 0

print(f"Accuracy:{accuracy:.2f}%")
print(f"Total tokens evaluated:{ttags}")


Accuracy:82.64%
Total tokens evaluated:19969


In [44]:
# Pick one random test sentence and compare its gold tags with the
# tagger's predictions.
sample = random.choice(test_data)

# Split the (word, tag) pairs into parallel lists.
sentence_words = []
actual_tags = []
for entry in sample:
    sentence_words.append(entry[0])
    actual_tags.append(entry[1])

# Keep only the tag from each (word, tag) pair returned by the tagger.
predicted_tags = [tagged[1] for tagged in tagger.tag(sentence_words)]

print("Sample data for checking the accuracy:")
report_rows = (
    ("Words:", sentence_words),
    ("Actual Tags:", actual_tags),
    ("Predicted Tags:", predicted_tags),
)
for label, values in report_rows:
    print(label, values)

Sample data for checking the accuracy:
Words: ['Mr.', 'preeti', 'said', '0', 'he', 'will', 'stay', 'until', 'Dec.', '31', 'and', 'work', 'with', 'his', 'successor', ',', 'who', '*T*-36', 'is', '*-1', 'to', 'be', 'named', '*-43', 'soon', '.']
Actual Tags: ['NNP', 'NNP', 'VBD', '-NONE-', 'PRP', 'MD', 'VB', 'IN', 'NNP', 'CD', 'CC', 'VB', 'IN', 'PRP$', 'NN', ',', 'WP', '-NONE-', 'VBZ', '-NONE-', 'TO', 'VB', 'VBN', '-NONE-', 'RB', '.']
Predicted Tags: ['NNP', 'NNP', 'VBD', '-NONE-', 'PRP', 'MD', 'VB', 'IN', 'NNP', ',', 'CC', 'NN', 'IN', 'PRP$', 'NN', ',', 'WP', '-NONE-', 'VBZ', '-NONE-', 'TO', 'VB', 'VBN', '-NONE-', 'RB', '.']
