In [1]:
import pickle
from collections import defaultdict, Counter

In [10]:
train_path = "/content/train_pos_data.pkl"
test_path = "/content/test_pos_data.pkl"

with open(train_path, "rb") as f:
  train_data = pickle.load(f)
with open(test_path, "rb") as f:
  test_data = pickle.load(f)
  print("Total Training data:",len(train_data))
  print("Total Testing data",len(test_data))

Total Training data: 3131
Total Testing data 783


In [4]:
word_tag_counts = defaultdict(Counter)
for sent in train_data:
    for word, tag in sent:
        word_tag_counts[word.lower()][tag]+=1

In [6]:
rules = {
    "ing": "VBG",
    "ed": "VBD",
    "ly": "RB",
    "ion": "NN",
    "ity": "NN",
    "ous": "JJ",
    "ment": "NN",
    "ness": "NN",
    "ive": "JJ",
    "able": "JJ",
    "less": "JJ",
    "est": "JJS",
}


In [7]:
def rule_based_tagger(word):
    w = word.lower()
    if w in word_tag_counts:
        return word_tag_counts[w].most_common(1)[0][0]
    for end, tg in rules.items():
        if w.endswith(end):
            return tg
    if w[0].isupper():
        return "NNP"
    if w.isdigit():
        return "CD"
    return "NN"

In [8]:
total, correct = 0, 0
predicted_sentences = []
for sent in test_data:
    pred_tags = []
    for word, actual in sent:
        pred = rule_based_tagger(word)
        pred_tags.append((word, pred))
        if pred == actual:
            correct += 1
        total += 1
    predicted_sentences.append(pred_tags)

In [9]:
accuracy = (correct / total) * 100
print("Evaluating on test set...")
print("Token-level accuracy:", round(accuracy, 2), "%")
print("Total tokens evaluated:", total)

Evaluating on test set...
Token-level accuracy: 85.03 %
Total tokens evaluated: 19969
