In [2]:
import nltk
import numpy as np

In [3]:
total_docs = 100
total_relevant = 10

retrieved = np.array(['R', 'N', 'N', 'R', 'R', 'N', 'N', 'N',
             'R', 'N', 'R', 'N', 'N', 'R', 'R'])



In [4]:
precision = sum(retrieved == 'R') / len(retrieved)
recall = sum(retrieved == 'R') / 10
f1 = 2 * (precision * recall / (precision + recall))

corr_class = (total_docs - sum(retrieved == 'N')) / total_docs

print(f'Precision: {precision}, recall: {recall}, F1: {f1}, Accuracy score: {corr_class}')

Precision: 0.4666666666666667, recall: 0.7, F1: 0.56, Accuracy score: 0.92


For binary classification the classes should preferably be equal size in the dataset in order for accuracy to be a reasonable metric. In this case, predicting 'Not relevant' for every single document yields 99.9% accuracy, so whatever algorithm is tasked to learn to solve the problem will most likely only learn to classify everything as 'Not relevant' and not solve the actual classification problem.

In [31]:
from nltk.corpus import treebank
from nltk.tag.hmm import HiddenMarkovModelTagger
from ass5utils import split_corpus, tagset
from nltk.metrics import ConfusionMatrix

training_sents, test_sents = split_corpus(treebank, 0.8)

test_tokens = [t[0] for s in test_sents for t in s]
correct_tags = [t[1] for s in test_sents for t in s]

hmm_tagger = HiddenMarkovModelTagger.train(training_sents)
predicted_tags = [t[1] for t in hmm_tagger.tag(test_tokens)]


In [146]:
cm = ConfusionMatrix(reference=correct_tags, test=predicted_tags, sort_by_count=True)

corr = 0
total = 0
unique_tags = set(correct_tags)

for tag in unique_tags:
    for tag2 in unique_tags:
        if tag == tag2:
            corr += cm[tag, tag2]
        total += cm[tag, tag2]
        
acc = correct / total
true_pos = cm['NN', 'NN']
false_positive = sum(list(map(lambda tag: 0 if tag == 'NN' else cm[tag, 'NN'], unique_tags)))
all_nn = sum(list(map(lambda tag: cm['NN', tag], unique_tags)))
pre = true_pos / (true_pos + false_positive)
rec = true_pos / all_nn
f1 = 2 * pre * rec / (pre + rec)

print(f'Precision {pre} recall {rec} f1 {f1}, acc {acc}')

Precision 0.9460803059273423 recall 0.8528093760772147 f1 0.8970268310369833, acc 0.8745371775417299


In [147]:
print(cm.pretty_format(sort_by_count=True))

     |                                                                                                   P              N                                              |
     |              N         N              V                   V         V    V    P         P    V    R    W    J    N    J         W         R         P    R    W |
     |    N    I    N    D    N    J    C    B    V    R    T    B    C    B    B    R    M    O    B    P    D    J    P    J    R    R    W    B    E    D    B    P |
     |    N    N    P    T    S    J    D    D    B    B    O    N    C    Z    G    P    D    S    P    $    T    R    S    S    P    B    P    R    X    T    S    $ |
-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
  NN |<2474>  34   38   18   90  109   10    6   27    1    4   14    .    3   22    9    7   10    5    9    7    .    .    .    4    .    .    .    .    

In [151]:
from collections import Counter
def random_tag(tokens):
    return [tagset[np.random.randint(len(tagset))] for _ in tokens]

def majority_tag(tokens):
    counts = Counter()
    for tag in training_sents:
        for token in tag:
            counts[token[1]] += 1
    return [counts.most_common()[0][0]] * len(tokens)


def score(predicted_tags, name):
    acc = sum(np.array(predicted_tags) == np.array(correct_tags)) / len(predicted_tags)
    print(f'Accuracy for {name} is {acc}')
    return acc
    

In [152]:
score(majority_tag(test_tokens), 'Majority')
score(random_tag(test_tokens), 'Random')

Accuracy for Majority is 0.17608497723823976
Accuracy for Random is 0.028710166919575113


0.028710166919575113

In [129]:
test = list(zip(test_tokens, [None]*len(test_tokens)))
log = hmm_tagger.log_probability(test)

In [132]:
ppl = 2 ** (-log / len(test))
ppl

1266.8944112578856

In [142]:
s1 = ['RB', 'RB', 'NNP', 'NNP', 'VB', 'VB', 'TO', 'VB', 'NNP', 'IN', 'VB', 'IN', 'NN', 'NN', 'VB', 'VB', 'JJ']
s2 = ['RB', 'DT', 'JJ', 'NN', 'DT', 'VB', 'DT', 'JJ', 'NN', 'DT', 'JJ', 'VB', 'VB', 'TO', 'DT', 'NN']
s3 = ['DT', 'JJ', 'NN', 'VB', 'DT', 'DT', 'VB', 'VB', 'DT', 'NN', 'DT', 'VB', 'DT', 'JJ', 'NNP']

tags1 = ['IN', 'RB', 'NNP', 'NNP', 'VBZ', 'VBG', 'TO', 'VB', 'NNP', 'IN', 'VBG', 'DT', 'JJ', 'NN', 'IN', 'VBG', 'JJ']
tags2 = ['IN', 'DT', 'JJ', 'NN', 'EX', 'VBP', 'JJ', 'JJ', 'NNS', 'IN', 'NN', 'VBZ', 'VBG', 'TO', 'DT', 'NN']
tags3 = ['DT', 'JJ', 'NN', 'VBZ', 'IN', 'WP', 'MD', 'VB', 'DT', 'NN', 'CC', 'VB', 'PRP$', 'JJ', 'NNS']

me = np.array(s1 + s2 + s3)
gold = np.array(tags1 + tags2 + tags3)

assert(len(me) == len(gold))

acc = sum(me == gold) / len(me)

In [143]:
print(acc)

0.5


In [144]:
me

array(['RB', 'RB', 'NNP', 'NNP', 'VB', 'VB', 'TO', 'VB', 'NNP', 'IN',
       'VB', 'IN', 'NN', 'NN', 'VB', 'VB', 'JJ', 'RB', 'DT', 'JJ', 'NN',
       'DT', 'VB', 'DT', 'JJ', 'NN', 'DT', 'JJ', 'VB', 'VB', 'TO', 'DT',
       'NN', 'DT', 'JJ', 'NN', 'VB', 'DT', 'DT', 'VB', 'VB', 'DT', 'NN',
       'DT', 'VB', 'DT', 'JJ', 'NNP'], dtype='<U3')

In [145]:
gold

array(['IN', 'RB', 'NNP', 'NNP', 'VBZ', 'VBG', 'TO', 'VB', 'NNP', 'IN',
       'VBG', 'DT', 'JJ', 'NN', 'IN', 'VBG', 'JJ', 'IN', 'DT', 'JJ', 'NN',
       'EX', 'VBP', 'JJ', 'JJ', 'NNS', 'IN', 'NN', 'VBZ', 'VBG', 'TO',
       'DT', 'NN', 'DT', 'JJ', 'NN', 'VBZ', 'IN', 'WP', 'MD', 'VB', 'DT',
       'NN', 'CC', 'VB', 'PRP$', 'JJ', 'NNS'], dtype='<U4')