# Naive Bayes for WSD

Musat Bianca-Stefania

407 Artificial Intelligence

In [None]:
import nltk
nltk.download('senseval')
from nltk.corpus import senseval
import nltk
import random
from nltk.classify import accuracy
from collections import defaultdict
from sklearn.model_selection import train_test_split
from math import log

nltk.download('stopwords')
STOPWORDS_SET = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package senseval to /root/nltk_data...
[nltk_data]   Package senseval is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def extract_vocab_frequency(instances, stopwords=STOPWORDS_SET, n=300):
    """
    Given a list of senseval instances, return a list of the n most frequent words that
    appears in its context (i.e., the sentence with the target word in), output is in order
    of frequency and includes also the number of instances in which that key appears in the
    context of instances.
 
    Params
    ------
    instances: The instances of a word in Senseval2
    stopwords: list of stop words
    n: number of most frequent words we want to include in the vocabulary

    Return
    ------
    A dictionarry containing the words in the vocabulary and their frequences

    Source: Laboratory 6, NLP
    """

    fd = nltk.FreqDist()
    for i in instances:
        (target, suffix) = i.word.split('-')
        words = (c[0] for c in i.context if not c[0] == target)
        for word in set(words) - set(stopwords):
            fd[word] += 1
    return fd.most_common()[:n+1]

In [None]:
# find what part of speech can be the words we are interested in disambiguating in senseval 2
insts = [senseval.instances('line.pos'), senseval.instances('hard.pos'), senseval.instances('serve.pos'), senseval.instances('interest.pos')]
poss = []
for inst in insts:
    for i in inst:
        poss.append(i.context[i.position][1])
print(set(list(poss)))

{'VBG', 'VBZ', 'VBD', 'NNS', 'JJ', 'VB', 'NN'}


In [None]:
def extract_vocab(instances, stopwords=STOPWORDS_SET, n=300):
    """
    Given a list of senseval instances, return a list of the n most frequent words that
    appears in its context.
 
    Params
    ------
    instances: The instances of a word in Senseval2
    stopwords: list of stop words
    n: number of most frequent words we want to include in the vocabulary

    Return
    ------
    A list containing the words in the vocabulary

    Source: Laboratory 6, NLP
    """
    
    pos_list = ['VBG', 'VBZ', 'VBD', 'NNS', 'JJ', 'VB', 'NN']
    return [w for w,f in extract_vocab_frequency(instances,stopwords,n)] + pos_list

In [None]:
def wsd_context_features(instance, vocab, dist=3):
    """
    Given a senseval instance, a vocabulary and a window size return the context features associated with that instance.
 
    Params
    ------
    instance: An instance from Senseval2
    vocab: list of words in the vocabulary
    dist: window size

    Return
    ------
    A dictionary of features

    Source: Adapted from Laboratory 6, NLP
    """

    features = {}
    ind = instance.position
    con = instance.context
    for i in range(max(0, ind-dist), ind):
        j = ind-i
        if con[i][0] in vocab:
            features['left-context-word-%s(%s)' % (j, con[i][0])] = features.get('left-context-word-%s(%s)' % (j, con[i][0]), 0) + 1

    for i in range(ind+1, min(ind+dist+1, len(con))):
        j = i-ind
        if con[i][0] in vocab:
            features['right-context-word-%s(%s)' % (j, con[i][0])] = features.get('right-context-word-%s(%s)' % (j, con[i][0]), 0) + 1

    features[con[ind][1]] = 1 # add the pos of the instance word to the features

    return features

In [None]:
def wsd_word_features(instance, vocab, dist=3):
    """
    Given a senseval instance, a vocabulary and a window size return the word features associated with that instance.
 
    Params
    ------
    instance: An instance from Senseval2
    vocab: list of words in the vocabulary
    dist: window size

    Return
    ------
    A dictionary of features

    Source: Adapted from Laboratory 6, NLP
    """
    features = {}
    ind = instance.position
    con = instance.context
    for i in range(max(0, ind-dist), ind):
        j = ind-i
        if con[i][0] in vocab:
            features[con[i][0]] = features.get(con[i][0], 0) + 1

    for i in range(ind+1, min(ind+dist+1, len(con))):
        j = i-ind
        if con[i][0] in vocab:
            features[con[i][0]] = features.get(con[i][0], 0) + 1

    return features

In [None]:
def feature_prob_given_sense(context, instances, features_func, sense, vocab, dist=3):
    """
    Given a senseval instance (context), a set of senseval instances (a corpus), a function that computes the features,
    a sense, a vocabulary and a window size return the probability of that sense given the context.
 
    Params
    ------
    context: an instance from Senseval2
    instances: A list of instances from Senseval2
    features_func: the function that computes the features
    sense: the sens we are interested in
    vocab: list of words in the vocabulary
    dist: window size

    Return
    ------
    The probability of the given sense given the context
    """

    features = [features_func(instance, vocab, dist) for instance in instances if instance.senses[0]==sense]
    features_contx = features_func(context, vocab, dist)
    sum_of_features_contx_occ = 0
    for feat in features_contx:
        sum = 0
        for dict_f in features:
            if feat in dict_f:
                sum += dict_f[feat]
        sum_of_features_contx_occ += sum
    if sum_of_features_contx_occ == 0:
        return 0
    prod = 1
    for feat in features_contx:
        sum = 0
        for dict_f in features:
            if feat in dict_f:
                sum += dict_f[feat]
        prod *= pow((sum / sum_of_features_contx_occ), features_contx[feat])
    return prod

In [None]:
def sense_probability(instances, sense):
    """
    Computes the probability of a sense, given the instances of a word

    Params
    ------
    instances: The instances of a word in Senseval2
    sense: The sense we are interested in

    Return
    ------
    The probability of the given sense
    """

    sense_len = len([instance for instance in instances if instance.senses[0]==sense])
    senses_len = len(instances)
    return sense_len / senses_len

In [None]:
def wsd_classifier(word, features_func, stopwords_list = STOPWORDS_SET):
    """
    Naive bayes classifier for word sense disambiguation. Given a word, the algorithm takes all the instances
    of that word in Senseval 2 and creates 3 different sets: a train dataset containing 60% of the instances,
    a test dataset containing 20% of the instances and a validation dataset containing the rest 20% of the instances.
    We use the validation dataset to pick the optimum values for the vocabulary size and window size. The vocabulary
    size is used to pick the most common N words in the training instances (note that we eliminate stop words from
    the vocabulary). The window size is used to search in a particular vecinity in the context of the word we are interested
    in disambiguating. The alogorithm uses a Naive Bayes classifier to pick the sense that maximize the probability.
    After finding the optimum parameters, we apply the Bayes Classifier on the test data and output the accuracy and
    the confusion matrix.

    Params
    ------
    word: The word we want to disambiguate
    features_func: The function used for feature extraction
    stopwords_list: The list of stop words we want to exclude from the vocabulary
    """

    # extract the data from the Senseval 2 dataset
    events = [(i, i.senses[0]) for i in senseval.instances(word)][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    instance_labels = [l for (i, l) in events]
    print(' Senses: ' + ' '.join(senses))

    # split data into train, test and validation
    X_train, X_test, y_train, y_test = train_test_split(instances, instance_labels, test_size=0.2, random_state=13)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

    # the vocabulary size and window size values we try at validation
    valid_values= {(200, 2):0, (300, 2):0, (300, 3):0, (300, 5):0, (500, 2):0, (500, 3):0, (500, 5):0}

    # compute the optimum vocabulary size and window size
    for (voc_sz, win_sz) in valid_values:

        vocab = extract_vocab(instances, stopwords=stopwords_list, n=voc_sz)

        acc = 0
        for i, context in enumerate(X_val):
            senses_dict = {s:0 for s in senses}
            for sense in senses:
                senses_dict[sense] = log(sense_probability(X_train, sense) + 0.0001) + log(feature_prob_given_sense(context, X_train, features_func, sense, vocab, win_sz) + 0.0001)
            max_elem = max(senses_dict, key=senses_dict.get)
            if y_val[i] == max_elem:
                acc += 1
        valid_values[(voc_sz, win_sz)] = acc / len(X_val)
        print("For a vocabulary size of", voc_sz, "and a window size of", win_sz, "we have the accuracy on validation data:", acc / len(X_val))

    (best_voc_sz, best_win_sz) = max(valid_values, key=valid_values.get)

    # compute the accuracy and confusion matrix on test data, using the optimum vocabulary size and window size
    vocab = extract_vocab(instances, stopwords=stopwords_list, n=best_voc_sz)
    y_pred = []
    acc = 0
    for i, context in enumerate(X_test):
        senses_dict = {s:0 for s in senses}
        for sense in senses:
            senses_dict[sense] = log(sense_probability(X_train, sense) + 0.0001) + log(feature_prob_given_sense(context, X_train, features_func, sense, vocab, best_win_sz) + 0.0001)
        max_elem = max(senses_dict, key=senses_dict.get)
        y_pred.append(max_elem)
        if y_test[i] == max_elem:
            acc += 1
    print("\nAccuracy on test is: ", acc / len(X_test))

    print("\nThe confusion matrix on test data:\n")
    cm = nltk.ConfusionMatrix(y_test, y_pred)
    print(cm)

    return X_test, y_test, y_pred

In [None]:
def analyze_context(X_test, y_test, y_pred, sense, no_ex=10):
    """
    Given a senseval instance set (the test set), the true labels (senses), the predicted lables,
    a sense to analyse, and the number of examples we want to analyze, print the context of false
    positive examples of that sense.
 
    Params
    ------
    X_test: A list of instances (test set) from Senseval2
    y_test: list of true labels (senses)
    y_pred: list of predicted labels (senses)
    sense: The sense we are interested in
    no_ex: The number of examples we want to analyze

    Source: Adapted from Laboratory 6, NLP
    """
    true_pos_indx = 2
    indx = no_ex
    print("False positive examples:")
    for i, inst in enumerate(X_test):
        if y_pred[i] == sense and y_test[i] != sense and indx > 0:
            indx -= 1
            p = inst.position
            left = ' '.join(w for (w,t) in inst.context[p-3:p])
            word = ' '.join(w for (w,t) in inst.context[p:p+1])
            right = ' '.join(w for (w,t) in inst.context[p+1:p+4])
            senses = ' '.join(inst.senses)
            print('%30s |%10s | %-30s -> %s' % (left, word, right, senses))
    print("True positive examples:")
    for i, inst in enumerate(X_test):
        if y_pred[i] == sense and y_test[i] == sense and true_pos_indx > 0:
            true_pos_indx -= 1
            p = inst.position
            left = ' '.join(w for (w,t) in inst.context[p-3:p])
            word = ' '.join(w for (w,t) in inst.context[p:p+1])
            right = ' '.join(w for (w,t) in inst.context[p+1:p+4])
            senses = ' '.join(inst.senses)
            print('%30s |%10s | %-30s -> %s' % (left, word, right, senses))

# Word Features
We train the Naive Bayes classifier with word features, consisting of those words from the context of an instance that appear in the vocabulary and how many times they appear in the context.

In [None]:
X_test1, y_test1, y_pred1 = wsd_classifier('hard.pos', wsd_word_features)

 Senses: HARD3 HARD1 HARD2
For a vocabulary size of 200 and a window size of 2 we have the accuracy on validation data: 0.7946943483275664
For a vocabulary size of 300 and a window size of 2 we have the accuracy on validation data: 0.7946943483275664
For a vocabulary size of 300 and a window size of 3 we have the accuracy on validation data: 0.7785467128027682
For a vocabulary size of 300 and a window size of 5 we have the accuracy on validation data: 0.7831603229527105
For a vocabulary size of 500 and a window size of 2 we have the accuracy on validation data: 0.7946943483275664
For a vocabulary size of 500 and a window size of 3 we have the accuracy on validation data: 0.7785467128027682
For a vocabulary size of 500 and a window size of 5 we have the accuracy on validation data: 0.7843137254901961

Accuracy on test is:  0.825836216839677

The confusion matrix on test data:

      |   H   H   H |
      |   A   A   A |
      |   R   R   R |
      |   D   D   D |
      |   1   2   3 |
-

In [None]:
analyze_context(X_test1, y_test1, y_pred1, 'HARD1')

False positive examples:
                     are a lot |    harder | in oakland than                -> HARD3
                  came from `` |      hard | '' assets (                    -> HARD2
                               |      hard | work .                         -> HARD2
              of diligence and |      hard | work on their                  -> HARD2
          de-thatch and aerate |      hard | packed turf areas              -> HARD3
      age-changing voice sound |      hard | , and it                       -> HARD2
                silky hair and |      hard | bone .                         -> HARD3
                cigarette in a |      hard | pack ; continues               -> HARD3
                 very dark and |      hard | ferruginous sandstone ,        -> HARD3
                    rock and a |      hard | place . ''                     -> HARD3
True positive examples:
            triathlete was the |   hardest | .                              -> HARD1
               c

In [None]:
X_test2, y_test2, y_pred2 = wsd_classifier('line.pos', wsd_word_features)

 Senses: cord product phone division formation text
For a vocabulary size of 200 and a window size of 2 we have the accuracy on validation data: 0.5838359469240049
For a vocabulary size of 300 and a window size of 2 we have the accuracy on validation data: 0.5958986731001207
For a vocabulary size of 300 and a window size of 3 we have the accuracy on validation data: 0.5910735826296744
For a vocabulary size of 300 and a window size of 5 we have the accuracy on validation data: 0.6067551266586249
For a vocabulary size of 500 and a window size of 2 we have the accuracy on validation data: 0.5983112183353438
For a vocabulary size of 500 and a window size of 3 we have the accuracy on validation data: 0.5922798552472859
For a vocabulary size of 500 and a window size of 5 we have the accuracy on validation data: 0.6115802171290712

Accuracy on test is:  0.5686746987951807

The confusion matrix on test data:

          |           f             |
          |       d   o             |
         

In [None]:
analyze_context(X_test2, y_test2, y_pred2, 'product')

False positive examples:
        polarized along racial |     lines | , black voters                 -> division
                 has drawn the |      line | : the back-pay                 -> division
                guests wait in |      line | for elevators and              -> formation
              the subscriber - |      line | charge , also                  -> phone
dedicated telephone communications |     lines | currently exist between        -> phone
                   to draw the |      line | ?                              -> division
           company 's customer |     lines | increased only 38              -> phone
                     to keep a |      line | open for a                     -> phone
              breeze on myriad |     lines | high above your                -> cord
                has 10 million |     lines | in service ,                   -> phone
True positive examples:
               its barbie doll |      line | , hot wheels                   -> produc

In [None]:
X_test3, y_test3, y_pred3 = wsd_classifier('serve.pos', wsd_word_features)

 Senses: SERVE6 SERVE12 SERVE2 SERVE10
For a vocabulary size of 200 and a window size of 2 we have the accuracy on validation data: 0.4942922374429224
For a vocabulary size of 300 and a window size of 2 we have the accuracy on validation data: 0.519406392694064
For a vocabulary size of 300 and a window size of 3 we have the accuracy on validation data: 0.5981735159817352
For a vocabulary size of 300 and a window size of 5 we have the accuracy on validation data: 0.6301369863013698
For a vocabulary size of 500 and a window size of 2 we have the accuracy on validation data: 0.5468036529680366
For a vocabulary size of 500 and a window size of 3 we have the accuracy on validation data: 0.6198630136986302
For a vocabulary size of 500 and a window size of 5 we have the accuracy on validation data: 0.6301369863013698

Accuracy on test is:  0.6084474885844748

The confusion matrix on test data:

        |   S   S         |
        |   E   E   S   S |
        |   R   R   E   E |
        |   V  

In [None]:
analyze_context(X_test3, y_test3, y_pred3, 'SERVE10')

False positive examples:
               risks which had |    served | as justification for           -> SERVE2
               actor before he |    served | as mexican ambassador          -> SERVE12
                 that the jews |     serve | as a "                         -> SERVE2
           growing wariness of |   serving | on boards of                   -> SERVE12
                      . he had |    served | in two world                   -> SERVE12
              the islands thus |     serve | only as an                     -> SERVE2
           vice chairman after |   serving | as president and               -> SERVE12
              these too hardly |     serve | for either physical            -> SERVE2
                officer , will |     serve | as acting executive            -> SERVE12
                states , which |    serves | a wide area                    -> SERVE6
True positive examples:
                 olive oil and |     serve | with a hot                     -> SERVE10

In [None]:
X_test4, y_test4, y_pred4 = wsd_classifier('interest.pos', wsd_word_features)

 Senses: interest_5 interest_2 interest_3 interest_1 interest_4 interest_6
For a vocabulary size of 200 and a window size of 2 we have the accuracy on validation data: 0.6582278481012658
For a vocabulary size of 300 and a window size of 2 we have the accuracy on validation data: 0.6708860759493671
For a vocabulary size of 300 and a window size of 3 we have the accuracy on validation data: 0.6856540084388185
For a vocabulary size of 300 and a window size of 5 we have the accuracy on validation data: 0.6265822784810127
For a vocabulary size of 500 and a window size of 2 we have the accuracy on validation data: 0.6919831223628692
For a vocabulary size of 500 and a window size of 3 we have the accuracy on validation data: 0.6940928270042194
For a vocabulary size of 500 and a window size of 5 we have the accuracy on validation data: 0.6075949367088608

Accuracy on test is:  0.7130801687763713

The confusion matrix on test data:

           |   i   i   i   i   i   i |
           |   n   n   

In [None]:
analyze_context(X_test4, y_test4, y_pred4, 'interest_6')

False positive examples:
               , suggesting an |  interest | in looking for                 -> interest_1
                      the 49 % |  interest | owned by coca-cola             -> interest_5
                    `` now the |  interest | is in what                     -> interest_1
                  takes a keen |  interest | in monetary matters            -> interest_1
                   15 % voting |  interest | in united ,                    -> interest_5
                     's in the |  interest | of the self-regulator          -> interest_4
               protect its own | interests | as a shareholder               -> interest_4
                  have a great |  interest | in making investments          -> interest_1
              that serve those | interests | .                              -> interest_3
               addition to its | interests | in las vegas                   -> interest_5
True positive examples:
                firm 's annual |  interest | paymen

# Context Features
We train the Naive Bayes classifier with context features, consisting of those words and their context (their location in the context) from the context of an instance that appear in the vocabulary. Also, we add the part of speech of the word we want to disambiguate in the features list.

In [None]:
X_test1, y_test1, y_pred1 = wsd_classifier('hard.pos', wsd_context_features)

 Senses: HARD3 HARD1 HARD2
For a vocabulary size of 200 and a window size of 2 we have the accuracy on validation data: 0.8477508650519031
For a vocabulary size of 300 and a window size of 2 we have the accuracy on validation data: 0.8512110726643599
For a vocabulary size of 300 and a window size of 3 we have the accuracy on validation data: 0.831603229527105
For a vocabulary size of 300 and a window size of 5 we have the accuracy on validation data: 0.8200692041522492
For a vocabulary size of 500 and a window size of 2 we have the accuracy on validation data: 0.8558246828143022
For a vocabulary size of 500 and a window size of 3 we have the accuracy on validation data: 0.839677047289504
For a vocabulary size of 500 and a window size of 5 we have the accuracy on validation data: 0.8154555940023068

Accuracy on test is:  0.8650519031141869

The confusion matrix on test data:

      |   H   H   H |
      |   A   A   A |
      |   R   R   R |
      |   D   D   D |
      |   1   2   3 |
--

In [None]:
analyze_context(X_test1, y_test1, y_pred1, 'HARD1')

False positive examples:
                     are a lot |    harder | in oakland than                -> HARD3
                  came from `` |      hard | '' assets (                    -> HARD2
          de-thatch and aerate |      hard | packed turf areas              -> HARD3
      age-changing voice sound |      hard | , and it                       -> HARD2
                silky hair and |      hard | bone .                         -> HARD3
                               |      hard | feelings ; james               -> HARD2
                 very dark and |      hard | ferruginous sandstone ,        -> HARD3
                 consul with a |      hard | top - like                     -> HARD3
                      a long , |      hard | look into themselves           -> HARD2
                      a long , |      hard | look at how                    -> HARD2
True positive examples:
            triathlete was the |   hardest | .                              -> HARD1
               c

In [None]:
X_test2, y_test2, y_pred2 = wsd_classifier('line.pos', wsd_context_features)

 Senses: cord product phone division formation text
For a vocabulary size of 200 and a window size of 2 we have the accuracy on validation data: 0.594692400482509
For a vocabulary size of 300 and a window size of 2 we have the accuracy on validation data: 0.609167671893848
For a vocabulary size of 300 and a window size of 3 we have the accuracy on validation data: 0.6079613992762364
For a vocabulary size of 300 and a window size of 5 we have the accuracy on validation data: 0.5910735826296744
For a vocabulary size of 500 and a window size of 2 we have the accuracy on validation data: 0.617611580217129
For a vocabulary size of 500 and a window size of 3 we have the accuracy on validation data: 0.6127864897466827
For a vocabulary size of 500 and a window size of 5 we have the accuracy on validation data: 0.586248492159228

Accuracy on test is:  0.5963855421686747

The confusion matrix on test data:

          |           f             |
          |       d   o             |
          |  

In [None]:
analyze_context(X_test2, y_test2, y_pred2, 'product')

False positive examples:
        polarized along racial |     lines | , black voters                 -> division
                guests wait in |      line | for elevators and              -> formation
              the subscriber - |      line | charge , also                  -> phone
dedicated telephone communications |     lines | currently exist between        -> phone
                  of many long |     lines | is at the                      -> formation
              racial and class |     lines | , especially if                -> division
           company 's customer |     lines | increased only 38              -> phone
              agency draws the |      line | .                              -> division
              breeze on myriad |     lines | high above your                -> cord
                has 10 million |     lines | in service ,                   -> phone
True positive examples:
               its barbie doll |      line | , hot wheels                   -> pr

In [None]:
X_test3, y_test3, y_pred3 = wsd_classifier('serve.pos', wsd_context_features)

 Senses: SERVE6 SERVE12 SERVE2 SERVE10
For a vocabulary size of 200 and a window size of 2 we have the accuracy on validation data: 0.5639269406392694
For a vocabulary size of 300 and a window size of 2 we have the accuracy on validation data: 0.5776255707762558
For a vocabulary size of 300 and a window size of 3 we have the accuracy on validation data: 0.591324200913242
For a vocabulary size of 300 and a window size of 5 we have the accuracy on validation data: 0.5182648401826484
For a vocabulary size of 500 and a window size of 2 we have the accuracy on validation data: 0.593607305936073
For a vocabulary size of 500 and a window size of 3 we have the accuracy on validation data: 0.591324200913242
For a vocabulary size of 500 and a window size of 5 we have the accuracy on validation data: 0.5

Accuracy on test is:  0.6015981735159818

The confusion matrix on test data:

        |   S   S         |
        |   E   E   S   S |
        |   R   R   E   E |
        |   V   V   R   R |
    

In [None]:
analyze_context(X_test3, y_test3, y_pred3, 'SERVE10')

False positive examples:
               risks which had |    served | as justification for           -> SERVE2
          the electric utility |   serving | eastern massachusetts said     -> SERVE6
               actor before he |    served | as mexican ambassador          -> SERVE12
                 that the jews |     serve | as a "                         -> SERVE2
           growing wariness of |   serving | on boards of                   -> SERVE12
                      . he had |    served | in two world                   -> SERVE12
              the islands thus |     serve | only as an                     -> SERVE2
                    . his wife |    served | on the boards                  -> SERVE12
                  of cyprus to |     serve | as a transfer                  -> SERVE2
              driving trucks , |   serving | as floorwalkers in             -> SERVE12
True positive examples:
                 olive oil and |     serve | with a hot                     -> SERVE10

In [None]:
X_test4, y_test4, y_pred4 = wsd_classifier('interest.pos', wsd_context_features)

 Senses: interest_5 interest_2 interest_3 interest_1 interest_4 interest_6
For a vocabulary size of 200 and a window size of 2 we have the accuracy on validation data: 0.7088607594936709
For a vocabulary size of 300 and a window size of 2 we have the accuracy on validation data: 0.7130801687763713
For a vocabulary size of 300 and a window size of 3 we have the accuracy on validation data: 0.6708860759493671
For a vocabulary size of 300 and a window size of 5 we have the accuracy on validation data: 0.5822784810126582
For a vocabulary size of 500 and a window size of 2 we have the accuracy on validation data: 0.7151898734177216
For a vocabulary size of 500 and a window size of 3 we have the accuracy on validation data: 0.6329113924050633
For a vocabulary size of 500 and a window size of 5 we have the accuracy on validation data: 0.5590717299578059

Accuracy on test is:  0.7151898734177216

The confusion matrix on test data:

           |   i   i   i   i   i   i |
           |   n   n   

In [None]:
analyze_context(X_test4, y_test4, y_pred4, 'interest_6')

False positive examples:
               , suggesting an |  interest | in looking for                 -> interest_1
                    `` now the |  interest | is in what                     -> interest_1
                  takes a keen |  interest | in monetary matters            -> interest_1
                   15 % voting |  interest | in united ,                    -> interest_5
                     's in the |  interest | of the self-regulator          -> interest_4
                't mention his |  interest | in horse racing                -> interest_1
            italy plus certain | interests | in ecuador ,                   -> interest_5
                         . 4 % |  interest | in the company                 -> interest_5
                       a 100 % |  interest | in the well                    -> interest_5
                    not in the |  interest | of the public                  -> interest_4
True positive examples:
                firm 's annual |  interest | paymen

## Observations:
The context features perform better than the word features, proving that specifing the exact position of a word in the context alongside the part of speech of the word to disambiguate is helpful for the Naive Bayes Classifier.

All classifiers seem to have a bias towards the most common sense in the dataset. This is expected as the probability of a sense is part of the Naive Bayes formula when computing the most probable sense given a word to disambiguate and a context.