In [37]:
import re
import string
import math
from nltk.probability import (ConditionalFreqDist, ConditionalProbDist, LidstoneProbDist)
from nltk import pos_tag
# from classifiers.DecisionListClassifier import DecisionListClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.metrics import ConfusionMatrix
from collections import Counter
import re
import string
import random

In [38]:
def preprocessing(data):
    c = []
    stop_word_set = set(stopwords.words('english'))
    data = [l for l in data.split("\n") if len(l)]
    senses = []

    for line in data:

        line = line.strip()
        s = line.index(":")
        line = (line[:s], line[s:])
        line = (line[0], word_tokenize(line[1]))
        line = (line[0], [w for w in line[1] if w not in stop_word_set])
        line = (line[0], [re.sub('['+string.punctuation+']', '', w) for w in line[1]])
        line = (line[0], [w for w in line[1] if len(w) > 2])
        line = (line[0], [w.lower() for w in line[1]])

        senses.append(line[0])
        c.append(line)

    return c

In [39]:
class DecisionListClassifier(object):
   

    def __init__(self, target_word, sense_pairs, max_collocation=3):
        word_senses = list(set([w[0] for w in sense_pairs]))
        if len(word_senses) != 2:
            raise ValueError('There can only be 2 word senses')
        word_sense_count = 0
        word_sense_star_count = 0
        word_sense_majority = ''
        for w in sense_pairs:
            if w == word_senses[0]:
                word_sense_count +=1
            else:
                word_sense_star_count +=1

        if word_sense_count/(word_sense_star_count+word_sense_count) > \
            word_sense_star_count/(word_sense_star_count+word_sense_count) :
            word_sense_majority = word_senses[0]
        else:
            word_sense_majority = word_senses[1]
        self.word = target_word
        self.word_sense = word_senses[0]
        self.word_sense_star = word_senses[1]
        self.majority_word_sense = word_sense_majority
        dl = self.train(target_word, sense_pairs, max_collocation)
        self._decision_list = dl
        self.evaluation_result = {}
        
    def train(self, target_word, sense_pairs, max_collocation):
        freqs = ConditionalFreqDist()
        for pair in sense_pairs:
            for g in self.generate_collocations(target_word, pair[1], max_collocation):
                freqs[g][pair[0]] += 1
            pos = self.parts_of_speech(target_word, pair[1])
            for g in self.generate_collocations(target_word, pos, max_collocation):
                if g != self.word:
                    freqs[g][pair[0]] += 1
        probs = ConditionalProbDist(freqs, LidstoneProbDist, 0.2)
        dl = []
        for feature in probs.conditions():
            self.add_to_decision_list(dl, probs, feature)
        dl.sort(key=lambda r: r[2], reverse=True)
        return dl
        
        
    def predict(self, target_word, tokens, max_collocation=3):
        features = []
        features += self.generate_collocations(target_word, tokens, max_collocation)

        pos = self.parts_of_speech(target_word, tokens)
        for g in self.generate_collocations(target_word, pos, max_collocation):
            if g != target_word:
                features += [g]
        max_log = 0.0
        sense = self.majority_word_sense
        for f in features:
            for rule in self._decision_list:
                if rule[1] == f:
                    if rule[2] > max_log:
                        max_log = rule[2]
                        sense = rule[0]

        return sense
    
    def add_to_decision_list(self, dl, probs, feature):
        prob = probs[feature].prob(self.word_sense)
        prob_star = probs[feature].prob(self.word_sense_star)
        d = math.log(prob/prob_star)
        if d == 0:
            dl.append((self.majority_word_sense, feature, 0))
        else:
            sense = self.word if d > 0 else self.word_sense_star
            dl.append((sense, feature, abs(d)))

    def parts_of_speech(self, target_word, tokens):
        b = tokens.index(target_word)
        pos = [w[1] for w in pos_tag(tokens)]
        pos[b] = target_word
        return pos

    def generate_collocations(self, word, tokens, max_c):
        def ngrams_h(c, l, r, acc):
            if c == 0:
                return acc
            acc.append('_'.join(tokens[l:r]))
            return ngrams_h(c-1, l+1, r+1, acc)

        w = tokens.index(word) #ValueError
        ngrams = []
        for c in range(max_c, 0, -1):
            ngrams = ngrams_h(c+1, w-c, w+1, [])

        ngrams += [word]
        return ngrams


In [43]:
def predict(word, dlc, test_corpus): #, sample_size=3
    
        result = {}

        # do some baseline counting
        word_senses = list(set([w[0] for w in test_corpus]))
        word_sense, word_sense_star = word_senses[0], word_senses[1]
        word_sense_count = 0
        word_sense_star_count = 0
        word_sense_majority = ''
        for w in test_corpus:
            if w[0] == word_sense:
                word_sense_count +=1
            else:
                word_sense_star_count +=1

        if word_sense_count/(word_sense_star_count+word_sense_count) > \
            word_sense_star_count/(word_sense_star_count+word_sense_count) :
            word_sense_majority = word_senses[0]
        else:
            word_sense_majority = word_senses[1]

        # baseline testing
        baseline_correct = 0
        result["majority_baseline"] = word_sense_majority
        for row in test_corpus:
            if row[0] == word_sense_majority:
                baseline_correct += 1
        result["majority_baseline_percent_correct"] = round(baseline_correct/len(test_corpus)*100, 2)

        # analyze using model prediction
        result["correct_count"], result["incorrect_count"] = 0, 0
        guesses = [] # what we guessed
        actual = [] # what acutally was
        correctly_guessed = []
        incorrectly_guessed = []
        for row in test_corpus:
            g = dlc.predict(word, row[1])
            guesses.append(g)
            actual.append(row[0])
            if g == row[0]:
                result["correct_count"] += 1
                correctly_guessed.append(row[1])
            else:
                result["incorrect_count"] += 1
                incorrectly_guessed.append(row[1])

#         result["correct_guess_sample"] = random.sample(correctly_guessed, sample_size)
#         result["incorrect_guess_sample"] = random.sample(incorrectly_guessed, sample_size)
        result["percent_correct"] = round((result["correct_count"]/len(test_corpus))*100, 2)

        # confusion
        result["confusion_matrix"] = ConfusionMatrix(actual, guesses)

        # calculate true/false_positive/negatives for both senses
        true_pos = Counter()
        false_neg = Counter()
        false_pos = Counter()
        for i in [word_sense, word_sense_star]:
            for j in [word_sense, word_sense_star]:
                if i == j:
                    true_pos[i] += result["confusion_matrix"][i,j]
                else:
                    false_neg[i] += result["confusion_matrix"][i,j]
                    false_pos[j] += result["confusion_matrix"][i,j]

        if float(true_pos[word_sense_star]+false_pos[word_sense_star]) == 0:
            result["precision_word_star"] = 0
        else:
            result["precision_word_star"] = true_pos[word_sense_star] / float(true_pos[word_sense_star]+false_pos[word_sense_star])

        # recall
        result["recall_word"] = true_pos[word_sense] / float(true_pos[word_sense]+false_neg[word_sense])
        result["recall_word_star"] = true_pos[word_sense_star] / float(true_pos[word_sense_star]+false_neg[word_sense_star])

        # macros
        result["macro_precision"] = (float(result["recall_word"]) + float(result["recall_word_star"])) / 2.0
        result["macro_recall"] = (float(result["recall_word"]) + float(result["recall_word_star"])) / 2.0

        result["word_sense"] = word_sense
        result["word_sense_star"] = word_sense_star

#         return result

        for i in result:
            print(i, ":  '", result[i], "'")
    #     return result

In [44]:
train = preprocessing(open('./bass.trn', 'r').read())
test = preprocessing(open('./bass.tst', 'r').read())

clf = DecisionListClassifier('bass', train)
predictions = predict('bass', clf, test)
print(predictions)

majority_baseline :  ' bass '
majority_baseline_percent_correct :  ' 56.0 '
correct_count :  ' 56 '
incorrect_count :  ' 44 '
percent_correct :  ' 56.0 '
confusion_matrix :  '       |  *    |
      |  b  b |
      |  a  a |
      |  s  s |
      |  s  s |
------+-------+
*bass | <.>44 |
 bass |  .<56>|
------+-------+
(row = reference; col = test)
 '
precision_word_star :  ' 0.56 '
recall_word :  ' 0.0 '
recall_word_star :  ' 1.0 '
macro_precision :  ' 0.5 '
macro_recall :  ' 0.5 '
word_sense :  ' *bass '
word_sense_star :  ' bass '
None


In [45]:
train = preprocessing(open('./sake.trn', 'r').read())
test = preprocessing(open('./sake.tst', 'r').read())

clf = DecisionListClassifier('sake', train)
predictions = predict('sake', clf, test)
print(predictions)

majority_baseline :  ' sake '
majority_baseline_percent_correct :  ' 94.0 '
correct_count :  ' 94 '
incorrect_count :  ' 6 '
percent_correct :  ' 94.0 '
confusion_matrix :  '       |  *    |
      |  s  s |
      |  a  a |
      |  k  k |
      |  e  e |
------+-------+
*sake | <.> 6 |
 sake |  .<94>|
------+-------+
(row = reference; col = test)
 '
precision_word_star :  ' 0.94 '
recall_word :  ' 0.0 '
recall_word_star :  ' 1.0 '
macro_precision :  ' 0.5 '
macro_recall :  ' 0.5 '
word_sense :  ' *sake '
word_sense_star :  ' sake '
None
