In [1]:
import nltk
import numpy as np

from collections import deque
from sklearn.linear_model import LogisticRegression
from gensim.models import KeyedVectors

In [2]:
# Load vectors directly from the file
word_embeddings = KeyedVectors.load_word2vec_format('./data/news_word_embeddings.bin', binary = True)

In [3]:
nltk.download('treebank')
nltk.download('tagsets')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\DomainFlag\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\DomainFlag\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [4]:
def extract_data(ratio = 0.9):
    assert (ratio >= 0 and ratio <= 1.0)
    
    # treebank corpus
    file_names = nltk.corpus.treebank.fileids()
    
    # extract tagged words
    tagged_words = []
    for file_name in file_names:
        tagged_words += list(nltk.corpus.treebank.tagged_words(file_name))
        
    # data offset
    offset = int(len(tagged_words) * ratio)
        
    return (tagged_words[:offset], tagged_words[offset:])

In [5]:
train_data, valid_data = extract_data()

In [6]:
class Base:
    """ Base model class """
    
    def __init__(self):
        pass

In [7]:
class HMM:
    """ HMM model based on Search Beam & Viterbi algorithm """
    
    def __init__(self, data):
        
        self.train_data = data
        
        # initialize the model
        self.init(data)
    
    def init(self, data):
                
        # unpack word tokens and pos labels
        tokens, labels = zip(*data)
        
        # our vocabulary
        self.vocabulary, self.labels = set(tokens), set(labels)
        self.label_types = list(self.labels)
        
        # freq token <-> pos label
        self.freq_tokens = nltk.FreqDist(data)
        
        # create 2-gram generator
        grams = nltk.bigrams(labels)

        # create 1-gram freq and 2-gram freq for labels
        self.freq_gram_labels = (nltk.FreqDist(labels), nltk.FreqDist(grams))
    
    def get_seq_prob(self, word, prev_target, target):
        
        # case if the target word is unknown
        word_likelihood = 1.0 if word not in self.vocabulary else self.freq_tokens[(word, target)]
            
        return self.freq_gram_labels[1][(prev_target, target)] * word_likelihood / self.freq_gram_labels[0][prev_target] ** 2
        
    def forward(self, features, window_size = None):
        
        # avoid prob underflow
        if window_size is None:
            window_size = len(features)
        
        # sequence size
        seq_size, label_size = len(features), len(self.label_types)
        
        # memoization of max prob for each t timestamp and indexing for backtracking
        T1, T2 = np.zeros((window_size, label_size)), np.zeros((window_size, label_size))
        
        # initialize with equal prob
        T1[0] = [ 1 / label_size ] * label_size
        
        # initialize helper variables
        labels, offset = [], 0
        
        while True:
            
            # case of overflow
            offset_lim = min(offset + window_size, seq_size) - offset
            
            # forward pass
            for t in range(1, offset_lim):
                
                # current target token
                token = features[t + offset]

                # markov assumption
                for index, label in enumerate(self.label_types):
                    for prev_index, prev_label in enumerate(self.label_types):
                        prob = self.get_seq_prob(token, prev_label, label) * T1[t - 1][prev_index]

                        if prob > T1[t][index]:
                            T1[t][index] = prob
                            T2[t][index] = prev_index

            # backward pass
            prob_max_index = np.argmax(T1[-1, :])
            
            labels_window, prob_index = deque(), prob_max_index
            for t in range(offset_lim, 1, -1):
                prob_index = int(T2[t - 1][prob_index])
                
                labels_window.appendleft(self.label_types[prob_index])
            
            # reset mem
            T1.fill(0)
            T2.fill(0)
            
            # assign max prob to last word
            T1[0][prob_max_index] = 1.0
            
            offset += window_size - 1
            labels += list(labels_window)
            
            if offset >= seq_size:
                break
        
        # update with last item
        labels.append(self.label_types[prob_max_index])
            
        return labels

In [8]:
class MEMM:
    """ MEMM model based on Search Beam & Viterbi algorithm """
    
    def __init__(self, data):
        
        self.train_data = data
        
        # initialize the model
        self.init(data)
    
    def init(self, data):
                
        # unpack word tokens and pos labels
        tokens, labels = zip(*data)
        
        # our vocabulary
        self.vocabulary, self.labels = set(tokens), set(labels)
        self.label_types = list(self.labels)
        
        # create train features
        inputs = []
        for index, (token, tag) in enumerate(zip(tokens, labels)):
            inputs.append(self.get_feature(tokens, labels, index))

        # create train targets
        targets = [ self.label_types.index(label) for label in labels ]
        
        # freq token <-> pos label
        self.model = LogisticRegression(random_state = 0, max_iter = 300, solver = 'lbfgs', multi_class = 'multinomial').fit(inputs, targets)
    
    def get_feature(self, tokens, labels, index, window_size = 3, bare = False):
        
        # features
        features = []
        
        # boundary indices
        index_start, index_end = index - window_size, index + window_size

        for i in range(index_start, index_end):

            # current token
            t = tokens[i] if i >= 0 and i < len(tokens) else None
            
            # get token vector representation
            token_vec = word_embeddings[t] if t is not None and t in word_embeddings else np.zeros((word_embeddings.vector_size,))
            
            features.append(token_vec)
            
        if bare:
            index_start, index = 0, len(labels)
        
        for i in range(index_start, index - 1):
            
            # current tag
            t = labels[i] if i >= 0 and i < len(labels) else None
            
            if t == -1:
                features.append([-1])
            else:
                features.append([self.label_types.index(t) if t is not None else -1])

        # concatenate features
        return np.concatenate(features)
        
    def forward(self, features, window_size = None):
        
        # avoid prob underflow
        if window_size is None:
            window_size = len(features)
        
        # sequence size
        seq_size, label_size = len(features), len(self.label_types)
        
        # memoization of max prob for each t timestamp and indexing for backtracking
        T1, T2 = np.zeros((window_size, label_size)), np.zeros((window_size, label_size))
        
        # initialize with equal prob
        T1[0] = [ 1 / label_size ] * label_size
        
        # initialize helper variables
        labels, offset = [], 0
        
        while True:
            
            # case of overflow
            offset_lim = min(offset + window_size, seq_size) - offset
            
            # forward pass
            for t in range(1, offset_lim):
                
                for prev_index, prev_tag in enumerate(self.label_types):
                    
                    # min_start index
                    min_start, min_offset = max(0, t - 3), 0 if t - 3 >= 0 else t - 3

                    # previous taggings based on maximum likelihood
                    taggings, sample_index = deque(), int(prev_index)
                    
                    taggings.appendleft(self.label_types[sample_index])
                    for i in range(t - 1, min_start, -1):
                        sample_index = int(T2[i][sample_index])
                        
                        taggings.appendleft(self.label_types[sample_index])
                        
                    if min_offset < 0:
                        taggings = labels[min_offset:] + list(taggings)
                        
                    if len(taggings) < 3:
                        taggings = [-1] * (3 - len(taggings)) + taggings
                    
                    # features
                    X = np.expand_dims(self.get_feature(features, taggings, t + offset, bare = True), 0)
                    
                    # make prediction (len(self.label_types), )
                    prob = self.model.predict_proba(X)[0]
                    
                    # joint probability with previous assumptions
                    prob *= T1[t - 1][prev_index]
                    
                    for index in range(len(self.label_types)):
                        if prob[index] > T1[t][index]:
                            T1[t][index] = prob[index]
                            T2[t][index] = prev_index

            # backward pass
            prob_max_index = np.argmax(T1[-1, :])
            
            labels_window, prob_index = deque(), prob_max_index
            for t in range(offset_lim, 1, -1):
                prob_index = int(T2[t - 1][prob_index])
                
                labels_window.appendleft(self.label_types[prob_index])
            
            # reset mem
            T1.fill(0)
            T2.fill(0)
            
            # assign max prob to last word
            T1[0][prob_max_index] = 1.0
            
            offset += window_size - 1
            
            # concatenate our outputs
            labels += list(labels_window)
            
            if offset >= seq_size:
                break
        
        # update with last item
        labels.append(self.label_types[prob_max_index])
            
        return labels           

In [9]:
def validate_model(outputs, targets):
    assert (len(outputs) == len(targets))
    
    # compute number of valid labels
    labels_predicted = 0
    for predicted, target in zip(outputs, targets):
        labels_predicted += predicted == target
    
    # size of annotations
    size = len(targets)
    
    return labels_predicted / size, (labels_predicted, size)

In [10]:
def test_model(model_type, train_data, valid_data):
    
    # reference model definition
    model = HMM if model_type == 'hmm' else MEMM
    
    # create an instance of our model
    network = model(train_data)
    
    # pre-process labelled data 
    features, targets = zip(*valid_data)
    
    # generate network outputs
    outputs = network.forward(features, window_size = 7)
    
    score, occ = validate_model(outputs, targets)

    print(f"Model {model_type} -> {score}")

In [11]:
for model_type in ['hmm', 'memm']:
    test_model(model_type, train_data, valid_data)

Model hmm -> 0.8915375446960667




Model memm -> 0.850715137067938
