In [40]:
import numpy as np 
import os
import conlleval

train_dir = "dataset/train"
test_dir = "dataset/dev.in"


START_STATE_KEY = "START"
STOP_STATE_KEY = "STOP"

LARGE_NEG = -2**52

In [41]:
def tokenize(file_path):  
    data, lst = [], []
    with open(file_path, 'r') as f:  
        for line in f:
            if line== '\n':
                data.append(lst)
                lst = []    
            else:
                lines = line.replace("\n",'').split(" ")
                lst.append(tuple(lines))
    return data

train_sentences = tokenize(train_dir)
print(train_sentences[:5])      


[[('All', 'O'), ('in', 'O'), ('all', 'O'), (',', 'O'), ('the', 'O'), ('food', 'B-positive'), ('was', 'O'), ('great', 'O'), ('(', 'O'), ('except', 'O'), ('for', 'O'), ('the', 'O'), ('dessserts', 'B-negative'), (')', 'O'), ('.', 'O')], [('I', 'O'), ('have', 'O'), ('NEVER', 'O'), ('been', 'O'), ('disappointed', 'O'), ('in', 'O'), ('the', 'O'), ('Red', 'B-positive'), ('Eye', 'I-positive'), ('.', 'O')], [('Great', 'O'), ('food', 'B-positive'), ('with', 'O'), ('an', 'O'), ('awesome', 'O'), ('atmosphere', 'B-positive'), ('!', 'O')], [('The', 'O'), ('sangria', 'B-positive'), ('was', 'O'), ('pretty', 'O'), ('tasty', 'O'), ('and', 'O'), ('good', 'O'), ('on', 'O'), ('a', 'O'), ('hot', 'O'), ('muggy', 'O'), ('day', 'O'), ('.', 'O')], [('Also', 'O'), (',', 'O'), ('waiters', 'B-negative'), ('try', 'O'), ('to', 'O'), ('push', 'O'), ('more', 'O'), ('food', 'O'), ('on', 'O'), ('you', 'O'), (',', 'O'), ('like', 'O'), ('suggest', 'O'), ('things', 'O'), ('as', 'O'), ('if', 'O'), ('they', 'O'), ('are', 'O'

# PART 1
## 1i)

In [42]:

def MLE_emission_parameters(train_sentences):
    ''' Calculates the emission parameters by count(y->x)/count(y)
    
    :param train_sentences: our train file tokenised sentences
    :type train_sentences: list(tuple())

    :return count_y_dict: Count of labels 
    :rtype: dict()

    :return count_y_to_x_dict: Count of words and labels
    :rtype: dict()

    :param emission_dict: value of Count(labels->words)/Count(labels), keys are tuples of word and label ('emission: O+All', -9.01768561), value MLE
    :rtype: dict

    '''

    count_y_dict = {}
    count_y_to_x_dict = {}
    emission_dict = {}

    for sentence in train_sentences:
        for x_y_pair in sentence:
            word, label = x_y_pair
            if label in count_y_dict:
                count_y_dict[label] = count_y_dict.get(label) + 1
            else:
                count_y_dict[label] = 1
            if (word,label) in count_y_to_x_dict:
                count_y_to_x_dict[(word,label)] = count_y_to_x_dict.get((word,label)) + 1
            else:
                count_y_to_x_dict[(word,label)] = 1
    # print("count(y): \n", count_y_dict, "\n")
    # print("count(y->x): \n",list(count_y_to_x_dict.items())[0:5], len(count_y_to_x_dict), "\n")
    # Calculate our emission
    for key, value in count_y_to_x_dict.items(): # Default is iterate keys()
        word = key[0]
        label = key[1]
        string = f"emission: {label}+{word}" 
        prob =  value / count_y_dict.get(label)
        emission_dict[string] = float(np.where(prob != 0, np.log(prob), LARGE_NEG))
    # print("MLE: \n",list(emission_dict.items())[0:5],len(emission_dict) ,"\n")

    return count_y_dict, count_y_to_x_dict, emission_dict

In [43]:
count_y_dict, count_y_to_x_dict, emission_dict = MLE_emission_parameters(train_sentences)
print(list(emission_dict.items())[0:5])

[('emission: O+All', -9.017685611042436), ('emission: O+in', -4.54034879656423), ('emission: O+all', -5.785564559424215), ('emission: O+,', -3.24728344904484), ('emission: O+the', -3.0916488692569097)]


## 1ii)

In [44]:
def  MLE_transition_parameters(train_dir, emission_dict):
    ''' Calculates the transition parameters by count(y->y-1)/count(y)

    :param train_dir: our train file
    :type train_sentences: str

    :param emission_dict: Count(y->x)/Count(y), keys are tuples of word and label ('emission: O+All', -9.01768561), value MLE
    :type emission_dict: dict()

    :return count_y_to_y_dict: Count of labels and previous label
    :rtype: dict()

    :return emission_transition_dict: value of Count(labels->words)/Count(labels) for emission and Count(prev_labels->labels)/Count(labels) for transmission, keys are tuples of word and label ('emission: O+All', -9.01768561), value MLE
    :rtype: dict()
    '''
    count_y_dict = {}
    count_y_to_y_dict = {}
    prev_label = ""

    with open(train_dir, "r", encoding="utf8") as f:
        for line in f:
            # Parse each line
            if len(line.split(" ")) == 2:
                _, label = line.replace("\n","").split(" ")
            else:
                label = ''
            if label == '' and prev_label != '':
                count_y_dict[STOP_STATE_KEY] = count_y_dict.get(STOP_STATE_KEY) + 1 if count_y_dict.get(STOP_STATE_KEY) else 1
            elif label !='':
                if prev_label == '':
                    count_y_dict[START_STATE_KEY] = count_y_dict.get(START_STATE_KEY) + 1 if count_y_dict.get(START_STATE_KEY) else 1
                if label in count_y_dict:
                    count_y_dict[label] = count_y_dict.get(label)+1
                else:
                    count_y_dict[label] = 1
            if prev_label == '' and label != '':
                if (START_STATE_KEY, label) in count_y_to_y_dict:
                    count_y_to_y_dict[(START_STATE_KEY, label)] = count_y_to_y_dict.get((START_STATE_KEY, label)) + 1
                else:
                    count_y_to_y_dict[(START_STATE_KEY, label)] = 1
            elif label == '' and prev_label != '':
                if (prev_label, STOP_STATE_KEY) in count_y_to_y_dict:
                    count_y_to_y_dict[(prev_label, STOP_STATE_KEY)] = count_y_to_y_dict.get((prev_label, STOP_STATE_KEY)) + 1
                else:
                    count_y_to_y_dict[(prev_label, STOP_STATE_KEY)] = 1
            elif label != '' and prev_label != '':
                if (prev_label, label) in count_y_to_y_dict:
                    count_y_to_y_dict[(prev_label, label)] = count_y_to_y_dict.get((prev_label, label)) + 1
                else:
                    count_y_to_y_dict[(prev_label, label)] = 1
            prev_label = label
    # Calculate our transition
    for key, value in count_y_to_y_dict.items(): # Default is iterate keys()
        prev_label = key[0]
        label = key[1]
        string = f"transition: {prev_label}+{label}" 
        prob =  value / count_y_dict.get(prev_label)
        emission_dict[string] = float(np.where(prob != 0, np.log(prob), LARGE_NEG))
    # print("MLE: \n",list(emission_dict.items()), len(emission_dict) ,"\n")
    emission_transition_dict = emission_dict

    return count_y_to_y_dict, emission_transition_dict

In [45]:
count_y_to_y_dict, emission_transition_dict = MLE_transition_parameters(train_dir, emission_dict)
print(list(emission_transition_dict.items())[:5])
print(list(emission_transition_dict.items())[-5:])

[('emission: O+All', -9.017685611042436), ('emission: O+in', -4.54034879656423), ('emission: O+all', -5.785564559424215), ('emission: O+,', -3.24728344904484), ('emission: O+the', -3.0916488692569097)]
[('transition: B-positive+STOP', -4.252688120309395), ('transition: I-positive+STOP', -4.564348191467836), ('transition: B-positive+B-positive', -7.0859014643656115), ('transition: B-neutral+STOP', -3.245193133185574), ('transition: B-negative+STOP', -4.836281906951478)]


# Part 2

## 2i)

In [46]:
def score(sentence, emission_transition_dict):
    ''' Calculates the score with of a given pair based on emission and transmission features
    
    :param sentences: our  file tokenised sentences
    :type sentences: list(tuple())

    :return emission_transition_dict: value of Count(labels->words)/Count(labels) for emission and Count(prev_labels->labels)/Count(labels) for transmission, keys are tuples of word and label ('emission: O+All', -9.01768561), value MLE
    :type emission_transition_dict: dict()

    :param score: our emission score + transition score for sentence
    :type sentences: float
    '''
    score = 0
    emission_score = 0 
    transition_score = 0
    x_seq = [x[0] for x in sentence]
    y_seq = [START_STATE_KEY]+[y[1] for y in sentence]+[STOP_STATE_KEY]
    
    for i in range(len(x_seq)):
        label = y_seq[i+1]
        word = x_seq[i]
        key = f"emission: {label}+{word}" 
        emission_score += emission_transition_dict[key]
    for j in range(1, len(y_seq)):
        prev_label = y_seq[j-1]
        label = y_seq[j]
        key = f"transition: {prev_label}+{label}" 
        transition_score += emission_transition_dict[key]
    score = emission_score + transition_score
    return score



In [47]:
score(train_sentences[0],emission_transition_dict)

-85.52845366888094

## 2ii)

In [48]:
test_sentences = tokenize(test_dir)
print(test_sentences[:5])   

[[('Loved',), ('it',)], [('The',), ('music',), ('playing',), ('was',), ('very',), ('hip',), (',',), ('20-30',), ('something',), ('pop',), ('music',), (',',), ('but',), ('the',), ('subwoofer',), ('to',), ('the',), ('sound',), ('system',), ('was',), ('located',), ('under',), ('my',), ('seat',), (',',), ('which',), ('became',), ('annoying',), ('midway',), ('through',), ('dinner',), ('.',)], [('This',), ('place',), ('has',), ('ruined',), ('me',), ('for',), ('neighborhood',), ('sushi',), ('.',)], [('I',), ('have',), ('never',), ('eaten',), ('in',), ('the',), ('restaurant',), (',',), ('however',), (',',), ('upon',), ('reading',), ('the',), ('reviews',), ('I',), ('got',), ('take',), ('out',), ('last',), ('week',), ('.',)], [('It',), ("isn't",), ('the',), ('cheapest',), ('sushi',), ('but',), ('has',), ('been',), ('worth',), ('it',), ('every',), ('time',), ('.',)]]


In [49]:
def viterbi_algo(test_sentences, count_y_dict, emission_transition_dict):
    ''' Decoding process that finds greedily finds the best possible labels from past MLE scores, saves file to output folder
    
    :param test_sentences: our file tokenised sentences
    :type test_sentences: list(tuple())

    :param count_y_dict: Count of labels 
    :param count_y_dict: dict()

    :param emission_transition_dict: value of Count(labels->words)/Count(labels) for emission and Count(prev_labels->labels)/Count(labels) for transmission, keys are tuples of word and label ('emission: O+All', -9.01768561), value MLE
    :param emission_transition_dict: dict()
    '''
    
    pi = [{}]
    path = {}
    labels = count_y_dict.keys()
    os.makedirs('output',exist_ok=True)

    with open('output/dev.p2.out', "w") as outfile:
        for sentence in test_sentences:
            # j = 0 (START)
            for label in labels:
                pi[0][label] = emission_transition_dict.get(f"transition: {'START'}+{label}",LARGE_NEG) + emission_transition_dict.get(f"emission: {label}+{sentence[0][0]}",LARGE_NEG)
                path[label] = [label]
            # j = 1 to N-1
            for idx in range(1,len(sentence)):
                pi.append({})
                newpath = {}
                for label_y in labels:
                    (prob, label) = max([(pi[idx-1][prev_label] + emission_transition_dict.get(f"transition: {prev_label}+{label_y}",LARGE_NEG) + emission_transition_dict.get(f"emission: {label_y}+{sentence[idx][0]}",LARGE_NEG), prev_label) 
                                    for prev_label in labels])
                    pi[idx][label_y] = prob
                    newpath[label_y] = path[label] + [label_y]
                path = newpath
            # j = N (STOP)
            idx = len(sentence)
            (prob, label) = max([(pi[idx-1][label_y] + emission_transition_dict.get(f"transition: {label_y}+{'STOP'}", LARGE_NEG), label_y) for label_y in labels])
            
            # handle inconsistent length
            if len(sentence) != len(path[label]):
                print(len(sentence),len(path[label]))
                raise Exception("{} has a different lenght with {}".format(sentence, path[label]))
            
            # write to file
            for i in range(len(sentence)):
                line = "{} {}\n".format(sentence[i][0], path[label][i])
                outfile.write(line)
                
            outfile.write("\n")

In [50]:
viterbi_algo(test_sentences, count_y_dict, emission_transition_dict)

# Evaluation of dev.p2.out

In [51]:
prediction_dir = 'output/dev.p2.out'
truth_dir = 'dataset/dev.out'

def evaluate_results(truth_dir,prediction_dir):
    predictions = []
    prediction_sentences = tokenize(prediction_dir)
    for sentence in prediction_sentences:
        for word_pair in sentence:
            predictions.append(word_pair[1])     
    lines = """"""
    idx = 0
    with open(truth_dir, "r", encoding="utf8") as tstr:
        for line in tstr:
            if len(line) > 1:
                newline = line.replace("\n",f" {predictions[idx]}\n")
                lines += newline
                idx += 1
            else:
                lines += "\n"
    return lines.splitlines()

lines = evaluate_results(truth_dir,prediction_dir)
res = conlleval.evaluate(lines)
print(conlleval.report(res))

processed 3809 tokens with 210 phrases; found: 132 phrases; correct: 63.
accuracy:  93.23%; precision:  47.73%; recall:  30.00%; FB1:  36.84
         negative: precision:  35.29%; recall:   9.23%; FB1:  14.63  17
          neutral: precision:  20.00%; recall:  12.50%; FB1:  15.38  5
         positive: precision:  50.91%; recall:  40.88%; FB1:  45.34  110



# Part 3

In [52]:
def logSumExp(a):
    max = np.max(a)
    sumOfExp = np.exp(a - max).sum()
    return max + np.log(sumOfExp)

def forward_algorithm(sentence, count_y_dict, emission_transition_dict):    
    pi = [{}]
    labels = count_y_dict.keys()
    # j = 0 (START)
    for label in labels:
        pi[0][label] = emission_transition_dict.get(f"transition: {'START'}+{label}",LARGE_NEG) + emission_transition_dict.get(f"emission: {label}+{sentence[0][0]}",LARGE_NEG)

    # j = 1 to N-1
    for idx in range(1,len(sentence)):
        pi.append({})

        for label in labels:
            log_a = []
            for prev_label in labels:
                log_a.append(pi[idx-1][prev_label] + emission_transition_dict.get(f"transition: {prev_label}+{label}",LARGE_NEG) + emission_transition_dict.get(f"emission: {label}+{sentence[idx][0]}",LARGE_NEG))
            pi[idx][label] = logSumExp(log_a)
            
    # j = N (STOP)
    idx = len(sentence)
    log_a = []
    for label in labels:
        log_a.append(pi[idx-1][label] + emission_transition_dict.get(f"transition: {label}+{'STOP'}", LARGE_NEG))
    return pi, logSumExp(log_a)

forward_algorithm(train_sentences[0], count_y_dict, emission_transition_dict)

([{'O': -9.079345204990318,
   'B-positive': -4503599627370499.0,
   'B-negative': -4503599627370501.0,
   'I-positive': -9007199254740992,
   'B-neutral': -4503599627370501.0,
   'I-neutral': -9007199254740992,
   'I-negative': -9007199254740992},
  {'O': -13.766622979615455,
   'B-positive': -4503599627370508.0,
   'B-negative': -4503599627370509.0,
   'I-positive': -4503599627370506.0,
   'B-neutral': -4503599627370511.0,
   'I-neutral': -9007199254741000.0,
   'I-negative': -4503599627370507.0},
  {'O': -19.699116517100578,
   'B-positive': -23.953109923204384,
   'B-negative': -23.93728208854489,
   'I-positive': -9007199254741004.0,
   'B-neutral': -4503599627370516.0,
   'I-neutral': -9007199254741006.0,
   'I-negative': -9007199254741004.0},
  {'O': -23.068893163028275,
   'B-positive': -4503599627370519.0,
   'B-negative': -4503599627370520.0,
   'I-positive': -30.46462428231426,
   'B-neutral': -4503599627370522.0,
   'I-neutral': -9007199254741012.0,
   'I-negative': -450359

In [53]:
def loss_func(sentences, count_y_dict, emission_transition_dict):
    loss = 0
    for sent in sentences:
        loss+= score(sent, emission_transition_dict)
        _, update = forward_algorithm(sent, count_y_dict, emission_transition_dict)
        loss-= update
    return (-1)*loss
loss_func(train_sentences, count_y_dict, emission_transition_dict)

2050.74053383538

# Part 5

In [55]:
def unigram_1_parameters(train_dir, emission_dict):
    """Calculates the transition parameters by count(y->x_i-1)/count(y)

    :param train_dir: our train file
    :type train_sentences: str

    :param emission_dict: count(y->x_i-1)/count(y), keys are tuples of word and label ('unigram_1: O+All', -9.01768561), value MLE
    :type emission_dict: dict()

    :return count_y_to_y_dict: Count of labels and previous label
    :rtype: dict()

    :return emission_transition_dict: value of Count(labels->words_i-1)/Count(labels) for emission and Count(prev_labels->labels)/Count(labels) for transmission, keys are tuples of word and label ('unigram_1: O+All', -9.01768561), value MLE
    :rtype: dict()
    """
    # key is label | value is count
    count_y_dict = {}
    # key is word_i-1 , label_i | value is count
    count_y_to_x_dict = {}

    with open(train_dir, "r", encoding="utf8") as f:
        prev_word, prev_label = "", ""
        for line in f:
            # Parse each line
            if len(line.split(" ")) == 2:
                word, label = line.replace("\n", "").split(" ")
            else:
                label = ""

            # counting
            if label == "" and prev_label != "":
                count_y_dict[STOP_STATE_KEY] = count_y_dict.get(STOP_STATE_KEY, 0) + 1

            elif label != "":
                if prev_label == "":
                    count_y_dict[START_STATE_KEY] = (
                        count_y_dict.get(START_STATE_KEY, 0) + 1
                    )
                if label in count_y_dict:
                    count_y_dict[label] = count_y_dict.get(label) + 1
                else:
                    count_y_dict[label] = 1

            # Counting unigram
            if label != "" and prev_word != "":
                count_y_to_x_dict[(label, prev_word)] = (
                    count_y_to_x_dict.get((label, prev_word), 0) + 1
                )

            prev_word, prev_label = word, label

    # Calculate unigram
    for key, value in count_y_to_x_dict.items():  # Default is iterate keys()
        label = key[0]
        word = key[1]
        string = f"unigram_1: {label}+{word}"

        prob = value / count_y_dict.get(label)
        emission_dict[string] = float(np.where(prob != 0, np.log(prob), LARGE_NEG))

    print(
        "unigram_1 yi -> xi-1: \n",
        list(emission_dict.items())[-10:],
        len(emission_dict),
        "\n",
    )
    emission_transition_dict = emission_dict

    return count_y_to_x_dict, emission_transition_dict


In [56]:
def unigram_2_parameters(train_dir, emission_dict):
    """Calculates the transition parameters by count(y->x_i+1)/count(y)

    :param train_dir: our train file
    :type train_sentences: str

    :param emission_dict: count(y->x_i+1)/count(y), keys are tuples of word and label ('unigram_1: O+All', -9.01768561), value MLE
    :type emission_dict: dict()

    :return count_y_to_y_dict: Count of labels and previous label
    :rtype: dict()

    :return emission_transition_dict: value of Count(labels -> words_i+1)/Count(labels) for emission and Count(prev_labels->labels)/Count(labels) for transmission, keys are tuples of word and label ('unigram_1: O+All', -9.01768561), value MLE
    :rtype: dict()
    """
    # key is label | value is count
    count_y_dict = {}
    # key is word_i+1 , label_i | value is count
    count_y_to_x_dict = {}

    with open(train_dir, "r", encoding="utf8") as f:
        prev_word, prev_label = "", ""
        for line in f:
            # Parse each line
            if len(line.split(" ")) == 2:
                word, label = line.replace("\n", "").split(" ")
            else:
                label = ""

            # counting
            if label == "" and prev_label != "":
                count_y_dict[STOP_STATE_KEY] = count_y_dict.get(STOP_STATE_KEY, 0) + 1
            elif label != "":
                if prev_label == "":
                    count_y_dict[START_STATE_KEY] = (
                        count_y_dict.get(START_STATE_KEY, 0) + 1
                    )
                if label in count_y_dict:
                    count_y_dict[label] = count_y_dict.get(label) + 1
                else:
                    count_y_dict[label] = 1

            if prev_label != "" and word != "":
                count_y_to_x_dict[(prev_label, word)] = (
                    count_y_to_x_dict.get((prev_label, word), 0) + 1
                )

            prev_word, prev_label = word, label

    # Calculate unigram
    for (label, word), value in count_y_to_x_dict.items():  # Default is iterate keys()
        if prev_label != "" and word != "":
            string = f"unigram_2: {label}+{word}"
            prob = value / count_y_dict.get(label)
            emission_dict[string] = float(np.where(prob != 0, np.log(prob), LARGE_NEG))

    print(
        "unigram_2 yi -> x_i+1: \n",
        list(emission_dict.items())[-10:],
        len(emission_dict),
        "\n",
    )
    emission_transition_dict = emission_dict

    return count_y_to_x_dict, emission_transition_dict


In [57]:
def bigram_parameters(train_dir, emission_dict):
    """Calculates the transition parameters by count(y->x_i+1)/count(y)

    :param train_dir: our train file
    :type train_sentences: str

    :param emission_dict: count(yi-1 -> yi -> xi)/count(y), keys are tuples of word and label ('B-neutral+O+B-neutral', -9.01768561)
    :type emission_dict: dict()

    :return count_y_to_y_dict: Count of labels and previous label
    :rtype: dict()

    :return emission_transition_dict: value of Count(label-1 -> labels -> words)/Count(labels) for emission and Count(prev_labels->labels)/Count(labels) for transmission, keys are tuples of word and label ('B-neutral+O+B-neutral', -9.01768561)
    :rtype: dict()
    """
    # key is label | value is count
    count_y_dict = {}
    # key is word_i+1 , label_i | value is count
    count_y_to_y_to_x_dict = {}

    with open(train_dir, "r", encoding="utf8") as f:
        prev_word, prev_label = "", ""
        for line in f:
            # Parse each line
            if len(line.split(" ")) == 2:
                word, label = line.replace("\n", "").split(" ")
            else:
                label = ""

            # counting
            if label == "" and prev_label != "":
                count_y_dict[STOP_STATE_KEY] = count_y_dict.get(STOP_STATE_KEY, 0) + 1
            elif label != "":
                if prev_label == "":
                    count_y_dict[START_STATE_KEY] = (
                        count_y_dict.get(START_STATE_KEY, 0) + 1
                    )
                if label in count_y_dict:
                    count_y_dict[label] = count_y_dict.get(label) + 1
                else:
                    count_y_dict[label] = 1

            if prev_label != "" and word != "" and label != "":
                count_y_to_y_to_x_dict[(prev_label, label, word)] = (
                    count_y_to_y_to_x_dict.get((prev_label, label, word), 0) + 1
                )

            prev_label = label

    # Calculate unigram
    for key, value in count_y_to_y_to_x_dict.items():  # Default is iterate keys()
        prev_label, label, word = key
        if prev_label != "" and label != "" and word != "":
            string = f"bigram: {prev_label}+{label}+{word}"
            prob = value / count_y_dict.get(label)
            emission_dict[string] = float(np.where(prob != 0, np.log(prob), LARGE_NEG))
        prev_label = label

    print(
        "bigram yi-1 -> yi -> xi: \n",
        list(emission_dict.items())[-10:],
        len(emission_dict),
        "\n",
    )
    emission_transition_dict = emission_dict

    return count_y_to_y_to_x_dict, emission_transition_dict


In [58]:
_, _, emission_dict = MLE_emission_parameters(train_sentences)
_, emission_dict = MLE_transition_parameters(train_dir, emission_dict)
_, emission_dict = unigram_1_parameters(train_dir, emission_dict)
_, emission_dict = unigram_2_parameters(train_dir, emission_dict)
count_y_dict, emission_dict = bigram_parameters(train_dir, emission_dict)

print(list(emission_dict.items())[:5])
print(list(emission_dict.items())[-5:])


unigram_1 yi -> xi-1: 
 [('unigram_1: O+combination', -10.116297899710545), ('unigram_1: B-positive+super-fresh', -7.0859014643656115), ('unigram_1: O+unusual', -10.116297899710545), ('unigram_1: B-neutral+biggest', -4.343805421853684), ('unigram_1: O+adequate', -10.116297899710545), ("unigram_1: O+Mom's", -10.116297899710545), ('unigram_1: O+leaving', -10.116297899710545), ("unigram_1: O+we're", -10.116297899710545), ('unigram_1: O+hurry', -10.116297899710545), ('unigram_1: O+originally', -10.116297899710545)] 9176 

unigram_2 yi -> x_i+1: 
 [('unigram_2: O+combination', -10.116297899710545), ('unigram_2: O+super-fresh', -10.116297899710545), ('unigram_2: O+unusual', -10.116297899710545), ('unigram_2: O+biggest', -10.116297899710545), ('unigram_2: O+adequate', -10.116297899710545), ("unigram_2: O+Mom's", -10.116297899710545), ('unigram_2: O+leaving', -10.116297899710545), ("unigram_2: O+we're", -10.116297899710545), ('unigram_2: O+hurry', -10.116297899710545), ('unigram_2: O+originall

In [None]:
output_file_name = "dev.p5.out"

In [82]:
def viterbi_algo_2(test_sentences, count_y_dict, emission_dict):
    """Decoding process that finds greedily finds the best possible labels from past MLE scores, saves file to output folder

    :param test_sentences: our file tokenised sentences
    :type test_sentences: list(tuple())

    :param count_y_dict: Count of labels
    :param count_y_dict: dict()

    :param emission_dict: value of Count(labels->words)/Count(labels) for emission and Count(prev_labels->labels)/Count(labels) for transmission, keys are tuples of word and label ('emission: O+All', -9.01768561), value MLE
    :param emission_dict: dict()
    """

    pi = [{}]
    path = {}
    labels = count_y_dict.keys()
    os.makedirs("output", exist_ok=True)

    with open("output/dev.p5.out", "w") as outfile:
        for sentence in test_sentences:
            # j = 0 (START)
            for label in labels:
                pi[0][label] = emission_dict.get(
                    f"transition: {'START'}+{label}", LARGE_NEG
                ) + emission_dict.get(f"emission: {label}+{sentence[0][0]}", LARGE_NEG)
                path[label] = [label]
            # j = 1 to N-1
            for idx in range(1, len(sentence)):
                pi.append({})
                newpath = {}
                label_y_prev = ""
                for label_y in labels:
                    (prob, label) = max(
                        [
                            (
                                pi[idx - 1][prev_label]
                                + emission_dict.get(
                                    f"transition: {prev_label}+{label_y}", LARGE_NEG
                                )
                                + emission_dict.get(
                                    f"emission: {label_y}+{sentence[idx][0]}", LARGE_NEG
                                )
                                + (
                                    emission_dict.get(
                                        f"unigram_1: {label_y}+{sentence[idx-1][0]}",
                                        LARGE_NEG,
                                    )
                                )
                                + (
                                    emission_dict.get(
                                        f"unigram_2: {label_y}+{sentence[idx+1][0]}",
                                        LARGE_NEG,
                                    )
                                    if idx < len(sentence) - 1
                                    else 0
                                )
                                + emission_dict.get(
                                    f"bigram: {label_y_prev}+{label_y}+{sentence[idx][0]}",
                                    LARGE_NEG,
                                ),
                                prev_label,
                            )
                            for prev_label in labels
                        ]
                    )
                    pi[idx][label_y] = prob
                    newpath[label_y] = path[label] + [label_y]
                    label_y_prev = label_y
                path = newpath
            # j = N (STOP)
            idx = len(sentence)
            (prob, label) = max(
                [
                    (
                        pi[idx - 1][label_y]
                        + emission_dict.get(
                            f"transition: {label_y}+{'STOP'}", LARGE_NEG
                        ),
                        label_y,
                    )
                    for label_y in labels
                ]
            )

            # handle inconsistent length
            if len(sentence) != len(path[label]):
                print(len(sentence), len(path[label]))
                raise Exception(
                    "{} has a different lenght with {}".format(sentence, path[label])
                )

            # write to file
            for i in range(len(sentence)):
                line = f"{sentence[i][0]} {path[label][i]}\n"
                outfile.write(line)

            outfile.write("\n")


In [83]:
viterbi_algo_2(test_sentences, count_y_dict, emission_dict)

lines = evaluate_results("dataset/dev.out", "output/dev.p5.out")
res = conlleval.evaluate(lines)
print(conlleval.report(res))


processed 3809 tokens with 210 phrases; found: 252 phrases; correct: 82.
accuracy:  91.86%; precision:  32.54%; recall:  39.05%; FB1:  35.50
         negative: precision:  14.29%; recall:   1.54%; FB1:   2.78  7
          neutral: precision:   0.00%; recall:   0.00%; FB1:   0.00  3
         positive: precision:  33.47%; recall:  59.12%; FB1:  42.74  242

