# **50.007 ML 1D Project**
By Darren Chan Yu Hao

## Setup

In [277]:
import numpy as np
import os
import copy as copy

In [278]:
cwd = os.getcwd()
np.random.seed(1993)

## Reading Files

In [279]:
# Functions to read data

# Read dev.in data
def read_dev_in_data(filepath):
    results = []

    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            results.append(line.strip())
    
    return results

# Read dev.out data
def read_dev_out_data(filepath):
    results = []

    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            stripped_line = line.strip().split(" ")
            results.append(stripped_line)
    
    return results

# Read train data
def read_train_data(filepath):
    results = []

    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            stripped_line = line.strip().split(" ")
            results.append(stripped_line)

    return results

In [280]:
# Get pdath of the data
#------------------------------------
# Spanish: ES
ES_dev_in_data_path = os.path.join("Data", "ES" , "dev.in")
ES_dev_out_data_path = os.path.join("Data", "ES" , "dev.out")
ES_train_data_path = os.path.join("Data", "ES" , "train")

# Russiadn: RU
RU_dev_in_data_path = os.path.join("Data", "RU" , "dev.in")
RU_dev_out_data_path = os.path.join("Data", "RU" , "dev.out")
RU_train_data_path = os.path.join("Data", "RU" , "train")
#------------------------------------

## Part 1

In [281]:
# Split words and tags
def split_words_tags(labeled_data):
        words = []
        tags = []

        for word_tag in labeled_data:
            
            if len(word_tag) != 2:
                continue
            
            #word_tag is a list
            word = word_tag[0]
            tag = word_tag[1]

            words.append(word)
            tags.append(tag)

        return words, tags

# Count unique tags
def count_unique_tags(tags_ls):

    tags_unique = set()
    for tag in tags_ls:
        tags_unique.add(tag)
    return tags_unique

# Count unique words
def count_unique_words(words_ls):
         
    words_unique = set()
    for word in words_ls:
        words_unique.add(word)
    return words_unique

In [282]:
# Emission Parameters

# Get the emission parameters
def get_emission_parameters(ls_of_tags, ls_of_words, tags, words, k=1):

  # Write a function that estimates the emission parameters from the training set using MLE (maximumlikelihood estimation):
    # e(x|y) = Count(y -> x) / Count(y)
    # Count(y -> x) = Number of times word x is tagged with tag y
    # Count(y) = Number of times tag y appears

    # Input: ls_of_tags - list of unqiue tags
    # Input: ls_of_words - list of unqiue words
    # Input: tags - list of all tags
    # Input: words - list of all words
    # Output: emission_parameters

    # emission_parameters is a dictionary where:
        # The keys are (tag, word) tuples
        # The values are the emission parameters e(x|y)

    # Example of emission_parameters:
        # emission_parameters[("O", "apple")] = 0.00019841269
        # emission_parameters[("B-positive", "apple")] = 0.00000031622777

    # Create a dictionary to store the emission parameters
    emission_parameters = {}

    # Create a dictionary to store the count of each tag
    count_y = {}

    # Create a dictionary to store the count of each (tag, word) tuple
    count_y_to_x = {}

    # Get the count of each tag from the training set
    for tag_labels in ls_of_tags:
        count_y[tag_labels] = tags.count(tag_labels)
    
    print(f"This is Count(y) : {count_y}")

    # Get the count of each (tag, word) tuple from the training set
    for tag, word in zip(tags, words):
        if (tag, word) in count_y_to_x:
            count_y_to_x[(tag, word)] += 1
        else:
            count_y_to_x[(tag, word)] = 1

    print(f"This is Count(y -> x) : {count_y_to_x}")

    # Get the emission parameters
    for tag, word in count_y_to_x:

        emission_parameters[(tag, word)] = count_y_to_x[(tag, word)] / (count_y[tag] + k) # SOMETHING WRONG WITH THIS FORMULA
        if word == "con":
            print(tag,emission_parameters[(tag, word)])

    # For words that do not appear in the training set, k/(Count(y)+k) is used as the emission parameter
    unknown_word = "UNK"
    for tag in count_y:
        emission_parameters[(tag, unknown_word)] = k / (count_y[tag] + k)

    print(f"This is e(x|y) : {emission_parameters}")

    return emission_parameters

In [283]:
def assign_estimate_tags(test_words, emission_params, train_ls_of_words):

    # for each word in the test set of words (test_words) assign the tag with the highest emission probability

    # Inputs : test_tags - a list of all tags
    #          test_ls_of_tags - a list of unqiue tags
    #         test_number_of_tags - a list of the number of tags
    #        test_words - a list of all words
    #       emission_params - a dictionary of emission parameters
    # 
    # Output : labelled words - a list of words with their assigned tags

    predicted_results = []

    for word in test_words:
        if word in train_ls_of_words:

            # y∗ = arg max y e(x|y)
            emission_value = 0
            for key in emission_params:
                if key[1] == word:
                    if emission_value < emission_params[key]:
                        emission_value = emission_params[key]
                        value = key[0]
            
            predicted_results.append((word, value))
            
        else:

            if word != "":
                # y∗ = arg max y e(x|y)
                emission_value = 0
                for key in emission_params:
                    if key[1] == "UNK":
                        if emission_value < emission_params[key]:
                            emission_value = emission_params[key]
                            value = key[0]

                predicted_results.append(("UNK", value))
            
            else:
                predicted_results.append(("", ""))


    
    print("predicted_results: ", predicted_results)
    return predicted_results
    

In [284]:
def get_precision(test_labels, gold_standard):

    total_predicted = 0
    total_correct = 0

    # convert to set for faster lookup
    gold_standard_tuple_ver = []

    for tuple in gold_standard:

        if len(tuple) < 2:
            continue

        gold_standard_tuple_ver.append((tuple[0], tuple[1]))

    print(f"This is the gold standard: {gold_standard_tuple_ver} \n")

    for predicted_pair in test_labels:

        if predicted_pair in gold_standard_tuple_ver:
            total_correct += 1
    
        total_predicted += 1

    return total_correct / total_predicted

In [285]:
def get_recall(test_labels, gold_standard):

    total_correct = 0
    total_gold = 0

    gold_standard_tuple_ver = []

    for tuple in gold_standard:

        if len(tuple) < 2:
            continue

        gold_standard_tuple_ver.append((tuple[0], tuple[1]))

    for tuple in gold_standard_tuple_ver:

        if tuple in test_labels:
            total_correct += 1

        total_gold += 1

    return total_correct / total_gold

In [286]:
def get_f_score(precision, recall):
    return 2/((1/precision) + (1/recall))

In [287]:
def calculate_part_1(dev_in_data_path, dev_out_data_path, train_data_path, output_path):

    train_data = read_train_data(train_data_path)

    train_words, train_tags = split_words_tags(train_data)
    train_ls_of_tags = count_unique_tags(train_tags)
    train_ls_of_words = count_unique_words(train_words)

    # Get Emmission Parameters
    k = 1
    emission_params = get_emission_parameters(train_ls_of_tags, train_ls_of_words, train_tags, train_words, k)

    test_data = read_dev_in_data(dev_in_data_path)

    # Get labels for test data
    test_labels = assign_estimate_tags(test_data, emission_params, train_ls_of_words)

    with open(output_path, "w+", encoding="utf-8") as file:
        for line in test_labels:
            write_line = line[0] + " " + line[1] + "\n"
            file.write(write_line)

    gold_standard = read_dev_out_data(dev_out_data_path)

    # Precision = Total number of correctly predicted entities / Total number of predicted entities
    precision = get_precision(test_labels, gold_standard)
    print("Precision: ", precision)

    # Recall = Total number of correctly predicted entities / Total number of entities in the gold standard
    recall = get_recall(test_labels, gold_standard)
    print("Recall: ", recall)

    # F score
    f_score = get_f_score(precision, recall)
    print("F Score: ", f_score)

In [288]:
# Different Language

# For Spanish

print("For Spanish: ")
output_path = os.path.join("Data", "ES" , "dev.p1.out")
calculate_part_1(ES_dev_in_data_path, ES_dev_out_data_path, ES_train_data_path, output_path)

print("\n")

# For Russian
print("For Russian: ")
output_path = os.path.join("Data", "RU" , "dev.p1.out")
calculate_part_1(RU_dev_in_data_path, RU_dev_out_data_path, RU_train_data_path, output_path)

For Spanish: 
This is Count(y) : {'O': 29035, 'B-negative': 381, 'I-negative': 171, 'B-positive': 1160, 'I-neutral': 43, 'I-positive': 314, 'B-neutral': 72}
This is Count(y -> x) : {('O', 'Estuvimos'): 6, ('O', 'hace'): 26, ('O', 'poco'): 55, ('O', 'mi'): 72, ('O', 'pareja'): 13, ('O', 'y'): 1024, ('O', 'yo'): 36, ('O', 'comiendo'): 10, ('O', 'resultó'): 4, ('O', 'todo'): 115, ('O', 'muy'): 396, ('O', 'bien'): 165, ('O', ','): 1664, ('O', 'tanto'): 39, ('O', 'la'): 755, ('B-positive', 'comida'): 169, ('O', 'el'): 642, ('B-positive', 'vino'): 6, ('B-positive', 'trato'): 44, ('B-positive', 'decoración'): 7, ('O', '…'): 45, ('O', 'nos'): 146, ('O', 'gustó'): 11, ('O', 'mucho'): 53, ('O', '.'): 1623, ('O', 'Por'): 30, ('O', 'poner'): 5, ('O', 'algún'): 6, ('O', 'pero'): 191, ('O', 'quizá'): 4, ('B-negative', 'jamón'): 1, ('O', 'no'): 369, ('O', 'era'): 52, ('O', 'lo'): 270, ('O', '"'): 39, ('O', 'ibérico'): 2, ('O', 'que'): 845, ('O', 'cabía'): 1, ('O', 'esperar'): 5, ('O', 'Bien'): 6, ('O

# Part 2

In [289]:
train_data = read_train_data(ES_train_data_path)
train_words, train_tags = split_words_tags(train_data)
train_ls_of_tags = count_unique_tags(train_tags)
train_ls_of_words = count_unique_words(train_words)
# Get Emmission Parameters
k = 1
emission_params = get_emission_parameters(train_ls_of_tags, train_ls_of_words, train_tags, train_words, k)

This is Count(y) : {'O': 29035, 'B-negative': 381, 'I-negative': 171, 'B-positive': 1160, 'I-neutral': 43, 'I-positive': 314, 'B-neutral': 72}
This is Count(y -> x) : {('O', 'Estuvimos'): 6, ('O', 'hace'): 26, ('O', 'poco'): 55, ('O', 'mi'): 72, ('O', 'pareja'): 13, ('O', 'y'): 1024, ('O', 'yo'): 36, ('O', 'comiendo'): 10, ('O', 'resultó'): 4, ('O', 'todo'): 115, ('O', 'muy'): 396, ('O', 'bien'): 165, ('O', ','): 1664, ('O', 'tanto'): 39, ('O', 'la'): 755, ('B-positive', 'comida'): 169, ('O', 'el'): 642, ('B-positive', 'vino'): 6, ('B-positive', 'trato'): 44, ('B-positive', 'decoración'): 7, ('O', '…'): 45, ('O', 'nos'): 146, ('O', 'gustó'): 11, ('O', 'mucho'): 53, ('O', '.'): 1623, ('O', 'Por'): 30, ('O', 'poner'): 5, ('O', 'algún'): 6, ('O', 'pero'): 191, ('O', 'quizá'): 4, ('B-negative', 'jamón'): 1, ('O', 'no'): 369, ('O', 'era'): 52, ('O', 'lo'): 270, ('O', '"'): 39, ('O', 'ibérico'): 2, ('O', 'que'): 845, ('O', 'cabía'): 1, ('O', 'esperar'): 5, ('O', 'Bien'): 6, ('O', 'sabe'): 5,

In [290]:
def read_train_data_p2(filepath):
    results = []
    #Add start state
    results.append(' START')

    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            stripped_line = line.strip()
            results.append(stripped_line)
            if stripped_line == "":
                # include stop and start states at new sentence
                results.append(' STOP')
                results.append(' START')
    final_results = []
    for line in results:
        split_lines = line.split(" ")
        final_results.append(split_lines)
    final_results.pop()
    #remove final start state

    return final_results

In [291]:
def get_transmission_parameters(ls_of_tags, tags):
    #Write a function that estimates the transition parameters from the training set using MLE (maximum likelihood estimation)
    # q( y_i | y_i-1 ) = Count( y_i-1, y_i ) / Count( y_i-1 )
    # Count(y_i-1 , y_i) = Number of times tag y_i-1 transits to tag y_i
    # Count(y_i-1) = Number of times tag y_i-1 appears

    # Input: ls_of_tags - list of unique tags
    # Input: tags - list of all tags

    # transmission_parameters is a dictionary where:
        # The keys are (tag_y_i-1, tag_y_i) tuples
        # The values are the transmission parameters q(y_i | y_i-1)

    # Example of emission_parameters:
        # emission_parameters[("O", "O")] = 0.00019841269
        # emission_parameters[("B-positive", "O")] = 0.00000031622777

    # Create a dictionary to store the emission parameters
    transmission_parameters = {}

    # Create a dictionary to store the count of each tag
    count_y = {}

    # Create a dictionary to store the count of each (y_i-1, y_i) tuple
    count_y_i_1_to_y_i = {}

    # Get the count of each tag from the training set
    for tag_labels in ls_of_tags:
        count_y[tag_labels] = tags.count(tag_labels)

    print(f"This is Count(y) : {count_y}")

    # Get the count of each (y_i-1, y_i) tuple from the training set
    for i in range(1, len(tags)):
        if (tags[i-1], tags[i]) in count_y_i_1_to_y_i:
            count_y_i_1_to_y_i[(tags[i-1],tags[i])] +=1
        else:
            count_y_i_1_to_y_i[(tags[i-1],tags[i])] =1

    print(f"This is Count (y_i-1 , y_i) : {count_y_i_1_to_y_i}")

    #transmission probability from state y_i-1 to y_i e.g ("START", "O") = 0.9281 == 0.9281 probability to transmit from "START" to "O" state

    for key, value in count_y_i_1_to_y_i.items():
        transmission_parameters[key] = value / count_y[key[0]]

    print(f"This is the q(y_i | y_i-1): {transmission_parameters}")

    labels = ["START", "STOP", "O", "B-positive", "B-neutral", "B-negative", "I-positive","I-neutral","I-negative"]
    for i in labels:
        for j in labels:
            if (i, j) in transmission_parameters:
                continue
            else:
                transmission_parameters[(i, j)] = 0

    return transmission_parameters

In [292]:
# note that if we just blindly shove the fractions into the algorithm
# multiply fractions enough times and it'll approach 0
# and yeah that's gonna end up becoming 0 ft. computer inaccuracy
# that's the numerical underflow
# we can prevent this by log-ing everything

# both transition_parameters and emission_parameters are dictionaries

def log_underflow_prevention(parameter_dict):
    log_parameter_dict = {}
    for key, value in parameter_dict.items():
        if value == 0:
            log_parameter_dict[key] = -np.inf
        else:
            log_parameter_dict[key] = np.log(value)
    return log_parameter_dict


In [293]:
# Reads the dev data
def read_dev(path):
  out = [[]]
  f = open(path, "r", encoding="utf-8")
  lines_in = f.readlines()
  for word in lines_in:
    if word == "\n":
      out.append([])
    else:
      out[-1].append(word.rstrip())
  return out[:-1]

In [294]:
def viterbi(document, transmission, emission, ls_of_words):
  n = len(document)
  tags = ["O", "B-positive", "B-neutral", "B-negative", "I-positive","I-neutral","I-negative","STOP"]

  memo = [{} for _ in range(n+1)]
  parent_arr = [{} for _ in range(n+1)]
  #initial step from start to first node
  for tag in tags:
    a_v_u = transmission.get(("START", tag)) 
    if document[0] in ls_of_words:
      # if tag emits word, get emission, else -inf
      b_u = emission.get((tag, document[0])) or -np.inf
    else:
      #if word not in document
      b_u = emission.get((tag, "UNK"))  or -np.inf
    memo[0][tag] =  a_v_u + b_u
    parent_arr[0][tag] = None
  #recursive
  for j in range(1,n):
    for u in tags:
      max_prob = -np.inf
      max_v = None
      for v in tags:
        if document[j] in ls_of_words:
          emission_prob = emission.get((u, document[j])) or -np.inf
        else:
          emission_prob = emission.get((u, "UNK")) or -np.inf
        transmission_v_u = transmission.get((v, u)) or -np.inf 
        prob = memo[j-1][v] + transmission_v_u+ emission_prob
        if prob > max_prob:
            max_prob = prob
            max_v = v
      memo[j][u] = max_prob
      parent_arr[j][u] = max_v
  # Termination step

  max_prob = -np.inf
  max_v = None
  for tag in tags:
    prob = memo[n-1][tag] + transmission.get((tag, "STOP"))
    if prob > max_prob:
        max_prob = prob
        max_v = tag

  if max_prob != -np.inf:
    memo[n]['STOP'] = max_prob
    parent_arr[n]['STOP'] = max_v
    
  most_likely_sequence = ["" for _ in range(n)]
  if max_v == None:
        max_v = "O"
  # Backtrack to find the most likely path
  for t in range(n , 0, -1):
    max_v = parent_arr[t].get(max_v)
    if max_v == None:
      max_v = "O"
    most_likely_sequence[t-1] = max_v

  return most_likely_sequence


In [295]:
def viterbi_loop(data, transmission, emission, ls_of_words):
  results =[]
  for document in data:
    results.append(viterbi(document, transmission, emission, ls_of_words))
  return results

In [296]:
# Writes the prediction from trained data into the dev.in file and output
def assign_prediction(prediction, data, path):
    if (len(prediction) != len(data)):
        return "Error, prediction length != data length"
    file = open(path, "w", encoding="utf-8")
    n = len(data)
    for i in range(n):
        assert( len(prediction[i])== len(data[i]))
        m = len(data[i])
        for j in range(m):
            file.write(data[i][j] + " " + prediction[i][j] + "\n")
        file.write("\n")
    print("Wrote predictions to", path)
    return



In [297]:
def write_predictions_part_2(dev_in_data_path, dev_out_data_path, train_data_path, output_path):
  #sort train data into tag and words
  train_data = read_train_data(train_data_path)
  train_words, train_tags = split_words_tags(train_data)
  train_ls_of_tags = count_unique_tags(train_tags)
  train_ls_of_words = count_unique_words(train_words)
  # Get Emission Parameters
  k = 1
  emission_params = get_emission_parameters(train_ls_of_tags, train_ls_of_words, train_tags, train_words, k)

  #append start and stop to train data per document
  train_data_modified = read_train_data_p2(train_data_path)
  train_words_modified, train_tags_modified = split_words_tags(train_data_modified)
  train_ls_of_tags_modified = count_unique_tags(train_tags_modified)
  transmission_params = get_transmission_parameters(train_ls_of_tags_modified, train_tags_modified)

  #log transmission and emission params to avoid underflow
  log_emission = log_underflow_prevention(emission_params)
  log_transmission = log_underflow_prevention(transmission_params)
  # read dev_in in list of list
  dev_in_list = read_dev(dev_in_data_path)
  
  # run viterbi and get predictions for the whole dev_in
  predictions = viterbi_loop(dev_in_list, log_transmission, log_emission, train_ls_of_words)
  # write predictions into dev.p2.out
  assign_prediction(predictions, dev_in_list, output_path)

def calculate_scores_part_2(dev_out_data_path, output_path):
  #gold standard
  gold_standard = read_dev_out_data(dev_out_data_path)
  #test predictions 
  test_data = read_dev_out_data(output_path)
  test_prediction = []
  for i in range(len(test_data)):
    if test_data[i][0] == '':
      test_prediction.append(("", ""))
    else:
      test_prediction.append((test_data[i][0], test_data[i][1]))
  # Precision = Total number of correctly predicted entities / Total number of predicted entities

  precision = get_precision(test_prediction, gold_standard)
  print("Precision: ", precision)

  # Recall = Total number of correctly predicted entities / Total number of entities in the gold standard
  recall = get_recall(test_prediction, gold_standard)
  print("Recall: ", recall)

  # F score
  f_score = get_f_score(precision, recall)
  print("F Score: ", f_score)

def calculate_part_2(dev_in_data_path, dev_out_data_path, train_data_path, output_path):
  write_predictions_part_2(dev_in_data_path, dev_out_data_path, train_data_path, output_path)
  calculate_scores_part_2(dev_out_data_path, output_path)



In [298]:
# Different Language

# For Spanish

print("For Spanish: ")
output_path = os.path.join("Data", "ES" , "dev.p2.out")
calculate_part_2(ES_dev_in_data_path, ES_dev_out_data_path, ES_train_data_path, output_path)

print("\n")

# For Russian
print("For Russian: ")
output_path = os.path.join("Data", "RU" , "dev.p2.out")
calculate_part_2(RU_dev_in_data_path, RU_dev_out_data_path, RU_train_data_path, output_path)

For Spanish: 
This is Count(y) : {'O': 29035, 'B-negative': 381, 'I-negative': 171, 'B-positive': 1160, 'I-neutral': 43, 'I-positive': 314, 'B-neutral': 72}
This is Count(y -> x) : {('O', 'Estuvimos'): 6, ('O', 'hace'): 26, ('O', 'poco'): 55, ('O', 'mi'): 72, ('O', 'pareja'): 13, ('O', 'y'): 1024, ('O', 'yo'): 36, ('O', 'comiendo'): 10, ('O', 'resultó'): 4, ('O', 'todo'): 115, ('O', 'muy'): 396, ('O', 'bien'): 165, ('O', ','): 1664, ('O', 'tanto'): 39, ('O', 'la'): 755, ('B-positive', 'comida'): 169, ('O', 'el'): 642, ('B-positive', 'vino'): 6, ('B-positive', 'trato'): 44, ('B-positive', 'decoración'): 7, ('O', '…'): 45, ('O', 'nos'): 146, ('O', 'gustó'): 11, ('O', 'mucho'): 53, ('O', '.'): 1623, ('O', 'Por'): 30, ('O', 'poner'): 5, ('O', 'algún'): 6, ('O', 'pero'): 191, ('O', 'quizá'): 4, ('B-negative', 'jamón'): 1, ('O', 'no'): 369, ('O', 'era'): 52, ('O', 'lo'): 270, ('O', '"'): 39, ('O', 'ibérico'): 2, ('O', 'que'): 845, ('O', 'cabía'): 1, ('O', 'esperar'): 5, ('O', 'Bien'): 6, ('O

# Part 3

top k-best viterbi

In [299]:
def k_best_viterbi(document, transmission, emission, ls_of_words, k):
    n = len(document)
    tags = ["O", "B-positive", "B-neutral", "B-negative", "I-positive","I-neutral","I-negative"]
    # store k-best paths
    k_best_paths = [("START", 0, []) for _ in range(k)]

    for i in range(0, n):
        new_k_best_paths = []
        for tag, path_prob, prev_path in k_best_paths:
            for next_tag in tags:  
              a_uv = transmission.get((tag, next_tag)) 
              if document[i] in ls_of_words:
                b_uo = emission.get((next_tag, document[i])) or -np.inf
              else:
                b_uo = emission.get((next_tag, "UNK")) or -np.inf
              new_prob = path_prob + a_uv + b_uo
              if (i==0):
                 new_k_best_paths.append((next_tag, new_prob, []))
              elif (next_tag, new_prob, prev_path + [tag]) not in new_k_best_paths:
                new_k_best_paths.append((next_tag, new_prob, prev_path + [tag]))
        
        #sort top 8, drop the rest
        new_k_best_paths.sort(key=lambda x: x[1], reverse=True)
        k_best_paths = new_k_best_paths[:k]

    #termination step:
    final_k_best_paths = []

    for tag, path_prob, prev_path in k_best_paths:
        final_prob = path_prob + transmission.get((tag, "STOP"))
        final_k_best_paths.append((tag, final_prob, prev_path + [tag]))
    
    final_k_best_paths.sort(key=lambda x: x[1], reverse=True)
    
    # Backtracking
    kth_best_path = final_k_best_paths
    if len(kth_best_path) >= 8: 
        # get 2nd and 8th best sequence
        second = kth_best_path[1]
        eighth = kth_best_path[7]
    else:
        # Get last output sequence if list is shorter than 8
        second = kth_best_path[1]
        eighth = kth_best_path[-1]
    
    return [second, eighth]  # The k-th best path (list of states)


In [300]:
def write_predictions_part_3(dev_in_data_path, dev_out_2nd_data_path, dev_out_8th_data_path, train_data_path, dev_out):
  #sort train data into tag and words
  train_data = read_train_data(train_data_path)
  train_words, train_tags = split_words_tags(train_data)
  train_ls_of_tags = count_unique_tags(train_tags)
  train_ls_of_words = count_unique_words(train_words)
  # Get Emission Parameters
  emission_k = 1
  emission_params = get_emission_parameters(train_ls_of_tags, train_ls_of_words, train_tags, train_words, emission_k)

  #append start and stop to train data per document
  train_data_modified = read_train_data_p2(train_data_path)
  train_words_modified, train_tags_modified = split_words_tags(train_data_modified)
  train_ls_of_tags_modified = count_unique_tags(train_tags_modified)
  transmission_params = get_transmission_parameters(train_ls_of_tags_modified, train_tags_modified)

  #log transmission and emission params to avoid underflow
  log_emission = log_underflow_prevention(emission_params)
  log_transmission = log_underflow_prevention(transmission_params)
  # read dev_in in list of list
  dev_in_list = read_dev(dev_in_data_path)
  
  # run kth-best-viterbi_loop and get predictions for the whole dev_in, store best 8 sequences
  k = 8
  total_second_seq = []
  total_eighth_seq = []
  for data in dev_in_list:
    predictions = k_best_viterbi(data, log_transmission, log_emission, train_ls_of_words, k)
    # print("2nd best sequence: ", "prob:", predictions[0][1], "\n sequence: ", predictions[0][2])
    # print("8th best sequence: ", "prob:", predictions[1][1], "\n sequence: ", predictions[1][2])

    #tags for 2nd best sequence
    second_seq = predictions[0][2]
    total_second_seq.append(second_seq)
    #tags for 8th best sequence
    eighth_seq = predictions[1][2]
    total_eighth_seq.append(eighth_seq)

  
  
  # write predictions into dev.p3.2nd.out and dev.p3.8th.out
  assign_prediction(total_second_seq, dev_in_list, dev_out_2nd_data_path)
  assign_prediction(total_eighth_seq, dev_in_list, dev_out_8th_data_path)
  
  # calculate scores for predictions
  print("Second best sequence", total_second_seq)
  print("Scores for 2nd best sequence")
  calculate_scores_part_2(dev_out, dev_out_2nd_data_path)
  print("8th best sequence", total_eighth_seq)
  print("\nScores for 8th best sequence")
  calculate_scores_part_2(dev_out, dev_out_8th_data_path)

In [301]:
ES_dev_out_p3_2nd_data_path = os.path.join("Data", "ES" , "dev.p3.2nd.out")
ES_dev_out_p3_8th_data_path = os.path.join("Data", "ES" , "dev.p3.8th.out")
RU_dev_out_p3_2nd_data_path = os.path.join("Data", "RU" , "dev.p3.2nd.out")
RU_dev_out_p3_8th_data_path = os.path.join("Data", "RU" , "dev.p3.8th.out")


write_predictions_part_3(ES_dev_in_data_path, ES_dev_out_p3_2nd_data_path, ES_dev_out_p3_8th_data_path, ES_train_data_path, ES_dev_out_data_path)

This is Count(y) : {'O': 29035, 'B-negative': 381, 'I-negative': 171, 'B-positive': 1160, 'I-neutral': 43, 'I-positive': 314, 'B-neutral': 72}
This is Count(y -> x) : {('O', 'Estuvimos'): 6, ('O', 'hace'): 26, ('O', 'poco'): 55, ('O', 'mi'): 72, ('O', 'pareja'): 13, ('O', 'y'): 1024, ('O', 'yo'): 36, ('O', 'comiendo'): 10, ('O', 'resultó'): 4, ('O', 'todo'): 115, ('O', 'muy'): 396, ('O', 'bien'): 165, ('O', ','): 1664, ('O', 'tanto'): 39, ('O', 'la'): 755, ('B-positive', 'comida'): 169, ('O', 'el'): 642, ('B-positive', 'vino'): 6, ('B-positive', 'trato'): 44, ('B-positive', 'decoración'): 7, ('O', '…'): 45, ('O', 'nos'): 146, ('O', 'gustó'): 11, ('O', 'mucho'): 53, ('O', '.'): 1623, ('O', 'Por'): 30, ('O', 'poner'): 5, ('O', 'algún'): 6, ('O', 'pero'): 191, ('O', 'quizá'): 4, ('B-negative', 'jamón'): 1, ('O', 'no'): 369, ('O', 'era'): 52, ('O', 'lo'): 270, ('O', '"'): 39, ('O', 'ibérico'): 2, ('O', 'que'): 845, ('O', 'cabía'): 1, ('O', 'esperar'): 5, ('O', 'Bien'): 6, ('O', 'sabe'): 5,

# part 4

In [306]:
def getTrainWords(train_data_path):
  train_data = read_train_data(ES_train_data_path)
  print(train_data)
  words = []
  for word_tag in train_data:
      if len(word_tag) != 2:
          words.append(word_tag[0])
      else:
        #word_tag is a list
        word = word_tag[0]
      words.append(word)

  return words

print(getTrainWords(ES_train_data_path))

[['Estuvimos', 'O'], ['hace', 'O'], ['poco', 'O'], ['mi', 'O'], ['pareja', 'O'], ['y', 'O'], ['yo', 'O'], ['comiendo', 'O'], ['y', 'O'], ['resultó', 'O'], ['todo', 'O'], ['muy', 'O'], ['bien', 'O'], [',', 'O'], ['tanto', 'O'], ['la', 'O'], ['comida', 'B-positive'], [',', 'O'], ['el', 'O'], ['vino', 'B-positive'], [',', 'O'], ['el', 'O'], ['trato', 'B-positive'], [',', 'O'], ['la', 'O'], ['decoración', 'B-positive'], ['…', 'O'], ['nos', 'O'], ['gustó', 'O'], ['todo', 'O'], ['mucho', 'O'], ['.', 'O'], [''], ['Por', 'O'], ['poner', 'O'], ['algún', 'O'], ['pero', 'O'], [',', 'O'], ['quizá', 'O'], ['el', 'O'], ['jamón', 'B-negative'], ['no', 'O'], ['era', 'O'], ['todo', 'O'], ['lo', 'O'], ['"', 'O'], ['ibérico', 'O'], ['"', 'O'], ['que', 'O'], ['cabía', 'O'], ['esperar', 'O'], ['.', 'O'], [''], ['Bien', 'O'], ['lo', 'O'], ['sabe', 'O'], ['el', 'O'], ['autor', 'O'], ['del', 'O'], ['blog', 'O'], ['.', 'O'], [')', 'O'], [''], ['Comida', 'B-positive'], ['exquisita', 'O'], ['.', 'O'], [''], ['Re

In [307]:
def add_start_stop_to_train(train_data):
    document = ['START']
    result = []
    #Add start state
    for word in train_data:
      if word == "":
        # include stop and start states at new sentence
        document.append("STOP")
        result.append(document)
        document= ["START"]
      else:
         document.append(word)

    return result

print(add_start_stop_to_train(getTrainWords(ES_train_data_path)))

[['Estuvimos', 'O'], ['hace', 'O'], ['poco', 'O'], ['mi', 'O'], ['pareja', 'O'], ['y', 'O'], ['yo', 'O'], ['comiendo', 'O'], ['y', 'O'], ['resultó', 'O'], ['todo', 'O'], ['muy', 'O'], ['bien', 'O'], [',', 'O'], ['tanto', 'O'], ['la', 'O'], ['comida', 'B-positive'], [',', 'O'], ['el', 'O'], ['vino', 'B-positive'], [',', 'O'], ['el', 'O'], ['trato', 'B-positive'], [',', 'O'], ['la', 'O'], ['decoración', 'B-positive'], ['…', 'O'], ['nos', 'O'], ['gustó', 'O'], ['todo', 'O'], ['mucho', 'O'], ['.', 'O'], [''], ['Por', 'O'], ['poner', 'O'], ['algún', 'O'], ['pero', 'O'], [',', 'O'], ['quizá', 'O'], ['el', 'O'], ['jamón', 'B-negative'], ['no', 'O'], ['era', 'O'], ['todo', 'O'], ['lo', 'O'], ['"', 'O'], ['ibérico', 'O'], ['"', 'O'], ['que', 'O'], ['cabía', 'O'], ['esperar', 'O'], ['.', 'O'], [''], ['Bien', 'O'], ['lo', 'O'], ['sabe', 'O'], ['el', 'O'], ['autor', 'O'], ['del', 'O'], ['blog', 'O'], ['.', 'O'], [')', 'O'], [''], ['Comida', 'B-positive'], ['exquisita', 'O'], ['.', 'O'], [''], ['Re