# **50.007 ML 1D Project**
By Darren Chan Yu Hao

## Setup

In [58]:
import numpy as np
import os
import copy as copy

In [59]:
# Directory the notebook process was started from.
cwd = os.getcwd()
# Seed NumPy's global random number generator with a fixed value
# (presumably so that any randomised steps are repeatable between runs).
np.random.seed(1993)

## Reading Files

In [60]:
# Functions to read data

# Read dev.in data
def read_dev_in_data(filepath):
    """Read an unlabelled data file and return its lines.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        A list with one string per line of the file, each stripped of
        leading/trailing whitespace. Blank lines are kept as empty strings.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]

# Read dev.out data
def read_dev_out_data(filepath):
    """Read a labelled (gold standard) file and return its tokenised lines.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        A list with one entry per line. Each entry is the stripped line
        split on single spaces, so a "word tag" line becomes
        ``[word, tag]`` and a blank line becomes ``[""]``.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        return [raw_line.strip().split(" ") for raw_line in handle]

# Read train data
def read_train_data(filepath):
    """Read a training file and return its tokenised lines.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        A list with one entry per line. Each entry is the stripped line
        split on single spaces, so a "word tag" line becomes
        ``[word, tag]`` and a blank line becomes ``[""]``.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        return [raw_line.strip().split(" ") for raw_line in handle]

def read_train_data_with_start_end(filepath):
    """Read a training file and bracket every sentence with START/STOP markers.

    Sentences in the file are separated by blank lines. Each sentence is
    returned as ``["", "START"]``, followed by its ``[word, tag]`` lines,
    followed by ``["", "STOP"]``. Consecutive or trailing blank lines do not
    produce empty START/STOP pairs, and a final sentence that is not followed
    by a blank line is still closed with a STOP marker.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        A list of token lists. Marker entries use an empty word and the tag
        "START" or "STOP"; every other entry is the stripped line split on
        single spaces.
    """
    results = []
    in_sentence = False

    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            stripped_line = line.strip().split(" ")

            # Any line with fewer than two fields is treated as a sentence
            # separator (a blank line splits to [""]).
            if len(stripped_line) < 2:
                if in_sentence:
                    results.append(["", "STOP"])
                    in_sentence = False
                continue

            # Open a new sentence lazily, right before its first token, so
            # that every START is always followed by at least one token.
            if not in_sentence:
                results.append(["", "START"])
                in_sentence = True

            results.append(stripped_line)

    # The file may end without a trailing blank line.
    if in_sentence:
        results.append(["", "STOP"])

    return results

In [61]:
# Get path of the data
#------------------------------------
# Spanish: ES (dev input, dev gold output, training file)
ES_dev_in_data_path, ES_dev_out_data_path, ES_train_data_path = (
    os.path.join("Data", "ES", split_name)
    for split_name in ("dev.in", "dev.out", "train")
)

# Russian: RU (dev input, dev gold output, training file)
RU_dev_in_data_path, RU_dev_out_data_path, RU_train_data_path = (
    os.path.join("Data", "RU", split_name)
    for split_name in ("dev.in", "dev.out", "train")
)
#------------------------------------

## Part 1

In [62]:
# Split words and tags
def split_words_tags(labeled_data):
    """Separate labelled entries into parallel word and tag lists.

    Args:
        labeled_data: Iterable of sequences. Only entries with exactly two
            elements (``[word, tag]``) are used; all others are skipped.

    Returns:
        A ``(words, tags)`` tuple of two lists of equal length, where
        ``words[i]`` and ``tags[i]`` come from the same entry.
    """
    words = []
    tags = []

    for entry in labeled_data:
        if len(entry) == 2:
            word, tag = entry[0], entry[1]
            words.append(word)
            tags.append(tag)

    return words, tags

# Count unique tags
def count_unique_tags(tags_ls):
    """Return the set of distinct tags found in *tags_ls*.

    Despite the name, the result is the set of unique tags itself rather
    than a count.
    """
    return set(tags_ls)

# Count unique words
def count_unique_words(words_ls):
    """Return the set of distinct words found in *words_ls*.

    Despite the name, the result is the set of unique words itself rather
    than a count.
    """
    return {word for word in words_ls}

In [63]:
# Emission Parameters

# Get the emission parameters
def get_emission_parameters(ls_of_tags, ls_of_words, tags, words, k=1):
    """Estimate smoothed emission probabilities e(x|y) from training data.

    For every (tag, word) pair seen in training:
        e(x|y) = Count(y -> x) / (Count(y) + k)
    For the placeholder word "UNK" (standing in for unseen words):
        e(UNK|y) = k / (Count(y) + k)

    Args:
        ls_of_tags: Iterable of unique tags; its iteration order fixes the
            order of the Count(y) table and of the "UNK" entries.
        ls_of_words: Unique words (accepted but not used by this function).
        tags: List of all training tags, aligned with *words*.
        words: List of all training words, aligned with *tags*.
        k: Smoothing constant added to every tag count.

    Returns:
        Dict mapping ``(tag, word)`` tuples to emission probabilities, e.g.
        ``emission_parameters[("O", "apple")] = 0.00019841269``. Seen pairs
        come first (in order of first appearance), then one "UNK" entry per
        tag.
    """
    # Count(y): how often each unique tag occurs among all training tags.
    tag_totals = {label: tags.count(label) for label in ls_of_tags}

    print(f"This is Count(y) : {tag_totals}")

    # Count(y -> x): how often each word is labelled with each tag.
    pair_totals = {}
    for pair in zip(tags, words):
        pair_totals[pair] = pair_totals.get(pair, 0) + 1

    print(f"This is Count(y -> x) : {pair_totals}")

    # Emission probability of every pair observed in training.
    emission_parameters = {}
    for (label, token), occurrences in pair_totals.items():
        probability = occurrences / (tag_totals[label] + k)
        emission_parameters[(label, token)] = probability
        if token == "con":
            print(label, probability)

    # Words that never appear in training share the "UNK" entry of each tag.
    for label, total in tag_totals.items():
        emission_parameters[(label, "UNK")] = k / (total + k)

    print(f"This is e(x|y) : {emission_parameters}")

    return emission_parameters

In [64]:
def assign_estimate_tags(test_words, emission_params, train_ls_of_words):
    """Tag every test word with y* = argmax_y e(x|y).

    Args:
        test_words: List of test tokens; an empty string marks a sentence
            boundary.
        emission_params: Dict mapping ``(tag, word)`` to e(word|tag). Words
            unseen in training are scored with the entries whose word is
            "UNK".
        train_ls_of_words: Collection of words seen in training.

    Returns:
        A list of ``(word, tag)`` tuples, one per entry of *test_words* and
        in the same order. Sentence boundaries are returned as ``("", "")``.
        Unknown words keep their original spelling and receive the tag with
        the highest "UNK" emission probability (or an empty tag when
        *emission_params* has no "UNK" entry).
    """
    unknown_word = "UNK"

    # One pass over the emission table to find the best tag for every word,
    # instead of rescanning the whole table for each test token. The strict
    # ">" keeps the first tag encountered when probabilities tie, and the
    # initial 0 means only strictly positive probabilities can win.
    best_by_word = {}
    for (tag, word), probability in emission_params.items():
        if probability > best_by_word.get(word, (0, None))[0]:
            best_by_word[word] = (probability, tag)

    unknown_tag = best_by_word.get(unknown_word, (0, ""))[1]
    known_words = set(train_ls_of_words)

    predicted_results = []
    for word in test_words:
        if word in known_words and word in best_by_word:
            predicted_results.append((word, best_by_word[word][1]))
        elif word == "":
            # Sentence separator: keep it so the output stays line-aligned
            # with the input.
            predicted_results.append(("", ""))
        else:
            # Unseen word: score it as "UNK" but report the real token so
            # the predictions remain comparable with the gold standard.
            predicted_results.append((word, unknown_tag))

    print("predicted_results: ", predicted_results)
    return predicted_results
    

In [65]:
def get_precision(test_labels, gold_standard):
    """Return the fraction of predicted (word, tag) pairs found in the gold standard.

    Precision = correctly predicted pairs / total predicted pairs.

    Args:
        test_labels: Iterable of predicted ``(word, tag)`` pairs. Sentence
            separators ``("", "")`` are ignored, mirroring how blank lines
            are dropped from the gold standard.
        gold_standard: Iterable of token lists as read from the gold file;
            entries with fewer than two fields (blank lines) are skipped.

    Returns:
        The precision as a float, or 0.0 when there are no predictions.
    """
    gold_pairs = [(entry[0], entry[1]) for entry in gold_standard if len(entry) >= 2]

    print(f"This is the gold standard: {gold_pairs} \n")

    # Set membership is O(1); the list above is kept only for the printout.
    gold_lookup = set(gold_pairs)

    total_predicted = 0
    total_correct = 0

    for predicted_pair in test_labels:
        pair = tuple(predicted_pair)

        # A sentence separator is not a prediction.
        if pair == ("", ""):
            continue

        total_predicted += 1
        if pair in gold_lookup:
            total_correct += 1

    if total_predicted == 0:
        return 0.0

    return total_correct / total_predicted

In [66]:
def get_recall(test_labels, gold_standard):
    """Return the fraction of gold (word, tag) pairs that were predicted.

    Recall = gold pairs present among the predictions / total gold pairs.

    Args:
        test_labels: Iterable of predicted ``(word, tag)`` pairs.
        gold_standard: Iterable of token lists as read from the gold file;
            entries with fewer than two fields (blank lines) are skipped.

    Returns:
        The recall as a float, or 0.0 when the gold standard has no pairs.
    """
    gold_pairs = [(entry[0], entry[1]) for entry in gold_standard if len(entry) >= 2]

    if not gold_pairs:
        return 0.0

    # Build the lookup once so each gold pair is checked in O(1).
    predicted_lookup = {tuple(pair) for pair in test_labels}

    total_correct = sum(1 for pair in gold_pairs if pair in predicted_lookup)

    return total_correct / len(gold_pairs)

In [67]:
def get_f_score(precision, recall):
    """Return the F score (harmonic mean) of *precision* and *recall*.

    Returns 0.0 when either value is zero (or negative), where the harmonic
    mean would otherwise require a division by zero.
    """
    if precision <= 0 or recall <= 0:
        return 0.0
    return 2/((1/precision) + (1/recall))

In [68]:
def calculate_part_1(dev_in_data_path, dev_out_data_path, train_data_path, output_path):
    """Run Part 1: emission-only tagging, output writing and scoring.

    Trains emission parameters (k = 1) on the training file, tags every
    token of the dev input with its highest-emission tag, writes the
    predictions to *output_path* as "word tag" lines, and prints precision,
    recall and F score against the dev gold file.

    Args:
        dev_in_data_path: Path of the unlabelled dev input file.
        dev_out_data_path: Path of the gold-standard dev file.
        train_data_path: Path of the labelled training file.
        output_path: Path the predictions are written to.
    """
    # Training side: words, tags and their unique values.
    train_words, train_tags = split_words_tags(read_train_data(train_data_path))
    unique_tags = count_unique_tags(train_tags)
    unique_words = count_unique_words(train_words)

    # Emission parameters with smoothing constant k.
    smoothing_k = 1
    emission_params = get_emission_parameters(
        unique_tags, unique_words, train_tags, train_words, smoothing_k
    )

    # Predict a tag for every dev token.
    dev_tokens = read_dev_in_data(dev_in_data_path)
    predictions = assign_estimate_tags(dev_tokens, emission_params, unique_words)

    with open(output_path, "w+", encoding="utf-8") as out_file:
        for entry in predictions:
            out_file.write(" ".join((entry[0], entry[1])) + "\n")

    gold_standard = read_dev_out_data(dev_out_data_path)

    # Precision = correctly predicted entities / predicted entities
    precision = get_precision(predictions, gold_standard)
    print("Precision: ", precision)

    # Recall = correctly predicted entities / entities in the gold standard
    recall = get_recall(predictions, gold_standard)
    print("Recall: ", recall)

    # F score combines the two.
    f_score = get_f_score(precision, recall)
    print("F Score: ", f_score)

In [69]:
# Run Part 1 once per language; each run writes its predictions to
# Data/<LANG>/dev.p1.out and prints precision, recall and F score.

# For Spanish

print("For Spanish: ")
output_path = os.path.join("Data", "ES" , "dev.p1.out")
calculate_part_1(ES_dev_in_data_path, ES_dev_out_data_path, ES_train_data_path, output_path)

# Blank lines to separate the two languages in the cell output.
print("\n")

# For Russian
print("For Russian: ")
output_path = os.path.join("Data", "RU" , "dev.p1.out")
calculate_part_1(RU_dev_in_data_path, RU_dev_out_data_path, RU_train_data_path, output_path)

For Spanish: 
This is Count(y) : {'B-neutral': 72, 'I-neutral': 43, 'B-negative': 381, 'I-negative': 171, 'B-positive': 1160, 'O': 29035, 'I-positive': 314}
This is Count(y -> x) : {('O', 'Estuvimos'): 6, ('O', 'hace'): 26, ('O', 'poco'): 55, ('O', 'mi'): 72, ('O', 'pareja'): 13, ('O', 'y'): 1024, ('O', 'yo'): 36, ('O', 'comiendo'): 10, ('O', 'resultó'): 4, ('O', 'todo'): 115, ('O', 'muy'): 396, ('O', 'bien'): 165, ('O', ','): 1664, ('O', 'tanto'): 39, ('O', 'la'): 755, ('B-positive', 'comida'): 169, ('O', 'el'): 642, ('B-positive', 'vino'): 6, ('B-positive', 'trato'): 44, ('B-positive', 'decoración'): 7, ('O', '…'): 45, ('O', 'nos'): 146, ('O', 'gustó'): 11, ('O', 'mucho'): 53, ('O', '.'): 1623, ('O', 'Por'): 30, ('O', 'poner'): 5, ('O', 'algún'): 6, ('O', 'pero'): 191, ('O', 'quizá'): 4, ('B-negative', 'jamón'): 1, ('O', 'no'): 369, ('O', 'era'): 52, ('O', 'lo'): 270, ('O', '"'): 39, ('O', 'ibérico'): 2, ('O', 'que'): 845, ('O', 'cabía'): 1, ('O', 'esperar'): 5, ('O', 'Bien'): 6, ('O

## Part 2

In [70]:
def get_transition_parameters(ls_of_tags, tags):
    """Estimate transition probabilities q(y_i | y_{i-1}) by MLE.

        q(y_i | y_{i-1}) = Count(y_{i-1}, y_i) / Count(y_{i-1})

    Args:
        ls_of_tags: Iterable of unique tags; only these are considered as
            previous tags.
        tags: List of ALL tags in corpus order, with sentence boundaries
            marked by "START" and "STOP" tags.

    Returns:
        Dict mapping ``(previous_tag, current_tag)`` tuples to transition
        probabilities. Only transitions observed in *tags* are present.
        Pairs whose previous tag is "STOP" are skipped because they join
        the end of one sentence to the start of the next and are not real
        transitions.
    """
    # Count(y_{i-1}): occurrences of every tag, gathered in a single pass.
    count_yi_minus_1 = dict.fromkeys(ls_of_tags, 0)
    for tag in tags:
        if tag in count_yi_minus_1:
            count_yi_minus_1[tag] += 1

    # Count(y_{i-1}, y_i): occurrences of every adjacent tag pair.
    count_yi_minus_1_yi = {}
    for previous_tag, current_tag in zip(tags, tags[1:]):
        if previous_tag == "STOP" or previous_tag not in count_yi_minus_1:
            continue
        pair = (previous_tag, current_tag)
        count_yi_minus_1_yi[pair] = count_yi_minus_1_yi.get(pair, 0) + 1

    # Get the transition parameters
    transition_parameters = {
        pair: pair_count / count_yi_minus_1[pair[0]]
        for pair, pair_count in count_yi_minus_1_yi.items()
    }

    return transition_parameters





In [71]:
def calculate_part_2(dev_in_data_path, dev_out_data_path, train_data_path, output_path):
    """Run the Part 2 pipeline as far as it is currently implemented.

    Loads the training data with START/STOP markers, estimates transition
    parameters from its tags, and reads the dev input.

    NOTE(review): nothing is decoded, written to *output_path* or scored
    against *dev_out_data_path* yet; the function returns None.

    Args:
        dev_in_data_path: Path of the unlabelled dev input file.
        dev_out_data_path: Path of the gold-standard dev file (unused).
        train_data_path: Path of the labelled training file.
        output_path: Path intended for the predictions (unused).
    """
    # Training tags, including the sentence boundary markers.
    marked_train_data = read_train_data_with_start_end(train_data_path)
    train_words, train_tags = split_words_tags(marked_train_data)
    unique_tags = count_unique_tags(train_tags)

    transition_params = get_transition_parameters(unique_tags, train_tags)

    test_data = read_dev_in_data(dev_in_data_path)
    

In [72]:
# Run Part 2 once per language. Predictions go to Data/<LANG>/dev.p2.out so
# that they do not overwrite the Part 1 results stored in dev.p1.out.

# For Spanish

print("For Spanish: ")
output_path = os.path.join("Data", "ES", "dev.p2.out")
calculate_part_2(ES_dev_in_data_path, ES_dev_out_data_path, ES_train_data_path, output_path)

# Blank lines to separate the two languages in the cell output.
print("\n")

# For Russian
print("For Russian: ")
output_path = os.path.join("Data", "RU", "dev.p2.out")
calculate_part_2(RU_dev_in_data_path, RU_dev_out_data_path, RU_train_data_path, output_path)

For Spanish: 
['Estuvimos', 'O']
['hace', 'O']
['poco', 'O']
['mi', 'O']
['pareja', 'O']
['y', 'O']
['yo', 'O']
['comiendo', 'O']
['y', 'O']
['resultó', 'O']
['todo', 'O']
['muy', 'O']
['bien', 'O']
[',', 'O']
['tanto', 'O']
['la', 'O']
['comida', 'B-positive']
[',', 'O']
['el', 'O']
['vino', 'B-positive']
[',', 'O']
['el', 'O']
['trato', 'B-positive']
[',', 'O']
['la', 'O']
['decoración', 'B-positive']
['…', 'O']
['nos', 'O']
['gustó', 'O']
['todo', 'O']
['mucho', 'O']
['.', 'O']
['']
['Por', 'O']
['poner', 'O']
['algún', 'O']
['pero', 'O']
[',', 'O']
['quizá', 'O']
['el', 'O']
['jamón', 'B-negative']
['no', 'O']
['era', 'O']
['todo', 'O']
['lo', 'O']
['"', 'O']
['ibérico', 'O']
['"', 'O']
['que', 'O']
['cabía', 'O']
['esperar', 'O']
['.', 'O']
['']
['Bien', 'O']
['lo', 'O']
['sabe', 'O']
['el', 'O']
['autor', 'O']
['del', 'O']
['blog', 'O']
['.', 'O']
[')', 'O']
['']
['Comida', 'B-positive']
['exquisita', 'O']
['.', 'O']
['']
['Restaurante', 'B-positive']
['diferente', 'O']
[',', 'O'