# **50.007 ML 1D Project**
By Darren Chan Yu Hao

## Setup

In [146]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from PIL import Image
import copy as copy
from scipy.optimize import minimize

In [147]:
cwd = os.getcwd()
np.random.seed(1993)

## Reading Files

In [148]:
# Functions to read data

# Read dev.in data
def read_dev_in_data(filepath):
    """Load an unlabeled data file as a flat list of stripped lines.

    Every line of the file yields exactly one entry with its surrounding
    whitespace removed, so a blank line is kept as an empty string.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A list of strings, one per line, in file order.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]

# Read dev.out data
def read_dev_out_data(filepath):
    """Load a labeled data file as a list of space-separated token lists.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one entry per line of the file; each entry is the
        stripped line split on single spaces.  A blank line therefore
        produces ``[""]``.
    """
    rows = []
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            rows.append(raw_line.strip().split(" "))
    return rows

# Read train data
def read_train_data(filepath):
    """Load the training file as a list of space-separated token lists.

    Args:
        filepath: Path of the UTF-8 encoded training file.

    Returns:
        A list with one entry per line of the file; each entry is the
        stripped line split on single spaces (``[""]`` for a blank line).
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    return [raw_line.strip().split(" ") for raw_line in raw_lines]

In [149]:
# Locations of the data files, relative to the current working directory
#------------------------------------
# Spanish: ES
ES_dev_in_data_path, ES_dev_out_data_path, ES_train_data_path = (
    os.path.join("Data", "ES", file_name)
    for file_name in ("dev.in", "dev.out", "train")
)

# Russian: RU
RU_dev_in_data_path, RU_dev_out_data_path, RU_train_data_path = (
    os.path.join("Data", "RU", file_name)
    for file_name in ("dev.in", "dev.out", "train")
)
#------------------------------------

# Sanity check: dump the parsed contents of each Spanish file
print(read_dev_in_data(ES_dev_in_data_path))
print(read_dev_out_data(ES_dev_out_data_path))
print(read_train_data(ES_train_data_path))

['Plato', 'degustación', ':', 'un', 'poco', 'abundante', 'de', 'más', ',', 'pero', 'bien', 'cocinado', '.', '', 'restaurante', 'excelente', 'con', 'carne', 'de', 'alta', 'calidad', '.', '', 'Las', 'posibilidades', 'en', 'el', 'restaurante', 'son', 'fundamentalmente', 'tres', ';', 'carta', 'normal', ',', 'menú', 'degustacion', 'y', 'una', 'opción', 'intermedia', 'que', 'es', 'una', 'selección', 'de', 'primeros', 'y', 'postres', 'y', 'carta', 'para', 'el', 'segundo', '.', '', 'No', 'perderse', 'el', 'sorbete', 'de', 'mojito', '.', '', 'para', 'mi', 'perfecto', '!', '', 'Devolucion', 'a', 'cocina', ',', 'amabilidad', 'de', 'camarera', ',', 'requerimiento', 'de', 'cuenta', 'y', 'adios', '.', '', 'Así', 'como', 'el', 'romesco', ',', 'que', 'era', 'un', 'poco', '"', 'de', 'bote', '"', '.', '', 'Destacar', 'los', 'arroces', ',', 'la', 'caldereta', 'de', 'bogavante', ',', 'las', 'zamburiñas', 'al', 'horno', 'y', 'los', 'platos', 'de', '"', 'picoteo', '"', 'y', 'los', 'pescados', 'en', 'general

## Part 1

In [150]:
# Split words and tags
def split_words_tags(labeled_data):
    """Separate labeled entries into two parallel lists of words and tags.

    Args:
        labeled_data: Iterable of sequences.  Only entries whose length is
            exactly 2 are used (first item is the word, second the tag);
            every other entry is skipped.

    Returns:
        A ``(words, tags)`` tuple of lists in which ``words[i]`` and
        ``tags[i]`` come from the same entry.
    """
    pairs = [entry for entry in labeled_data if len(entry) == 2]
    words = [entry[0] for entry in pairs]
    tags = [entry[1] for entry in pairs]
    return words, tags

# Count unique tags
def count_unique_tags(tags_ls):
    """Return the set of distinct tags occurring in ``tags_ls``.

    Despite the name, the result is the set of unique tags itself, not a
    number; callers take ``len()`` of it when they need the count.
    """
    return set(tags_ls)

# Count unique words
def count_unique_words(words_ls):
    """Return the set of distinct words occurring in ``words_ls``.

    Despite the name, the result is the set of unique words itself, not a
    number.
    """
    distinct_words = set()
    distinct_words.update(words_ls)
    return distinct_words

In [151]:
# Emission Parameters

# Get the emission parameters
def get_emission_parameters(ls_of_tags, ls_of_words, tags, words, k=1):
    """Estimate smoothed emission probabilities from the training set.

    For every (tag, word) pair observed in training the estimate is

        e(x|y) = Count(y -> x) / (Count(y) + k)

    where Count(y -> x) is the number of times word x is tagged with tag y
    and Count(y) is the number of times tag y appears.  In addition, every
    tag gets an entry for the placeholder word ``"UNK"`` with probability
    ``k / (Count(y) + k)``; it stands in for words that do not appear in
    the training set.  That entry overwrites any (tag, "UNK") estimate
    computed from the data.

    Args:
        ls_of_tags: Collection of unique tags.  Tags listed here but absent
            from ``tags`` get a count of 0.  Tags present in ``tags`` but
            missing here are counted as well instead of raising KeyError.
        ls_of_words: Collection of unique words.  Not used by the
            computation; kept so existing callers keep working.
        tags: List of all training tags, aligned with ``words``.
        words: List of all training words, aligned with ``tags``.
        k: Smoothing constant added to every tag count.

    Returns:
        A dict whose keys are ``(tag, word)`` tuples and whose values are
        the emission parameters e(x|y), e.g.
        ``emission_parameters[("O", "apple")] = 0.00019841269``.
    """
    # Count every tag in a single pass instead of rescanning the whole tag
    # list once per unique tag.
    tag_totals = {}
    for tag in tags:
        tag_totals[tag] = tag_totals.get(tag, 0) + 1

    # Count(y): keep the iteration order of ls_of_tags, then append any tag
    # that occurs in the data but was not listed by the caller.
    count_y = {tag: tag_totals.get(tag, 0) for tag in ls_of_tags}
    for tag, total in tag_totals.items():
        count_y.setdefault(tag, total)

    print(f"This is Count(y) : {count_y}")

    # Count(y -> x): number of times each (tag, word) pair occurs.
    count_y_to_x = {}
    for pair in zip(tags, words):
        count_y_to_x[pair] = count_y_to_x.get(pair, 0) + 1

    print(f"This is Count(y -> x) : {count_y_to_x}")

    # Emission parameters for the pairs observed in training.
    emission_parameters = {
        (tag, word): pair_count / (count_y[tag] + k)
        for (tag, word), pair_count in count_y_to_x.items()
    }

    # For words that do not appear in the training set, k / (Count(y) + k)
    # is used as the emission parameter.
    unknown_word = "UNK"
    for tag, tag_count in count_y.items():
        emission_parameters[(tag, unknown_word)] = k / (tag_count + k)

    print(f"This is e(x|y) : {emission_parameters}")

    return emission_parameters

In [152]:
def assign_estimate_tags(test_words, emission_params, train_ls_of_words):
    """Tag each test word with the tag of highest emission probability.

    For a word seen in training the prediction is y* = arg max_y e(x|y).
    For any other word the emission parameters of the placeholder word
    ``"UNK"`` are used instead.  Ties are resolved in favour of the tag
    that comes first in ``emission_params``; entries with a probability
    of 0 are never selected.

    Args:
        test_words: List of all words to label.
        emission_params: Dict mapping ``(tag, word)`` tuples to emission
            probabilities, including ``(tag, "UNK")`` entries.
        train_ls_of_words: Collection of the unique words seen in training.

    Returns:
        A list of ``(word, tag)`` tuples, one per entry of ``test_words``
        and in the same order.  The original word is always kept, also
        when the "UNK" parameters were used to choose its tag, so the
        result can be compared with gold-standard (word, tag) pairs.
        The tag is ``None`` when no positive emission probability is
        available for the word or for "UNK".
    """
    print(emission_params)

    # Find the best tag for every word in one pass over the emission table
    # instead of rescanning the whole table for each test word.
    best_by_word = {}
    for key, probability in emission_params.items():
        tag, word = key[0], key[1]
        best_so_far = best_by_word.get(word, (None, 0))[1]
        if probability > best_so_far:
            best_by_word[word] = (tag, probability)

    unknown_tag = best_by_word.get("UNK", (None, 0))[0]

    # Set membership keeps the per-word lookup cheap even if the caller
    # passes a list.
    known_words = set(train_ls_of_words)

    predicted_results = []
    for word in test_words:
        if word in known_words:
            # A known word without any positive emission falls back to the
            # "UNK" tag rather than reusing the previous word's tag.
            tag = best_by_word.get(word, (unknown_tag, 0))[0]
        else:
            tag = unknown_tag
        predicted_results.append((word, tag))

    print("predicted_results: ", predicted_results)
    return predicted_results
    

In [153]:
def get_precision(test_labels, gold_standard):
    """Compute the share of predicted (word, tag) pairs found in the gold data.

    Precision = number of correctly predicted pairs / number of predicted
    pairs.  A predicted pair counts as correct when an identical
    (word, tag) pair occurs anywhere in the gold standard; positions are
    not compared.

    Args:
        test_labels: Iterable of predicted ``(word, tag)`` tuples.
        gold_standard: Iterable of sequences whose first two items are the
            word and its gold tag.  Entries with fewer than two items are
            ignored.

    Returns:
        The precision as a float, or 0.0 when there are no predictions.
    """
    gold_standard_tuple_ver = [
        (entry[0], entry[1]) for entry in gold_standard if len(entry) >= 2
    ]

    print(gold_standard_tuple_ver)

    # Convert to a set for faster lookup.
    gold_lookup = set(gold_standard_tuple_ver)

    total_predicted = 0
    total_correct = 0
    for predicted_pair in test_labels:
        total_predicted += 1
        # Only tuples can equal a gold tuple; checking the type first keeps
        # unhashable predictions from raising inside the set lookup.
        if isinstance(predicted_pair, tuple) and predicted_pair in gold_lookup:
            total_correct += 1

    if total_predicted == 0:
        return 0.0

    return total_correct / total_predicted

In [154]:
def get_recall(test_labels, gold_standard):
    """Compute the share of gold (word, tag) pairs found among the predictions.

    Recall = number of gold pairs that were predicted / number of gold
    pairs.  A gold pair counts as found when an identical (word, tag) pair
    occurs anywhere in the predictions; positions are not compared.

    Args:
        test_labels: Iterable of predicted ``(word, tag)`` tuples.
        gold_standard: Iterable of sequences whose first two items are the
            word and its gold tag.  Entries with fewer than two items are
            ignored.

    Returns:
        The recall as a float, or 0.0 when the gold standard has no usable
        entries.
    """
    gold_pairs = [
        (entry[0], entry[1]) for entry in gold_standard if len(entry) >= 2
    ]

    # Build the lookup once so each gold pair is checked against a set
    # instead of scanning the whole prediction list.  Only tuples can equal
    # a gold tuple, so other prediction types are left out.
    predicted_lookup = {
        pair for pair in test_labels if isinstance(pair, tuple)
    }

    total_gold = len(gold_pairs)
    if total_gold == 0:
        return 0.0

    total_correct = sum(1 for pair in gold_pairs if pair in predicted_lookup)

    return total_correct / total_gold

In [155]:
def get_f_score(precision, recall):
    """Return the F score, i.e. the harmonic mean of precision and recall.

    Args:
        precision: Precision value.
        recall: Recall value.

    Returns:
        ``2 / (1/precision + 1/recall)``, or 0.0 when either input is 0
        (the harmonic mean is then taken to be 0 instead of dividing by
        zero).
    """
    if precision == 0 or recall == 0:
        return 0.0
    return 2/((1/precision) + (1/recall))

In [156]:
# Load the Spanish training, test and gold-standard files
train_data = read_train_data(ES_train_data_path)
test_data = read_dev_in_data(ES_dev_in_data_path)
gold_standard = read_dev_out_data(ES_dev_out_data_path)

# Derive the word/tag sequences and their unique values from the training set
train_words, train_tags = split_words_tags(train_data)
train_ls_of_words = count_unique_words(train_words)
train_ls_of_tags = count_unique_tags(train_tags)
train_number_of_tags = len(train_ls_of_tags)

# Get emission parameters (k is the smoothing constant)
k = 1
emission_params = get_emission_parameters(
    train_ls_of_tags,
    train_ls_of_words,
    train_tags,
    train_words,
    k,
)

# Get labels for test data
test_labels = assign_estimate_tags(
    test_data,
    emission_params,
    train_ls_of_words,
)

# Precision = Total number of correctly predicted entities / Total number of predicted entities
precision = get_precision(test_labels, gold_standard)
print("Precision: ", precision)

# Recall = Total number of correctly predicted entities / Total number of entities in the gold standard
recall = get_recall(test_labels, gold_standard)
print("Recall: ", recall)

# F score = harmonic mean of precision and recall
f_score = get_f_score(precision, recall)
print("F Score: ", f_score)

This is Count(y) : {'B-positive': 1160, 'I-neutral': 43, 'O': 29035, 'B-negative': 381, 'I-negative': 171, 'B-neutral': 72, 'I-positive': 314}
This is Count(y -> x) : {('O', 'Estuvimos'): 6, ('O', 'hace'): 26, ('O', 'poco'): 55, ('O', 'mi'): 72, ('O', 'pareja'): 13, ('O', 'y'): 1024, ('O', 'yo'): 36, ('O', 'comiendo'): 10, ('O', 'resultó'): 4, ('O', 'todo'): 115, ('O', 'muy'): 396, ('O', 'bien'): 165, ('O', ','): 1664, ('O', 'tanto'): 39, ('O', 'la'): 755, ('B-positive', 'comida'): 169, ('O', 'el'): 642, ('B-positive', 'vino'): 6, ('B-positive', 'trato'): 44, ('B-positive', 'decoración'): 7, ('O', '…'): 45, ('O', 'nos'): 146, ('O', 'gustó'): 11, ('O', 'mucho'): 53, ('O', '.'): 1623, ('O', 'Por'): 30, ('O', 'poner'): 5, ('O', 'algún'): 6, ('O', 'pero'): 191, ('O', 'quizá'): 4, ('B-negative', 'jamón'): 1, ('O', 'no'): 369, ('O', 'era'): 52, ('O', 'lo'): 270, ('O', '"'): 39, ('O', 'ibérico'): 2, ('O', 'que'): 845, ('O', 'cabía'): 1, ('O', 'esperar'): 5, ('O', 'Bien'): 6, ('O', 'sabe'): 5,