In [159]:
import numpy as np

In [161]:
def  MLE_emission_parameters(train_dir):
    ''' Calculates the emission parameters by count(y->x)/count(y)
    
    :param train_dir: our train file path to either ES or RU
    :type train_dir: str

    :return: count_y_dict, Count(y), keys are word '!', value MLE
    :rtype: dict

    :return: count_y_to_x_dict, Count(y->x), keys are tuples of word and label ('!', 'O'), value MLE
    :rtype: dict    

    :return: emission_dict, Count(y->x)/Count(y), keys are tuples of word and label ('!', 'O'), value MLE
    :rtype: dict
    
    '''
    train_dir = "data/ES/train"

    count_y_dict = {}
    count_y_to_x_dict = {}
    emission_dict = {}

    with open(train_dir, "r", encoding="utf8") as f:
        for line in f:
            # Parse each line
            if len(line.split(" ")) == 2:
                word, label = line.replace("\n","").split(" ")
            else:
                # skip lines with space 
                continue
            if label in count_y_dict:
                count_y_dict[label] = count_y_dict.get(label)+1
            else:
                count_y_dict[label] = 1
            if (word,label) in count_y_to_x_dict:
                count_y_to_x_dict[(word,label)] = count_y_to_x_dict.get((word,label))+1
            else:
                count_y_to_x_dict[(word,label)] = 1
    print("count(y): \n", count_y_dict, "\n")
    print("count(y->x): \n",list(count_y_to_x_dict.items())[0:5], len(count_y_to_x_dict), "\n")
    # Calculate our emission
    for key, value in count_y_to_x_dict.items(): # Default is iterate keys()
        word = key[0]
        label = key[1]
        emission_dict[key] = value / count_y_dict.get(label)
    print("MLE: \n",list(emission_dict.items())[0:5], len(emission_dict) ,"\n")

    return count_y_dict, count_y_to_x_dict, emission_dict

def  add_unknown_word_token(count_y_dict,emission_dict, k=1):
    ''' adds the unknown_word_token to our dictionary
    
    :param emission_dict: Emission dictionary
    :type emission_dict: dict

    :param k: we assume we have observed that there are k occurrences of such an event.
    :type k: int

    :return: emission_plus_unknown_dict, keys are tuple of word and label ('!', 'O'), value MLE
    :rtype: dict
    
    '''
    print("#UNK# values:")
    for key in count_y_dict:
        emission_dict[("#UNK#",key)] = k/(count_y_dict.get(key)+k)
        print(("#UNK#",key),emission_dict.get(("#UNK#",key)))
    return emission_dict


# Finding Emission Parameter

In [176]:
train_dir = "data/ES/train"

count_y_dict, count_y_to_x_dict, emission_dict = MLE_emission_parameters(train_dir)
emission_dict = add_unknown_word_token(count_y_dict,emission_dict,k=1)
print(emission_dict)

count(y): 
 {'O': 31627, 'B-positive': 1274, 'B-negative': 429, 'I-negative': 229, 'I-positive': 400, 'B-neutral': 85, 'I-neutral': 44} 

count(y->x): 
 [(('disfrutemos', 'O'), 1), (('de', 'O'), 1091), (('una', 'O'), 238), (('buenísima', 'O'), 4), (('calidad', 'O'), 159)] 5969 

MLE: 
 [(('disfrutemos', 'O'), 3.161855376735068e-05), (('de', 'O'), 0.03449584216017959), (('una', 'O'), 0.007525215796629462), (('buenísima', 'O'), 0.00012647421506940273), (('calidad', 'O'), 0.005027350049008759)] 5969 

#UNK# values:
('#UNK#', 'O') 3.161755406601745e-05
('#UNK#', 'B-positive') 0.000784313725490196
('#UNK#', 'B-negative') 0.002325581395348837
('#UNK#', 'I-negative') 0.004347826086956522
('#UNK#', 'I-positive') 0.0024937655860349127
('#UNK#', 'B-neutral') 0.011627906976744186
('#UNK#', 'I-neutral') 0.022222222222222223
{('disfrutemos', 'O'): 3.161855376735068e-05, ('de', 'O'): 0.03449584216017959, ('una', 'O'): 0.007525215796629462, ('buenísima', 'O'): 0.00012647421506940273, ('calidad', 'O')

# Predicting our sequence labels

In [19]:
def predict_y(emission_dict,test_dir="data/ES/dev.in",output_dir="data/ES/dev.p1.out"):
    ''' Finds our predicted_y with our emission_dict
    
    :param test_dir: our test file in either ES or RU
    :type test_dir: str

    :param output_dir: our output file for either ES or RU
    :type test_dir: str
    
    '''
    
    emission_word_set = set(i[0] for i in list(emission_dict.keys()))
    emission_label_lst = list(set(i[1] for i in list(emission_dict.keys())))


    with open(output_dir,'w', encoding="utf-8") as f:
        with open(test_dir,'r',encoding="utf-8") as file:
            for line in file:
                if len(line.replace("\n","")) > 0:
                    word = line.replace("\n","")
                else:
                    f.write("\n")                    
                    continue
                if word not in emission_word_set: # If there is no such word in emission set word as unknown
                    word = "#UNK#"

                label_lst = []
                for label in emission_label_lst:
                    if emission_dict.get((word,label)):
                        label_lst.append(emission_dict.get((word,label)))
                    else:
                        label_lst.append(0)
                predicted_y_idx = np.argmax(label_lst)
                predicted_y = emission_label_lst[predicted_y_idx] # Convert argmax index to predicted name
                f.write(f"{word} {predicted_y}\n") # write in our original word
            
def analysis(predicted_df,truth_df):

    correct_predictions = 0

    for i in range(len(truth_df)):
        predicted_df.iloc(i) 

In [20]:
test_dir = "data/ES/dev.in"
predict_y(emission_dict,test_dir)

# Finding Transition Parameters

In [163]:
def  MLE_transition_parameters(train_dir):
    ''' Calculates the emission parameters by count(y->x)/count(y)
    
    :param train_dir: our train file path to either ES or RU
    :type train_dir: str

    :return: count_y_dict, Count(y), keys are word '!', value MLE
    :rtype: dict

    :return: count_y_to_y_dict, Count(y->x), keys are tuples of word and label ('!', 'O'), value MLE
    :rtype: dict    

    :return: emission_dict, Count(y->x)/Count(y), keys are tuples of word and label ('!', 'O'), value MLE
    :rtype: dict

    '''
    
    train_dir = "data/ES/train"

    count_y_dict = {}
    count_y_to_y_dict = {}
    transition_dict = {}
    prev_label = ""
    count = 0
    othercount = 0

    with open(train_dir, "r", encoding="utf8") as f:
        for line in f:
            # Parse each line
            if len(line.split(" ")) == 2:
                word, label = line.replace("\n","").split(" ")
            else:
                word = ''
                label = ''
            if label == '' and prev_label != '':
                count_y_dict["STOP"] = count_y_dict.get("STOP") + 1 if count_y_dict.get("STOP") else 1
            elif label !='':
                if prev_label == '':
                    count_y_dict["START"] = count_y_dict.get("START") + 1 if count_y_dict.get("START") else 1
                if label in count_y_dict:
                    count_y_dict[label] = count_y_dict.get(label)+1
                else:
                    count_y_dict[label] = 1
            if prev_label == '' and label != '':
                if ("START", label) in count_y_to_y_dict:
                    count_y_to_y_dict[("START", label)] = count_y_to_y_dict.get(("START", label)) + 1
                else:
                    count_y_to_y_dict[("START", label)] = 1
            elif label == '' and prev_label != '':
                if (prev_label, "STOP") in count_y_to_y_dict:
                    count_y_to_y_dict[(prev_label, "STOP")] = count_y_to_y_dict.get((prev_label, "STOP")) + 1
                else:
                    count_y_to_y_dict[(prev_label, "STOP")] = 1
            elif label != '' and prev_label != '':
                if (prev_label, label) in count_y_to_y_dict:
                    count_y_to_y_dict[(prev_label, label)] = count_y_to_y_dict.get((prev_label, label)) + 1
                else:
                    count_y_to_y_dict[(prev_label, label)] = 1
            prev_label = label
    print("count(y): \n", count_y_dict, "\n")
    print("count(y->x): \n",list(count_y_to_y_dict.items()), len(count_y_to_y_dict), "\n")
    # Calculate our transition
    for key, value in count_y_to_y_dict.items(): # Default is iterate keys()
        prev_label = key[0]
        label = key[1]
        transition_dict[key] = value / count_y_dict.get(prev_label)
    print("MLE: \n",list(transition_dict.items()), len(transition_dict) ,"\n")

    return count_y_dict, count_y_to_y_dict, transition_dict

In [164]:
train_dir = "data/ES/train"

count_y_dict, count_y_to_y_dict, transition_dict = MLE_transition_parameters(train_dir)

count(y): 
 {'START': 2065, 'O': 31627, 'B-positive': 1274, 'STOP': 2065, 'B-negative': 429, 'I-negative': 229, 'I-positive': 400, 'B-neutral': 85, 'I-neutral': 44} 

count(y->x): 
 [(('START', 'O'), 1918), (('O', 'O'), 27939), (('O', 'B-positive'), 1162), (('B-positive', 'O'), 1100), (('O', 'STOP'), 2050), (('O', 'B-negative'), 402), (('B-negative', 'O'), 347), (('START', 'B-negative'), 27), (('B-negative', 'I-negative'), 78), (('I-negative', 'I-negative'), 151), (('I-negative', 'O'), 78), (('START', 'B-positive'), 110), (('B-positive', 'I-positive'), 162), (('I-positive', 'O'), 160), (('I-positive', 'I-positive'), 238), (('O', 'B-neutral'), 74), (('B-neutral', 'O'), 69), (('START', 'B-neutral'), 10), (('B-positive', 'B-positive'), 2), (('B-negative', 'STOP'), 4), (('B-positive', 'STOP'), 9), (('B-neutral', 'I-neutral'), 16), (('I-neutral', 'O'), 16), (('I-positive', 'STOP'), 2), (('B-positive', 'B-neutral'), 1), (('I-neutral', 'I-neutral'), 28)] 26 

MLE: 
 [(('START', 'O'), 0.928813

# Viterbi Algorithm

In [174]:
def viterbi(emission_dict, transition_dict, test_dir, output_dir = "data/ES/dev.p2.out"):
    test_array = []
    viterbi_array = [{"word": "", "START": (1, ''), 'STOP': 0, 'O': 0, 'B-positive': 0, 'B-negative': 0, 'B-neutral': 0, 'I-positive': 0, 'I-negative': 0, 'I-neutral': 0}]
    labels = ['START', 'STOP', 'O', 'B-positive', 'B-negative', 'B-neutral', 'I-positive', 'I-negative', 'I-neutral']

    emission_word_set = set(i[0] for i in list(emission_dict.keys()))

    with open(test_dir, 'r',encoding="utf-8") as file:
        for line in file:
            test_array += [line.replace("\n","")]

    count = 1
    for word in test_array:
        temp_dict = {'word': word}
        if word == '':
            temp_list = []
            for prev_y in labels:
                if viterbi_array[count - 1].get(prev_y) != 0:
                    if transition_dict.get((prev_y, "STOP")):
                        temp_list.append(np.longdouble(viterbi_array[count - 1].get(prev_y)[0] * transition_dict.get((prev_y, "STOP"))))
                    else:
                        temp_list.append(0)
                else:
                    temp_list.append(0)

            max_index = np.argmax(temp_list)
            for y in labels:
                if y == "STOP":
                    temp_dict[y] = (temp_list[max_index], labels[max_index])
                elif y == "START":
                    temp_dict[y] = (1, '')
                else:
                    temp_dict[y] = 0
            viterbi_array.append(temp_dict)
        else:
            if count == 5328:
                print(word)
                print(word in emission_word_set)
            if word not in emission_word_set:
                word = "#UNK#"
            for t in labels:
                if emission_dict.get((word, t)):
                    if count ==  5328:
                        print(t)
                    temp_list = []
                    for prev_y in labels:
                        if viterbi_array[count - 1].get(prev_y) != 0:
                            # if count ==  5328:
                            #     print(emission_dict.get((word, t)))
                            #     print(transition_dict.get((prev_y, t)))
                            if transition_dict.get((prev_y, t)):
                                temp_list.append(viterbi_array[count - 1].get(prev_y)[0] * transition_dict.get((prev_y, t)) * emission_dict.get((word, t)))
                            else:
                                temp_list.append(0)
                        else:
                            temp_list.append(0)
                    max_index = np.argmax(temp_list)
                    temp_dict[t] = (temp_list[max_index], labels[max_index])
                else:
                    temp_dict[t] = 0
                
            viterbi_array.append(temp_dict)
        count += 1
    print(viterbi_array)
    result_array = [""]*len(viterbi_array)
    for i in range(len(viterbi_array) - 1, 1, -1):
        if i == len(viterbi_array) - 1:
            result_array[i] = viterbi_array[i].get("word")
        tmp_list = []
        if viterbi_array[i].get('word') == '':
            result_array[i] = ''
            prev_label = viterbi_array[i].get('STOP')[1]
        else:
            result_array[i] = viterbi_array[i].get("word") + " " + prev_label
            prev_label = viterbi_array[i].get(prev_label)[1]

    with open(output_dir,'w', encoding="utf-8") as f:
        for i in result_array[1:]:
            f.write(i + '\n')




In [175]:
test_dir = "data/ES/dev.in"

viterbi(emission_dict, transition_dict, test_dir)

pollo
True
I-negative
[{'word': '', 'START': (1, ''), 'STOP': 0, 'O': 0, 'B-positive': 0, 'B-negative': 0, 'B-neutral': 0, 'I-positive': 0, 'I-negative': 0, 'I-neutral': 0}, {'word': 'La', 'START': 0, 'STOP': 0, 'O': (0.004551999927116554, 'START'), 'B-positive': 0, 'B-negative': (3.0477996579691498e-05, 'START'), 'B-neutral': 0, 'I-positive': 0, 'I-negative': 0, 'I-neutral': 0}, {'word': 'comida', 'START': 0, 'STOP': 0, 'O': (4.068619584557698e-06, 'O'), 'B-positive': (2.389199063778885e-05, 'O'), 'B-negative': (2.697385411967062e-06, 'O'), 'B-neutral': (1.1277155042358169e-06, 'O'), 'I-positive': 0, 'I-negative': 0, 'I-neutral': 0}, {'word': 'estuvo', 'START': 0, 'STOP': 0, 'O': (1.2392850065408388e-08, 'B-positive'), 'B-positive': 0, 'B-negative': 0, 'B-neutral': 0, 'I-positive': 0, 'I-negative': 0, 'I-neutral': 0}, {'word': 'muy', 'START': 0, 'STOP': 0, 'O': (1.5092199771583816e-10, 'O'), 'B-positive': 0, 'B-negative': 0, 'B-neutral': 0, 'I-positive': (0, 'START'), 'I-negative': 0,

TypeError: 'int' object is not subscriptable