### Project Summary 
The data for this project are provided in `RU.zip` and `ES.zip`. For each dataset, we provide a labelled
training set `train`, an unlabelled development set `dev.in`, and a labelled development set `dev.out`.
The labelled data has one token per line, with the token and its tag separated by a tab, and a single
empty line separating sentences.

### Part 1


Extract the files from RU.zip and ES.zip

In [75]:
import numpy as np
from collections import Counter

# Spanish (ES) dataset: training data, development input, gold development
# output, and one prediction file per project part (only the p1 path is
# written by the code in this section).
espath_train = "./ES/train"
espath_devin = "./ES/dev.in"
espath_devout = "./ES/dev.out"
espath_devp1out = "./ES/dev.p1.out"
espath_devp2out = "./ES/dev.p2.out"
espath_devp3out = "./ES/dev.p3.out"

# Russian (RU) dataset: same layout as the Spanish one.
rupath_train = "./RU/train"
rupath_devin = "./RU/dev.in"
rupath_devout = "./RU/dev.out"
rupath_devp1out = "./RU/dev.p1.out"
rupath_devp2out = "./RU/dev.p2.out"
rupath_devp3out = "./RU/dev.p3.out"

# Number of rows in the emission table: the tags with ids 1..7, i.e. every
# entry of ls_Label except START and END.
N = 7

# Shared generator with a fixed seed; used to break ties between tags.
randomSeed = np.random.default_rng(3017634)

# Tag names ordered by their integer id (the list index is the id).
ls_Label = [
    "START",
    "O",
    "B-positive",
    "I-positive",
    "B-neutral",
    "I-neutral",
    "B-negative",
    "I-negative",
    "END",
]

# Tag name -> integer id, derived from the ordered list above.
labels = {tag_name: tag_id for tag_id, tag_name in enumerate(ls_Label)}

# The same ids exposed as named constants.
START, O, BPOS, IPOS, BNEU, INEU, BNEG, INEG, END = range(len(ls_Label))

In [76]:
# Read data
def read_trg(fp):
    """Read a labelled training file into a list of (token, label id) pairs.

    Each useful line holds a token followed by its tag. The tag is taken as
    the last whitespace-separated field, so both tab-separated and
    space-separated files are accepted. Lines that do not split into exactly
    two parts (e.g. the blank lines between sentences) are skipped.

    Args:
        fp: Path of the UTF-8 encoded training file.

    Returns:
        A list of ``(token, label_id)`` tuples in file order, where
        ``label_id`` is looked up in the module-level ``labels`` mapping.

    Raises:
        KeyError: If a tag in the file is not a key of ``labels``.
    """
    ls = []
    with open(fp, "r", encoding="utf-8") as file:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in file:
            # sep=None splits on any run of whitespace (tab or space).
            fields = line.strip().rsplit(None, 1)
            if len(fields) != 2:
                continue
            token, label = fields
            ls.append((token, labels[label]))
    return ls

def read_devin(fp):
    """Return every line of the file at ``fp`` with surrounding whitespace removed.

    Blank lines are kept as empty strings, so the sentence boundaries of the
    input are preserved in the returned list.
    """
    with open(fp, "r", encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]

def read_devout(fp):
    """Read a labelled development file into a list of (token, label id) pairs.

    Each useful line holds a token followed by its tag. The tag is taken as
    the last whitespace-separated field, so both tab-separated and
    space-separated files are accepted. Lines that do not split into exactly
    two parts (e.g. the blank lines between sentences) are skipped.

    Args:
        fp: Path of the UTF-8 encoded labelled development file.

    Returns:
        A list of ``(token, label_id)`` tuples in file order, where
        ``label_id`` is looked up in the module-level ``labels`` mapping.

    Raises:
        KeyError: If a tag in the file is not a key of ``labels``.
    """
    ls = []
    with open(fp, "r", encoding="utf-8") as file:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in file:
            # sep=None splits on any run of whitespace (tab or space).
            fields = line.strip().rsplit(None, 1)
            if len(fields) != 2:
                continue
            token, label = fields
            ls.append((token, labels[label]))
    return ls

# Define functions to calculate emissions
def getUniqueTokens(d):
    """Return the distinct first elements (tokens) of the pairs in ``d``.

    The result is a list with no duplicates; its order is whatever order the
    intermediate set iterates in, so callers must not rely on it.
    """
    vocabulary = {pair[0] for pair in d}
    return list(vocabulary)

def CountLabels(input_data):
    """Count how often each label (second element of every entry) occurs.

    Returns a ``Counter`` mapping label -> number of entries carrying it.
    """
    tag_counts = Counter()
    for record in input_data:
        tag_counts[record[1]] += 1
    return tag_counts

def emissionProbs(d, UT, k=1, n_labels=None):
    """Estimate smoothed emission probabilities from labelled data.

    Row ``r`` of the result belongs to label id ``r + 1``. Column ``c`` for
    ``c < len(UT)`` belongs to token ``UT[c]``; the final column is the slot
    for tokens that are not in ``UT`` and carries a pseudo-count of ``k``.
    Each entry is ``count(label, token) / (count(label) + k)``.

    Args:
        d: Iterable of ``(token, label_id)`` pairs; label ids are expected to
            lie in ``1..n_labels``.
        UT: List of known tokens; a token's position in the list is its
            column in the result.
        k: Pseudo-count given to the unknown-token column of every label.
        n_labels: Number of label rows. Defaults to the module-level ``N``.

    Returns:
        A ``(n_labels, len(UT) + 1)`` array of dtype ``np.longdouble``.

    Raises:
        ValueError: If a token in ``d`` does not occur in ``UT``.
    """
    if n_labels is None:
        n_labels = N
    unknown_column = len(UT)

    # One dictionary lookup per observation instead of a linear UT.index()
    # scan. setdefault keeps the first position of a repeated token, which
    # is what list.index() would have returned.
    column_of = {}
    for position, token in enumerate(UT):
        column_of.setdefault(token, position)

    emCount = np.zeros((n_labels, unknown_column + 1), dtype=np.longdouble)
    for token, label in d:
        try:
            column = column_of[token]
        except KeyError:
            # Keep the exception type that UT.index() raised.
            raise ValueError("%r is not in UT" % (token,)) from None
        emCount[label - 1][column] += 1

    # Take the per-label totals from the count table itself so that row r is
    # always paired with label r + 1, even when some label never appears in
    # the data (its total is then 0 rather than shifting the other totals).
    labelCount = emCount[:, :unknown_column].sum(axis=1)

    emCount[:, -1] = k
    return emCount / (labelCount + k)[:, np.newaxis]

def get_TokenLabel(inp, UT, emParams):
    """Pick the tag name with the highest emission value for token ``inp``.

    Tokens found in ``UT`` use their own column of ``emParams``; any other
    token uses the last column. When several rows are (numerically) tied for
    the maximum, one of them is drawn with the shared ``randomSeed``
    generator. Row ``r`` maps to ``ls_Label[r + 1]``.
    """
    column_index = UT.index(inp) if inp in UT else -1
    scores = emParams[:, column_index]

    # Rows whose score is close to the maximum are all candidates.
    best_rows = np.flatnonzero(np.isclose(scores, scores.max()))
    chosen_row = randomSeed.choice(best_rows)
    return ls_Label[chosen_row + 1]

# Write predictions to new file
def write_file_predictions(lg):
    """Train the emission-only tagger for one language and write its predictions.

    Reads the language's training file, estimates emission parameters, tags
    every token of the development input, and writes ``"<token> <tag>"``
    lines (blank lines preserved) to the language's ``dev.p1.out`` path.

    Args:
        lg: Language code, ``"ES"`` or ``"RU"``. Any other value is ignored
            and nothing is written.
    """
    paths_by_language = {
        "ES": (espath_train, espath_devin, espath_devp1out),
        "RU": (rupath_train, rupath_devin, rupath_devp1out),
    }
    if lg not in paths_by_language:
        # Unrecognised codes were a silent no-op before; keep that contract.
        return
    train_path, devin_path, out_path = paths_by_language[lg]

    trg_d = read_trg(train_path)
    UT = getUniqueTokens(trg_d)
    emParams = emissionProbs(trg_d, UT)

    # The ES branch used to call an undefined reader (read_dev_in_data);
    # both languages now go through read_devin.
    predicted_results = [
        token + " " + get_TokenLabel(token, UT, emParams) if token else ""
        for token in read_devin(devin_path)
    ]

    with open(out_path, "w", encoding="utf-8") as file:
        file.writelines(line + "\n" for line in predicted_results)
# Run the Part 1 prediction step once per language.
for lg in ("ES", "RU"):
    write_file_predictions(lg)