In [15]:
import numpy as np
import os

In [16]:
train_file_path = os.path.join("Data", "ES", "train")
test_file_path = os.path.join("Data", "ES", "dev.in")

train_words = []
train_tags = ["START"]

with open(train_file_path, "r", encoding="utf-8") as train_file:
    for l in train_file:
        if l.strip():  # Check if the line is not empty after stripping whitespace
            lst = l.split()
            x = " ".join(lst[:-1])  # Join all elements except the last as the word
            y = lst[-1]
            train_words.append(x)
            train_tags.append(y)
        else:
            train_tags.append("STOP")
            train_tags.append("START")
del train_tags[-1]

# Now you have train_words and tags populated with data


In [17]:
def estimate_transition_parameters(train_tags):
    transition_counts = {}  # To count transitions between states
    state_counts = {}  # To count occurrences of each state

    # Count transitions and state occurrences
    for i in range(len(train_tags) - 1):
        current_state = train_tags[i]
        next_state = train_tags[i + 1]

        if current_state not in state_counts:
            state_counts[current_state] = 0
        state_counts[current_state] += 1

        if current_state not in transition_counts:
            transition_counts[current_state] = {}
        if next_state not in transition_counts[current_state]:
            transition_counts[current_state][next_state] = 0
        transition_counts[current_state][next_state] += 1
    
    
    # Calculate transition probabilities
    transition_probs = {}
    for current_state, next_states in transition_counts.items():
        total_count = state_counts[current_state]
        transition_probs[current_state] = {next_state: count / total_count for next_state, count in next_states.items()}

    return transition_probs

In [18]:
### ALTERNATE EMISSION PROBABILITIES GENERATION ###
def estimate_emission_parameters(train_words, train_tags, k=1):
    edited_tags=[]
    for i in train_tags:
        if i!="START" and i!="STOP":
            edited_tags.append(i)

    emission_counts = {}  # To count emissions from states to words
    state_counts = {}  # To count occurrences of each state

    # Count emissions and state occurrences
    for i in range(len(train_words)):
        word = train_words[i]
        tag = edited_tags[i]

        if tag not in state_counts:
            state_counts[tag] = 0
        state_counts[tag] += 1

        if tag not in emission_counts:
            emission_counts[tag] = {}
        if word not in emission_counts[tag]:
            emission_counts[tag][word] = 0
        emission_counts[tag][word] += 1
    
    # Calculate emission probabilities with #UNK# handling
    emission_probs = {}
    for tag, word_counts in emission_counts.items():
        total_count = state_counts[tag]
        emission_probs[tag] = {}
        for word, count in word_counts.items():
            emission_probs[tag][word] = count / (total_count+k)
        emission_probs[tag]['#UNK#'] = k / (total_count + k)

    return emission_probs

In [19]:
emission_word_tag=estimate_emission_parameters(train_words,train_tags)
transition_parameters=estimate_transition_parameters(train_tags)

for i in emission_word_tag:
    print(i,emission_word_tag[i]["#UNK#"])
#print(transition_parameters["STOP"])

O 3.444000551040088e-05
B-positive 0.0008613264427217916
B-negative 0.002617801047120419
B-neutral 0.0136986301369863
I-neutral 0.022727272727272728
I-positive 0.0031746031746031746
I-negative 0.005813953488372093


In [20]:
def viterbi(sequence, train_tags, transition_parameters, emission_parameters):
    tags=[]
    for i in train_tags:
        if i!="START" and i!="STOP" and i not in tags:
            tags.append(i)
    n = len(sequence)
    num_tags = len(tags)

    LTR = np.zeros((num_tags, n))
    RTL = np.zeros((num_tags, n), dtype=int)

    for i in range(num_tags):
        emission_prob = emission_parameters[tags[i]].get(sequence[0], 1e-10)
        LTR[i,0] = np.log(transition_parameters['START'].get(tags[i], 1e-10)) + np.log(emission_prob)


    for w in range(1, n):
        for i in range(num_tags):
            max_prob = float('-inf')
            max_backpointer = -1000000
            for j in range(num_tags):
                transition_prob = transition_parameters[tags[j]].get(tags[i], 1e-10)
                emission_prob = emission_parameters[tags[i]].get(sequence[w], 1e-10)
                prob = LTR[j,w-1] + np.log(transition_prob) + np.log(emission_prob)
                if prob > max_prob:
                    max_prob = prob
                    max_backpointer = j

            LTR[i, w] = max_prob
            RTL[i, w] = max_backpointer

    stop_max_prob = float('-inf')
    stop_max_backpointer = -1000000
    for i in range(num_tags):
        transition_prob = transition_parameters[tags[i]].get('STOP', 1e-10)
        prob = LTR[i,n-1] + np.log(transition_prob)
        if prob > stop_max_prob:
            stop_max_prob = prob
            stop_max_backpointer = i

    # Retrieve the best path using backpointers
    best_path = []
    current_backpointer = stop_max_backpointer
    for w in range(n - 1, -1, -1):
        best_path.insert(0, tags[current_backpointer])
        if w == 0:
            break
        current_backpointer = RTL[current_backpointer, w]

    return best_path

In [21]:
test_file=open(test_file_path,"r")

pred_output_path = os.path.join(os.getcwd(), "Data", "ES", "dev.p4_2.out")
pred_output=open(pred_output_path,"w")
sequence=[]
all_tags_out=[]
for l in test_file:
    if l!="\n":
        sequence.append(l[0:-1])
    else:
        predicted_tags=viterbi(sequence,train_tags,transition_parameters,emission_word_tag)
        for i in predicted_tags:
            all_tags_out.append(i)
        for i in range(0,len(predicted_tags)):
            pred_output.write(sequence[i]+" "+predicted_tags[i]+"\n")
        pred_output.write(l)
        pred_output.flush()
        sequence=[]
actual_tags=[]
with open("data/ES/dev.out","r") as f:
    for l in f:
        if l.strip():  # Check if the line is not empty after stripping whitespace
            lst = l.split()
            y = lst[-1]
            actual_tags.append(y)
print(all_tags_out)
print(actual_tags)
corr=0
for i in range(len(actual_tags)):
    if all_tags_out[i]==actual_tags[i]:
        corr+=1
print(corr/len(actual_tags))


['B-negative', 'I-negative', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-positive', 'I-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-positive', 'O', 'O', 'O', 'O', 'B-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-negative', 'I-negative', 'I-negative', 'I-negative', 'I-negative', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-positive', 'I-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-negative', 'O', 'O', 'O', 'O', 'O