In [None]:
import string
from collections import defaultdict
import numpy as np

In [None]:
def read_post(filename):
    """Load a text file into memory as a list of lines.

    Args:
        filename: Path of the file to open in text-read mode.

    Returns:
        list[str]: Every line of the file in order, with line endings
        left in place.
    """
    with open(filename, 'r') as handle:
        # Iterating a text file yields the same strings as readlines().
        return list(handle)

In [None]:
# Read the tagged corpus file into `lines` (one string per file line);
# later cells split each line into a word and a tag.
filename = "WSJ_02-21.pos"
lines = read_post(filename)

# Show the first raw line as a quick look at the file format.
print(lines[0])

In [None]:
# Read the second corpus file into `y`, one string per line
# (line endings are kept by readlines()).
with open("WSJ_24.pos", 'r') as f:
    y = f.readlines()
    
# Display the first ten raw lines.
print("A sample of the test corpus")
print(y[0:10])

In [None]:
# Keep only the text before the first tab of every line. A line without a
# tab (e.g. a bare "\n") is kept whole, which is why "\n" is filtered out
# when the vocabulary is built.
words = [line.split('\t')[0] for line in lines]

In [None]:
# freq[word] -> number of occurrences of `word` in `words`.
# defaultdict(int) lets unseen words start from 0.
freq = defaultdict(int)
for word in words:
    freq[word] += 1

# Spot-check the count of a very common word.
print(freq["the"])

In [None]:
# vocab: alphabetically sorted list of the words that occur more than once
# in `freq` (the " " and "\n" entries are excluded).
# It is sorted right here so that position j of `vocab` is always the index
# stored in `vocab_dic`, without depending on a later in-place sort having
# been executed.
vocab = sorted(k for k, v in freq.items() if k != " " and k != "\n" and v > 1)

# vocab_dic: maps every vocabulary word to its index in `vocab`.
# NOTE(review): the "--unk...--" placeholders returned by tag_word and the
# "--n--" marker only receive an index if they literally occur in `freq`;
# confirm whether they should be added to the vocabulary explicitly.
vocab_dic = {}

# Assign each word its position in the sorted vocabulary.
for i, word in enumerate(vocab):
    vocab_dic[word] = i

In [None]:
# Sort the vocabulary list in place so that its positions agree with the
# indices stored in `vocab_dic`, then preview the first ten entries.
vocab.sort()
print(vocab[:10])

In [None]:
# Character set and suffix tables consulted by tag_word().
punctuation = string.punctuation
noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
verb_suffix = ["ate", "ify", "ise", "ize"]
adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
adv_suffix = ["ward", "wards", "wise"]

def tag_word(tok):
    """Pick the "--unk...--" placeholder that best describes a token.

    The rules are tried in a fixed order and the first match wins:
    contains a digit, contains a punctuation character, contains an
    upper-case character, then noun / verb / adjective / adverb suffixes.

    Args:
        tok: The token string to classify.

    Returns:
        str: One of "--unk_digit--", "--unk_punct--", "--unk_upper--",
        "--unk_noun--", "--unk_verb--", "--unk_adj--", "--unk_adv--",
        or the generic "--unk--" when no rule applies.
    """
    # Character-level rules take priority over suffix rules.
    if any(map(str.isdigit, tok)):
        return "--unk_digit--"
    if not set(tok).isdisjoint(punctuation):
        return "--unk_punct--"
    if any(map(str.isupper, tok)):
        return "--unk_upper--"

    # Suffix rules, in priority order.
    suffix_rules = (
        (noun_suffix, "--unk_noun--"),
        (verb_suffix, "--unk_verb--"),
        (adj_suffix, "--unk_adj--"),
        (adv_suffix, "--unk_adv--"),
    )
    for suffixes, label in suffix_rules:
        # str.endswith accepts a tuple and matches if any member matches.
        if tok.endswith(tuple(suffixes)):
            return label

    return "--unk--"
        

In [None]:
def preprocess(vocab, data_fp):
    """Read a one-token-per-line file and replace out-of-vocabulary tokens.

    Args:
        vocab: Collection of known words; only membership is tested.
        data_fp: Path of a text file holding one token per line.
            Whitespace-only lines are treated as sentence boundaries.

    Returns:
        tuple[list[str], list[str]]: ``(orig, prep)``, each with exactly one
        entry per input line. ``orig`` holds the stripped line. ``prep``
        holds "--n--" for a whitespace-only line, the stripped token when it
        is in ``vocab``, and ``tag_word(token)`` otherwise.
    """
    # A set gives constant-time membership tests; a list would be scanned
    # once per input line.
    known = vocab if isinstance(vocab, (set, frozenset, dict)) else set(vocab)

    orig = []
    prep = []

    with open(data_fp, "r") as data_file:
        for line in data_file:
            token = line.strip()
            orig.append(token)

            if not token:
                # End of sentence
                prep.append("--n--")
            elif token in known:
                prep.append(token)
            else:
                # Classify the *stripped* token: a trailing newline would
                # stop every suffix rule in tag_word from matching.
                prep.append(tag_word(token))

    return orig, prep
# To be deleted later

In [None]:
# Preprocess the token file; the stripped originals are discarded and only
# the list with unknown words replaced by placeholders is kept.
_, prep = preprocess(vocab, "test.words")     
# First preprocessed token.
print(prep[0])

In [None]:
# Sanity check: "scrutinize" matches no digit/punctuation/upper-case/noun
# rule and ends with the verb suffix "ize", so this prints "--unk_verb--".
print(tag_word("scrutinize"))

In [None]:
def get_word_tag(line, vocab):
    """Split one corpus line into a (word, tag) pair.

    Args:
        line: A raw line holding a word and a tag separated by whitespace,
            or a whitespace-only line.
        vocab: Collection of known words; only membership is tested.

    Returns:
        tuple[str, str]: ("--n--", "--s--") for a whitespace-only line;
        otherwise ``(word, tag)``, where a word missing from ``vocab`` is
        replaced by the placeholder chosen by ``tag_word``.

    Raises:
        ValueError: If a non-empty line does not split into exactly two
            fields.
    """
    fields = line.split()

    # Whitespace-only line: emit the sentence-boundary pair.
    if not fields:
        return "--n--", "--s--"

    word, tag = fields
    # Handle unknown words
    known = word in vocab
    return (word if known else tag_word(word)), tag

In [None]:
# Number of lines read from the corpus file (blank lines included).
print(len(lines))

In [None]:
def create_pair_dictionary(lines, corpus):
    """Count tag transitions, tag occurrences and (tag, word) emissions.

    Args:
        lines: Iterable of raw corpus lines; each one is parsed with
            ``get_word_tag``.
        corpus: Collection of known words, forwarded to ``get_word_tag`` as
            its vocabulary (only membership is tested).

    Returns:
        tuple: Three ``defaultdict(int)`` objects:
            * ``tag_trans[(prev_tag, tag)]`` - how often ``tag`` follows
              ``prev_tag`` (the first line is preceded by "--s--");
            * ``tag_counts[tag]`` - how often ``tag`` occurs;
            * ``emissions[(tag, word)]`` - how often ``word`` carries ``tag``.
    """
    # get_word_tag tests membership once per line; a set avoids rescanning
    # a list for every one of them.
    known = corpus if isinstance(corpus, (set, frozenset, dict)) else set(corpus)

    tag_trans = defaultdict(int)
    tag_counts = defaultdict(int)
    emissions = defaultdict(int)

    prev_tag = "--s--"
    for i, line in enumerate(lines, start=1):
        word, next_tag = get_word_tag(line, known)
        tag_trans[(prev_tag, next_tag)] += 1
        tag_counts[next_tag] += 1
        emissions[(next_tag, word)] += 1
        prev_tag = next_tag

        # Progress indicator for large corpora.
        if i % 50000 == 0:
            print(i)

    return tag_trans, tag_counts, emissions


In [None]:
def calculate_probabilities_tags(tag_transitions, counts, alpha):
    """Build the additively smoothed tag-transition probability matrix.

    With the tags of ``counts`` sorted and N = number of tags, entry [i, j]
    is::

        (tag_transitions[(tag_i, tag_j)] + alpha) / (counts[tag_i] + alpha * N)

    Args:
        tag_transitions: Mapping ``(prev_tag, tag) -> count``; missing pairs
            count as 0.
        counts: Mapping ``tag -> count``; its sorted keys define the row and
            column order.
        alpha: Smoothing constant added to every pair count.

    Returns:
        numpy.ndarray: Array of shape (N, N) with the probabilities above.
    """
    tags = sorted(counts.keys())
    n_tags = len(tags)
    prob = np.zeros((n_tags, n_tags))

    for i, prev_tag in enumerate(tags):
        denominator = counts[prev_tag] + alpha * n_tags
        for j, next_tag in enumerate(tags):
            # .get() instead of [] so that a defaultdict passed by the
            # caller is not silently filled with every unseen tag pair.
            pair_count = tag_transitions.get((prev_tag, next_tag), 0)
            prob[i, j] = (pair_count + alpha) / denominator

    return prob


In [None]:
# Count transitions / tags / emissions over the corpus lines. The second
# argument is the collection used to decide whether a word is "known", so it
# must be the vocabulary - not `prep`, the preprocessed token list - to stay
# consistent with the emission matrix, whose columns are indexed by `vocab`.
tag_trans, tag_counts, emissions = create_pair_dictionary(lines, vocab)


In [None]:
# Smoothed tag-transition matrix; 0.001 is the smoothing constant (alpha).
probs = calculate_probabilities_tags(tag_trans, tag_counts, 0.001)

In [None]:
# Spot-check one transition probability: from the tag at sorted position 3
# to the tag at sorted position 1.
print(probs[3][1])

In [None]:
def calculate_probabilities_emissions(emissions, counts, alpha, corpus):
    """Build the additively smoothed emission probability matrix.

    With the tags of ``counts`` sorted and V = ``len(corpus)``, entry [i, j]
    is::

        (emissions[(tag_i, corpus[j])] + alpha) / (counts[tag_i] + alpha * V)

    Args:
        emissions: Mapping ``(tag, word) -> count``; missing pairs count
            as 0.
        counts: Mapping ``tag -> count``; its sorted keys define the row
            order.
        alpha: Smoothing constant added to every pair count.
        corpus: Indexable sequence of words; position j defines column j.

    Returns:
        numpy.ndarray: Array of shape (number of tags, V).
    """
    tags = sorted(counts.keys())
    n_words = len(corpus)
    prob = np.zeros((len(tags), n_words))

    for i, tag in enumerate(tags):
        denominator = counts[tag] + alpha * n_words
        for j in range(n_words):
            # .get() instead of [] so that a defaultdict passed by the
            # caller does not grow by one key per (tag, word) combination.
            pair_count = emissions.get((tag, corpus[j]), 0)
            prob[i, j] = (pair_count + alpha) / denominator

    return prob

In [None]:
# Smoothed emission matrix: one row per tag (sorted), one column per entry
# of `vocab`; 0.001 is the smoothing constant (alpha).
prob_emissions = calculate_probabilities_emissions(emissions, tag_counts, 0.001, vocab)

In [None]:
# Spot-check one emission probability: first tag (sorted order), first
# vocabulary entry.
print(prob_emissions[0][0])

In [None]:
def viterbi_initialization(prob_emissions, probs, corpus, vocab=None, start_idx=6):
    """Create the Viterbi score and back-pointer matrices.

    Only the first column of the score matrix is filled, because later
    columns are the job of the forward pass. For every state ``i``::

        c[i, 0] = log(probs[start_idx][i]) + log(prob_emissions[i][col])

    where ``col`` is the vocabulary index of ``corpus[0]``.

    Args:
        prob_emissions: 2-D emission probabilities, indexed [state][word].
        probs: 2-D transition probabilities, indexed [from_state][to_state].
        corpus: Sequence of observed tokens.
        vocab: Mapping ``token -> column index``. Defaults to the
            notebook-level ``vocab_dic``.
        start_idx: Row of ``probs`` used as the initial distribution.
            Defaults to 6, the value that was previously hard-coded
            (presumably the position of "--s--" among the sorted tags -
            confirm against the tag set in use).

    Returns:
        tuple[numpy.ndarray, numpy.ndarray]: ``(c, d)``, both of shape
        (number of states, len(corpus)). ``c`` holds float log-scores,
        ``d`` is an all-zero integer matrix for back-pointers.
    """
    if vocab is None:
        vocab = vocab_dic

    n_states = len(probs)
    n_obs = len(corpus)
    c = np.zeros((n_states, n_obs))
    d = np.zeros((n_states, n_obs), dtype=int)

    # Nothing to initialise for an empty observation sequence.
    if n_obs == 0:
        return c, d

    # Tokens without a vocabulary index fall back to column 0, the same
    # convention viterbi_forward uses for later positions.
    first_col = vocab.get(corpus[0], 0)

    log_start = np.log(np.asarray(probs)[start_idx])
    log_emit = np.log(np.asarray(prob_emissions)[:, first_col])
    c[:, 0] = log_start + log_emit

    return c, d


In [None]:
# Build the Viterbi score matrix `c` and back-pointer matrix `d` for the
# preprocessed token sequence.
c,d = viterbi_initialization(prob_emissions, probs, prep)
# Inspect the first score and then the whole first row (one value per token).
print(c[0][0])
print(c[0])

In [None]:

def viterbi_forward(trans_mat, emission_prob, c, d,vocab, corpus):
    """Run the Viterbi forward pass in log space.

    For every position ``j >= 1`` and every state ``i``::

        c[i, j] = max_k ( c[k, j-1] + log(trans_mat[k][i])
                          + log(emission_prob[i][col_j]) )

    where ``col_j = vocab.get(corpus[j], 0)``. The maximising ``k`` (the
    first one on ties) is stored in ``d[i, j]``.

    Args:
        trans_mat: 2-D transition probabilities, indexed [from][to].
        emission_prob: 2-D emission probabilities, indexed [state][word].
        c: 2-D NumPy score matrix whose column 0 is already initialised;
            updated in place.
        d: 2-D NumPy integer back-pointer matrix, updated in place, or
            ``None`` to skip recording back-pointers.
        vocab: Mapping ``token -> column index``; tokens that are missing
            use column 0.
        corpus: Sequence of observed tokens.

    Returns:
        The same ``c`` object, with columns 1..len(corpus)-1 filled in.
    """
    n_obs = len(corpus)
    if n_obs < 2:
        return c

    # Take the logarithms once instead of inside the innermost loop.
    log_trans = np.log(trans_mat)
    log_emit = np.log(emission_prob)

    for j in range(1, n_obs):
        col = vocab.get(corpus[j], 0)

        # scores[k, i]: arrive in state i at step j coming from state k.
        # The terms are added in the same order as the scalar formula.
        scores = (c[:, j - 1][:, None] + log_trans) + log_emit[:, col][None, :]

        c[:, j] = scores.max(axis=0)
        if d is not None:
            # argmax returns the first maximiser, matching a strict ">" scan.
            d[:, j] = scores.argmax(axis=0)

    return c


In [None]:
#print(len(vocab))
# Run the forward pass; `c` is updated in place and returned as `forward`.
forward = viterbi_forward(probs,prob_emissions, c,d,vocab_dic, prep)

In [None]:
# Score of the first state at the first token (set during initialisation;
# the forward pass only writes columns 1 and later).
print(forward[0][0])