# POS_TAGGER
## - step 1: Read and preprocess the dataset
## - step 2: Build a Most Frequent Class (MFC) tagger to use as a       baseline
## - step 3: Build an HMM POS Tagger
## - step 4:

In [None]:
import matplotlib.pyplot as plt 
import numpy as np 
from itertools import chain
from collections import Counter, defaultdict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
import pandas as pd
import sklearn as skl 


In [None]:
import string
from collections import defaultdict

## Working with data file and create vocabulary

### Read data file

In [None]:
with open("WSJ_02-21.pos", 'r') as f:
    lines = f.readlines()

In [None]:
for i in range(20):
    print(lines[i])

In [None]:
words = [line.split('\t')[0] for line in lines]
# words

### Create vocabulary file

In [None]:
freq = defaultdict(int)

for word in words:
    freq[word] +=1

In [None]:
freq['more']

In [None]:
vocabulary = [k for k, v in freq.items() if (v > 1 and k != '\n')]

In [None]:
vocabulary.sort()

In [None]:
vocabulary_file = open('vocabulary_file.txt', "w+")
for i in vocabulary:
    vocabulary_file = open('vocabulary_file.txt', "a+")
    vocabulary_file.writelines(i + '\n')
# for i in range(0,10):
#     print(vocabulary[i])

### Processing new text sources

#### working words don't exist in the vocabulary

In [None]:
def assign_unknow(word):

    #punctuation characters
    punct = set(string.punctuation)

    #suffixes
    noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
    verb_suffix = ["ate", "ify", "ise", "ize"]
    adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
    adv_suffix = ["ward", "wards", "wise"]
    
    if any(char.isdigit() for char in word):
        return "--unknow_digit--"

    elif any(char in punct for char in word):
        return "--unknow_punct--"

    elif any(char.isupper() for char in word):
        return "--unknow_upper"

    elif any(word.endswith(suffix) for suffix in noun_suffix):
        return "--unknow_noun--" 
    
    elif any(word.endswith(suffix) for suffix in verb_suffix):
        return "--unknow_verb--"

    elif any(word.endswith(suffix) for suffix in adj_suffix):
        return "--unknow_adj--"
    
    elif any(word.endswith(suffix) for suffix in adv_suffix):
        return "--unknow_adv--"

    return "--unknow--"

### Getting the correct tag for a word

In [None]:
def get_word_tag(line, vocabulary):
    if not line.split():
        word = "--n--"
        tag = "--s--"
    else:
        word, tag = line.split()
        if word not in vocabulary:
            word = assign_unknow(word)

    return word, tag 

In [None]:
print(get_word_tag('\n', vocabulary))
print(get_word_tag('In\tIN\n', vocabulary))
print(get_word_tag('tardigrade\tNN\n', vocabulary))
print(get_word_tag('scrutinize\tVB\n', vocabulary))

## Working with tags and Numpy

In [None]:
import numpy as np 
import pandas as pd

### test with only 3 tags (RB, NN, TO)

In [None]:
tags = ['RB', 'NN', 'TO']

### testing with transition_counts, which counts the number of times a particular tag happend next to another. The keys of dictionary have the form (previous_tag, tag) and the values are the frequency of occurrences. The trainsition_dictionary just works with tags only

In [None]:
# for example: define a transition_counts dictionary with random value:
transition_counts = {
    ('NN', 'NN'): 16241,
    ('RB', 'RB'): 2263,
    ('TO', 'TO'): 2,
    ('NN', 'TO'): 5256,
    ('RB', 'TO'): 855,
    ('TO', 'NN'): 734,
    ('NN', 'RB'): 2431,
    ('RB', 'NN'): 358,
    ('TO', 'RB'): 200
}


### Using numpy of matrix creation:

In [None]:
num_tags = len(tags)
print(num_tags)

transition_matrix = np.zeros((num_tags, num_tags))

print(transition_matrix)
print(transition_matrix.shape)

In [None]:
sorted_tags = sorted(tags)

sorted_tags

In [None]:
for i in range(num_tags):
    for j in range(num_tags):
        tag_tuple = (sorted_tags[i], sorted_tags[j])

        transition_matrix[i, j] = transition_counts.get(tag_tuple)

transition_matrix

In [None]:
def print_transition_matrix(matrix):
    print(pd.DataFrame(matrix, index = sorted_tags, columns = sorted_tags))

print_transition_matrix(transition_matrix)

### Working with Numpy for matrix manipulation

In [None]:
rows_sum = transition_matrix.sum(axis = 1, keepdims = True)

rows_sum

In [None]:
# normalize the matrix 
rows_sum = transition_matrix.sum(axis = 1, keepdims = True)

transition_matrix = transition_matrix / rows_sum

print_transition_matrix(transition_matrix)

In [None]:
### Create rules use for unknown word

noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
verb_suffix = ["ate", "ify", "ise", "ize"]
adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
adv_suffix = ["ward", "wards", "wise"]

In [None]:
import string
punct = set(string.punctuation)

In [None]:
def assign_unknow(word):

    noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
    verb_suffix = ["ate", "ify", "ise", "ize"]
    adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
    adv_suffix = ["ward", "wards", "wise"]

    if any(char.isdigit() for char in word):
        return "--unknow_digit--"

    elif any(char in punct for char in word):
        return "--unknow_punct--"

    elif any(char.isupper() for char in word):
        return "--unknow_upper--"

    elif any(word.endswith(suffix) for suffix in noun_suffix):
        return "--unknow_noun--"

    elif any(word.endswith(suffix) for suffix in verb_suffix):
        return "--unknow_verb--"

    elif any(word.endswith(suffix) for suffix in adj_suffix):
        return "--unknow_adj--"

    elif any(word.endswith(suffix) for suffix in adv_suffix):
        return "--unknow_adv--"

    return "--unknow--"

In [None]:
def get_word_tag(line, vocab):
    if not line.split():
        word = "--n--"
        tag = "--s--"
    else:
        word, tag = line.split()
        if word not in vocab:
            word = assign_unknow(word)
        return word, tag
    return None

In [None]:
with open("hmm_vocab.txt", 'r') as f:
    vocabulary = f.read().split('\n')


print(get_word_tag('\n', vocabulary))
print(get_word_tag('In\tIN\n', vocabulary))
print(get_word_tag('tardigrade\tNN\n', vocabulary))
print(get_word_tag('scrutinize\tVB\n', vocabulary))

In [None]:
# print(vocabulary)

In [None]:
with open("WSJ_02-21.pos", 'r') as f:
    training_set = f.readlines()

# print(training_set)

In [None]:
def preprocess(vocab, data_fp):
    """
    Preprocess data
    """
    orig = []
    prep = []

    # Read data
    with open(data_fp, "r") as data_file:

        for cnt, word in enumerate(data_file):

            # End of sentence
            if not word.split():
                orig.append(word.strip())
                word = "--n--"
                prep.append(word)
                continue

            # Handle unknown words
            elif word.strip() not in vocab:
                orig.append(word.strip())
                word = assign_unknow(word)
                prep.append(word)
                continue

            else:
                orig.append(word.strip())
                prep.append(word.strip())

    assert(len(orig) == len(open(data_fp, "r").readlines()))
    assert(len(prep) == len(open(data_fp, "r").readlines()))

    return orig, prep

In [None]:
vocabulary_with_index = {}

for i, word in enumerate(sorted(vocabulary)):
    vocabulary_with_index[word] = i

# vocabulary_with_index

In [None]:
with open("WSJ_24.pos", 'r') as f:
    test_set = f.readlines()

for i in test_set:
    print(i)

In [None]:
_, test_set_without_tag = preprocess(vocabulary_with_index, "test.words")
print(test_set_without_tag[0:10])

## Build MFC

### Create 3 dictionaries:
 - transition dictionary: maps (prev_tag, tag) to the number of times it has appeared
 - emission_dictionary: maps (tag, word) to the number of times it happended
 - tag_dictionary: maps (tag) to the number of times it has occured 

In [None]:
### pass into an training set and vocabulary then return 3 dictionaries above

def create_dictionaries(training_set, vocabulary):
    transition_dictionary = defaultdict(int)
    emission_dictionary = defaultdict(int)
    tag_dictionary = defaultdict(int)
    
    # define tag for the begining state (begining tag)
    prev_tag = '--s--'

    i = 0

    for word_tag in training_set:
        i += 1
        
        if i % 50000 == 0:
            print(f"word count = {i}") 

        if word_tag == '\n':
            prev_tag = '--s--'
            tag_dictionary['--s--'] += 1
            continue

        word, tag = get_word_tag(word_tag, vocabulary)

        transition_dictionary[(prev_tag, tag)] += 1

        emission_dictionary[(tag, word)] += 1

        tag_dictionary[tag] += 1

        prev_tag = tag

    return transition_dictionary, emission_dictionary, tag_dictionary

In [None]:
transition_dictionary, emission_dictionary, tag_dictionary = create_dictionaries(training_set, vocabulary_with_index)

In [None]:
states = sorted(tag_dictionary.keys())
print('amount of states: ', len(states))
print('list of states: ')
print(states)

In [195]:
for ex in list(transition_dictionary.items())[:10]:
    print(ex)

(('--s--', 'IN'), 5050)
(('IN', 'DT'), 32364)
(('DT', 'NNP'), 9044)
(('NNP', 'CD'), 1752)
(('CD', 'NN'), 7377)
(('NN', 'IN'), 32885)
(('IN', '``'), 546)
(('``', 'DT'), 1014)
(('DT', 'NN'), 38873)
(('NN', "''"), 686)


In [None]:
for ex in list(emission_dictionary.items())[:10]:
    print(ex)

In [None]:
for word, count in emission_dictionary.items():
    if word[1] == 'review':
        print(word, count) 

### Create predict function

In [209]:
def predict(test_set_without_tag, test_set, emission_dictionary, vocabulary, states):
    correct = 0

    all_words = set(emission_dictionary.keys())

    total = len(test_set)
    for word, word_with_tag in zip(test_set_without_tag, test_set):
        item = word_with_tag.split()

        if len(item) == 2:
            true_label = item[1]
        else:
            continue

        count_final = 0
        pos_final = ''

        if word in vocabulary:
            for pos in states:
                key = (pos, word)

                if key in emission_dictionary:
                    count = emission_dictionary[key]

                    if count > count_final:
                        count_final = count
                        pos_final = pos
            if pos_final == true_label:
                correct += 1

    accuracy = correct / total
    return accuracy    

In [210]:
accuracy = predict(test_set_without_tag, test_set, emission_dictionary, vocabulary_with_index, states)
print(accuracy)

0.8888563993099213
