# POS_TAGGER
## - step 1: Read and preprocess the dataset
## - step 2: Build a Most Frequent Class (MFC) tagger to use as a       baseline
## - step 3: Build an HMM POS Tagger
## - step 4:

In [1]:
import matplotlib.pyplot as plt 
import numpy as np 
from itertools import chain
from collections import Counter, defaultdict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
import pandas as pd
import sklearn as skl 


In [2]:
import string
from collections import defaultdict

## Working with data file and create vocabulary

### Read data file

In [3]:
with open("WSJ_02-21.pos", 'r') as f:
    lines = f.readlines()

In [4]:
for i in range(20):
    print(lines[i])

In	IN

an	DT

Oct.	NNP

19	CD

review	NN

of	IN

``	``

The	DT

Misanthrope	NN

''	''

at	IN

Chicago	NNP

's	POS

Goodman	NNP

Theatre	NNP

(	(

``	``

Revitalized	VBN

Classics	NNS

Take	VBP



In [5]:
words = [line.split('\t')[0] for line in lines]
# words

### Create vocabulary file

In [6]:
freq = defaultdict(int)

for word in words:
    freq[word] +=1

In [7]:
freq['more']

1870

In [8]:
vocabulary = [k for k, v in freq.items() if (v > 1 and k != '\n')]

In [9]:
vocabulary.sort()

In [10]:
vocabulary_file = open('vocabulary_file.txt', "w+")
for i in vocabulary:
    vocabulary_file = open('vocabulary_file.txt', "a+")
    vocabulary_file.writelines(i + '\n')
# for i in range(0,10):
#     print(vocabulary[i])

### Processing new text sources

#### working words don't exist in the vocabulary

In [11]:
def assign_unknow(word):

    #punctuation characters
    punct = set(string.punctuation)

    #suffixes
    noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
    verb_suffix = ["ate", "ify", "ise", "ize"]
    adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
    adv_suffix = ["ward", "wards", "wise"]
    
    if any(char.isdigit() for char in word):
        return "--unknow_digit--"

    elif any(char in punct for char in word):
        return "--unknow_punct--"

    elif any(char.isupper() for char in word):
        return "--unknow_upper"

    elif any(word.endswith(suffix) for suffix in noun_suffix):
        return "--unknow_noun--" 
    
    elif any(word.endswith(suffix) for suffix in verb_suffix):
        return "--unknow_verb--"

    elif any(word.endswith(suffix) for suffix in adj_suffix):
        return "--unknow_adj--"
    
    elif any(word.endswith(suffix) for suffix in adv_suffix):
        return "--unknow_adv--"

    return "--unknow--"

### Getting the correct tag for a word

In [12]:
def get_word_tag(line, vocabulary):
    if not line.split():
        word = "--n--"
        tag = "--s--"
    else:
        word, tag = line.split()
        if word not in vocabulary:
            word = assign_unknow(word)

    return word, tag 

In [13]:
print(get_word_tag('\n', vocabulary))
print(get_word_tag('In\tIN\n', vocabulary))
print(get_word_tag('tardigrade\tNN\n', vocabulary))
print(get_word_tag('scrutinize\tVB\n', vocabulary))

('--n--', '--s--')
('In', 'IN')
('--unknow--', 'NN')
('--unknow_verb--', 'VB')


## Working with tags and Numpy

In [14]:
import numpy as np 
import pandas as pd

### test with only 3 tags (RB, NN, TO)

In [15]:
tags = ['RB', 'NN', 'TO']

### testing with transition_counts, which counts the number of times a particular tag happend next to another. The keys of dictionary have the form (previous_tag, tag) and the values are the frequency of occurrences. The trainsition_dictionary just works with tags only

In [16]:
# for example: define a transition_counts dictionary with random value:
transition_counts = {
    ('NN', 'NN'): 16241,
    ('RB', 'RB'): 2263,
    ('TO', 'TO'): 2,
    ('NN', 'TO'): 5256,
    ('RB', 'TO'): 855,
    ('TO', 'NN'): 734,
    ('NN', 'RB'): 2431,
    ('RB', 'NN'): 358,
    ('TO', 'RB'): 200
}


### Using numpy of matrix creation:

In [17]:
num_tags = len(tags)
print(num_tags)

transition_matrix = np.zeros((num_tags, num_tags))

print(transition_matrix)
print(transition_matrix.shape)

3
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(3, 3)


In [18]:
sorted_tags = sorted(tags)

sorted_tags

['NN', 'RB', 'TO']

In [19]:
for i in range(num_tags):
    for j in range(num_tags):
        tag_tuple = (sorted_tags[i], sorted_tags[j])

        transition_matrix[i, j] = transition_counts.get(tag_tuple)

transition_matrix

array([[1.6241e+04, 2.4310e+03, 5.2560e+03],
       [3.5800e+02, 2.2630e+03, 8.5500e+02],
       [7.3400e+02, 2.0000e+02, 2.0000e+00]])

In [20]:
def print_transition_matrix(matrix):
    print(pd.DataFrame(matrix, index = sorted_tags, columns = sorted_tags))

print_transition_matrix(transition_matrix)

         NN      RB      TO
NN  16241.0  2431.0  5256.0
RB    358.0  2263.0   855.0
TO    734.0   200.0     2.0


### Working with Numpy for matrix manipulation

In [21]:
rows_sum = transition_matrix.sum(axis = 1, keepdims = True)

rows_sum

array([[23928.],
       [ 3476.],
       [  936.]])

In [22]:
# normalize the matrix 
rows_sum = transition_matrix.sum(axis = 1, keepdims = True)

transition_matrix = transition_matrix / rows_sum

print_transition_matrix(transition_matrix)

          NN        RB        TO
NN  0.678745  0.101596  0.219659
RB  0.102992  0.651036  0.245972
TO  0.784188  0.213675  0.002137


In [23]:
### Create rules use for unknown word

noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
verb_suffix = ["ate", "ify", "ise", "ize"]
adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
adv_suffix = ["ward", "wards", "wise"]

In [24]:
import string
punct = set(string.punctuation)

In [25]:
def assign_unknow(word):

    noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
    verb_suffix = ["ate", "ify", "ise", "ize"]
    adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
    adv_suffix = ["ward", "wards", "wise"]

    if any(char.isdigit() for char in word):
        return "--unknow_digit--"

    elif any(char in punct for char in word):
        return "--unknow_punct--"

    elif any(char.isupper() for char in word):
        return "--unknow_upper--"

    elif any(word.endswith(suffix) for suffix in noun_suffix):
        return "--unknow_noun--"

    elif any(word.endswith(suffix) for suffix in verb_suffix):
        return "--unknow_verb--"

    elif any(word.endswith(suffix) for suffix in adj_suffix):
        return "--unknow_adj--"

    elif any(word.endswith(suffix) for suffix in adv_suffix):
        return "--unknow_adv--"

    return "--unknow--"

In [26]:
def get_word_tag(line, vocab):
    if not line.split():
        word = "--n--"
        tag = "--s--"
    else:
        word, tag = line.split()
        if word not in vocab:
            word = assign_unknow(word)
        return word, tag
    return None

In [27]:
with open("hmm_vocab.txt", 'r') as f:
    vocabulary = f.read().split('\n')


print(get_word_tag('\n', vocabulary))
print(get_word_tag('In\tIN\n', vocabulary))
print(get_word_tag('tardigrade\tNN\n', vocabulary))
print(get_word_tag('scrutinize\tVB\n', vocabulary))

None
('In', 'IN')
('--unknow--', 'NN')
('--unknow_verb--', 'VB')


In [28]:
# print(vocabulary)

In [29]:
with open("WSJ_02-21.pos", 'r') as f:
    training_set = f.readlines()

# print(training_set)

In [30]:
def preprocess(vocab, data_fp):
    """
    Preprocess data
    """
    orig = []
    prep = []

    # Read data
    with open(data_fp, "r") as data_file:

        for cnt, word in enumerate(data_file):

            # End of sentence
            if not word.split():
                orig.append(word.strip())
                word = "--n--"
                prep.append(word)
                continue

            # Handle unknown words
            elif word.strip() not in vocab:
                orig.append(word.strip())
                word = assign_unknow(word)
                prep.append(word)
                continue

            else:
                orig.append(word.strip())
                prep.append(word.strip())

    assert(len(orig) == len(open(data_fp, "r").readlines()))
    assert(len(prep) == len(open(data_fp, "r").readlines()))

    return orig, prep

In [31]:
vocabulary_with_index = {}

for i, word in enumerate(sorted(vocabulary)):
    vocabulary_with_index[word] = i

# vocabulary_with_index

In [32]:
with open("WSJ_24.pos", 'r') as f:
    test_set = f.readlines()

for i in test_set:
    print(i)

N

of	IN

syndication	NN

.	.



Under	IN

current	JJ

rules	NNS

,	,

even	RB

when	WRB

a	DT

network	NN

fares	VBZ

well	RB

with	IN

a	DT

100%-owned	JJ

series	NN

--	:

ABC	NNP

,	,

for	IN

example	NN

,	,

made	VBD

a	DT

killing	NN

in	IN

broadcasting	VBG

its	PRP$

popular	JJ

crime\/comedy	NN

``	``

Moonlighting	NNP

''	''

--	:

it	PRP

is	VBZ

n't	RB

allowed	VBN

to	TO

share	VB

in	IN

the	DT

continuing	VBG

proceeds	NNS

when	WRB

the	DT

reruns	NNS

are	VBP

sold	VBN

to	TO

local	JJ

stations	NNS

.	.



Instead	RB

,	,

ABC	NNP

will	MD

have	VB

to	TO

sell	VB

off	RP

the	DT

rights	NNS

for	IN

a	DT

one-time	JJ

fee	NN

.	.



The	DT

networks	NNS

admit	VBP

that	IN

the	DT

chances	NNS

of	IN

getting	VBG

the	DT

relief	NN

they	PRP

want	VBP

are	VBP

slim	JJ

--	:

for	IN

several	JJ

years	NNS

at	IN

the	DT

least	JJS

.	.



Six	CD

years	NNS

ago	IN

they	PRP

were	VBD

tantalizingly	RB

close	JJ

.	.



The	DT

Reagan-era	NNP

Federal	NNP

Communicat

In [33]:
_, test_set_without_tag = preprocess(vocabulary_with_index, "test.words")
print(test_set_without_tag[0:10])

['The', 'economy', "'s", 'temperature', 'will', 'be', 'taken', 'from', 'several', '--unknow--']


## Build MFC

### Create 3 dictionaries:
 - transition dictionary: maps (prev_tag, tag) to the number of times it has appeared
 - emission_dictionary: maps (tag, word) to the number of times it happended
 - tag_dictionary: maps (tag) to the number of times it has occured 

In [34]:
### pass into an training set and vocabulary then return 3 dictionaries above

def create_dictionaries(training_set, vocabulary):
    transition_dictionary = defaultdict(int)
    emission_dictionary = defaultdict(int)
    tag_dictionary = defaultdict(int)
    
    # define tag for the begining state (begining tag)
    prev_tag = '--s--'

    i = 0

    for word_tag in training_set:
        i += 1
        
        if i % 50000 == 0:
            print(f"word count = {i}") 

        if word_tag == '\n':
            prev_tag = '--s--'
            tag_dictionary['--s--'] += 1
            continue

        word, tag = get_word_tag(word_tag, vocabulary)

        transition_dictionary[(prev_tag, tag)] += 1

        emission_dictionary[(tag, word)] += 1

        tag_dictionary[tag] += 1

        prev_tag = tag

    return transition_dictionary, emission_dictionary, tag_dictionary

In [35]:
transition_dictionary, emission_dictionary, tag_dictionary = create_dictionaries(training_set, vocabulary_with_index)

word count = 50000
word count = 100000
word count = 150000
word count = 200000
word count = 250000
word count = 300000
word count = 350000
word count = 400000
word count = 450000
word count = 500000
word count = 550000
word count = 600000
word count = 650000
word count = 700000
word count = 750000
word count = 800000
word count = 850000
word count = 900000
word count = 950000


In [36]:
states = sorted(tag_dictionary.keys())
print('amount of states: ', len(states))
print('list of states: ')
print(states)

amount of states:  46
list of states: 
['#', '$', "''", '(', ')', ',', '--s--', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']


In [37]:
for ex in list(transition_dictionary.items())[:10]:
    print(ex)

(('--s--', 'IN'), 5050)
(('IN', 'DT'), 32364)
(('DT', 'NNP'), 9044)
(('NNP', 'CD'), 1752)
(('CD', 'NN'), 7377)
(('NN', 'IN'), 32885)
(('IN', '``'), 546)
(('``', 'DT'), 1014)
(('DT', 'NN'), 38873)
(('NN', "''"), 686)


In [38]:
for ex in list(emission_dictionary.items())[:10]:
    print(ex)

(('IN', 'In'), 1735)
(('DT', 'an'), 3142)
(('NNP', 'Oct.'), 317)
(('CD', '19'), 100)
(('NN', 'review'), 36)
(('IN', 'of'), 22925)
(('``', '``'), 6967)
(('DT', 'The'), 6795)
(('NN', 'Misanthrope'), 3)
(("''", "''"), 6787)


In [39]:
for word, count in emission_dictionary.items():
    if word[1] == 'review':
        print(word, count) 

('NN', 'review') 36
('VB', 'review') 21
('VBP', 'review') 1


### Create predict function

In [40]:
def predict(test_set_without_tag, test_set, emission_dictionary, vocabulary, states):

    # define the number of correction word
    correct = 0

    # define a variable get all words in the training_set
    all_words = set(emission_dictionary.keys())

    # define a value get number of words in test set
    total = len(test_set)

    # predict
    for word, word_with_tag in zip(test_set_without_tag, test_set):

        # get word and tag in the test_set with tag and split it
        item = word_with_tag.split()

        # get the true label in test_set
        if len(item) == 2:
            true_label = item[1]
        else:
            continue
        
        # define a variable with the value is the time they appear with a tag
        count_final = 0
        pos_final = ''

        #search word in vocabulary
        if word in vocabulary:
            # predict base on the time it appears with a tag 
            for pos in states:
                key = (pos, word)

                if key in emission_dictionary:
                    count = emission_dictionary[key]

                    if count > count_final:
                        count_final = count
                        pos_final = pos
            if pos_final == true_label:
                correct += 1

    accuracy = correct / total
    return accuracy    

In [41]:
accuracy = predict(test_set_without_tag, test_set, emission_dictionary, vocabulary_with_index, states)
print(accuracy)

0.8888563993099213


## Build HMM for POS

### Build matrices:
- Creating the 'A' transition probabilites matrix
- Creating the 'B' emission probabilites matrix

#### The 'A' transition probabilities matrix


In [51]:
def create_transition_probabilities_matrix(alpha, tag_dictionary, transition_dictionary):
    all_tags = sorted(tag_dictionary.keys())

    num_tags = len(all_tags)

    A = np.zeros((num_tags, num_tags))

    trans_key = set(transition_dictionary.keys())

    for i in range(num_tags):
        for j in range(num_tags):
            count = 0

            key = (all_tags[i], all_tags[j])

            if key in transition_dictionary:
                count = transition_dictionary[key]
            
            count_prev = tag_dictionary[all_tags[i]]

            A[i, j] = (count +alpha)/ (count_prev + alpha * num_tags)
    return A

In [74]:
alpha = 0.001

A = create_transition_probabilities_matrix(alpha, tag_dictionary, transition_dictionary)

A_dataframe = pd.DataFrame(A, index=states, columns = states)
print(A_dataframe)
A_dataframe.to_csv('transition_probabilities_matrix.csv')

                  #             $            ''             (             )  \
#      7.039973e-06  7.039973e-06  7.039973e-06  7.039973e-06  7.039973e-06   
$      1.356476e-07  1.356476e-07  1.356476e-07  1.356476e-07  1.356476e-07   
''     1.445286e-07  1.446731e-04  6.937517e-03  6.792989e-03  5.058645e-03   
(      7.320398e-07  1.691019e-01  7.320398e-07  7.320398e-07  7.320398e-07   
)      7.267199e-07  7.274466e-04  7.267199e-07  7.274466e-04  7.267199e-07   
,      2.052248e-08  1.559729e-03  5.738088e-02  3.078578e-04  2.052248e-08   
--s--  2.513052e-05  7.029767e-04  4.017117e-04  3.615205e-03  2.513052e-05   
.      2.533053e-08  2.533053e-08  5.932414e-02  1.342544e-03  5.395429e-03   
:      4.193170e-04  2.703264e-02  1.257532e-03  6.288707e-04  2.095537e-07   
CC     1.670770e-04  1.971020e-02  4.175880e-08  3.758710e-04  4.180056e-05   
CD     2.734628e-08  1.094125e-04  3.828753e-04  1.668150e-03  8.614105e-03   
DT     1.588548e-04  9.249537e-03  3.666819e-05  5.2