In [28]:
import numpy as np
import sklearn
from nltk.corpus import brown
from sklearn.model_selection import train_test_split

class HMM:
    """First-order hidden Markov model tagger decoded with Viterbi.

    Every stored sentence is a list of ``(word, tag)`` pairs that starts
    with the artificial pair ``("^", "^")`` marking the sentence start.

    Typical use::

        model = HMM()
        model.split()
        model.extract_prob()
        tags = model.viterbi(["^", "The", "dog", "barks"])

    Attributes:
        tagged_data: all sentences, each prefixed with ``("^", "^")``.
        train_corpus, test_corpus: the partitions produced by ``split``.
        all_tags: tags seen in ``train_corpus``, in first-seen order.
        all_words: set of words seen in ``train_corpus``.
        tags_to_num, num_to_tags: tag <-> index into ``all_tags``.
        transition: ``(t1, t2) -> count(t1 followed by t2) / count(t1)``.
        emission: ``(word, tag) -> count(word tagged tag) / count(tag)``.
    """

    def __init__(self, train_test_ratio=0.2, tagged_sentences=None):
        """Load the corpus and prefix every sentence with the start marker.

        Args:
            train_test_ratio: default test fraction used by ``split``.
            tagged_sentences: optional iterable of sentences, each an
                iterable of ``(word, tag)`` pairs. When omitted, the Brown
                corpus with the ``universal`` tagset is loaded.
        """
        self.train_test_ratio = train_test_ratio
        self.tagged_data = []
        self.train_corpus = []
        self.test_corpus = []
        self._reset_parameters()

        if tagged_sentences is None:
            tagged_sentences = brown.tagged_sents(tagset='universal')
        for sent in tagged_sentences:
            self.tagged_data.append([("^", "^")] + list(sent))

    def _reset_parameters(self):
        """Clear every table that ``extract_prob`` fills in."""
        self.all_tags = []
        self.all_words = set()
        self.tags_to_num = {}
        self.num_to_tags = {}
        self.transition = {}
        self.emission = {}

    def split(self, train_test_ratio=None):
        """Split ``tagged_data`` into ``train_corpus`` and ``test_corpus``.

        Args:
            train_test_ratio: value passed as ``test_size`` to
                ``train_test_split``. ``None`` means "use the ratio given
                to the constructor" (0.2 unless overridden there).
        """
        if train_test_ratio is None:
            train_test_ratio = self.train_test_ratio
        self.train_corpus, self.test_corpus = train_test_split(
            self.tagged_data, test_size=train_test_ratio)

    def extract_prob(self):
        """Estimate transition and emission tables from ``train_corpus``.

        All previously estimated tables are discarded first, so calling
        this method again after a new ``split`` does not accumulate counts.
        """
        self._reset_parameters()

        # One pass collects the tag inventory, the vocabulary, the tag
        # frequencies and the (word, tag) frequencies.
        tags_freq = {}
        word_tag_freq = {}
        for sent in self.train_corpus:
            for (w, t) in sent:
                if t not in tags_freq:
                    tags_freq[t] = 0
                    self.all_tags.append(t)  # keeps first-seen order
                tags_freq[t] += 1
                self.all_words.add(w)
                word_tag_freq[(w, t)] = word_tag_freq.get((w, t), 0) + 1

        # tag to number and vice-versa
        for idx, t in enumerate(self.all_tags):
            self.tags_to_num[t] = idx
            self.num_to_tags[idx] = t

        # Transition counts over adjacent tags inside each sentence.
        for t1 in self.all_tags:
            for t2 in self.all_tags:
                self.transition[(t1, t2)] = 0
        for sent in self.train_corpus:
            for i in range(len(sent) - 1):
                self.transition[(sent[i][1], sent[i + 1][1])] += 1

        # Normalise the counts by the frequency of the conditioning tag.
        for (t1, t2) in self.transition:
            self.transition[(t1, t2)] /= tags_freq[t1]
        # The emission table is dense: every (known word, tag) pair gets an
        # entry, with 0 for pairs that never occurred together.
        for w in self.all_words:
            for t in self.all_tags:
                self.emission[(w, t)] = word_tag_freq.get((w, t), 0) / tags_freq[t]

    def viterbi(self, sentence):
        """Return the most probable tag sequence for ``sentence``.

        Args:
            sentence: list of words. Position 0 is always treated as the
                start marker: the ``"^"`` tag gets score 1 there, whatever
                ``sentence[0]`` contains.

        Returns:
            A list of tags with the same length as ``sentence`` (an empty
            list for an empty sentence).

        Raises:
            KeyError: if ``"^"`` is not among the trained tags, e.g. when
                ``extract_prob`` has not been run.
        """
        len_sent = len(sentence)
        if len_sent == 0:
            return []
        len_tagset = len(self.all_tags)

        # SEQSCORE[tag][pos]: best score of a path ending in `tag` at `pos`.
        # BACKPTR[tag][pos]: index of the previous tag on that best path.
        SEQSCORE = [[0 for _ in range(len_sent)] for _ in range(len_tagset)]
        BACKPTR = [[0 for _ in range(len_sent)] for _ in range(len_tagset)]

        # Every path starts in the sentence-start tag.
        SEQSCORE[self.tags_to_num["^"]][0] = 1

        for i in range(1, len_sent):
            word_is_known = sentence[i] in self.all_words
            for cidx, ctag in enumerate(self.all_tags):
                optimal_prob = 0  # best score * transition into ctag
                optimal_tag = 0
                for pidx, ptag in enumerate(self.all_tags):
                    prob = SEQSCORE[pidx][i - 1] * self.transition[(ptag, ctag)]
                    if prob > optimal_prob:
                        optimal_prob = prob
                        optimal_tag = pidx

                if word_is_known:
                    SEQSCORE[cidx][i] = optimal_prob * self.emission[(sentence[i], ctag)]
                else:
                    # Unseen word: no emission evidence, so only the
                    # transition score decides.
                    SEQSCORE[cidx][i] = optimal_prob
                BACKPTR[cidx][i] = optimal_tag

        # Pick the best final tag (index 0 when every score is zero).
        best_last = 0
        optimal_prob = 0
        for idx in range(len_tagset):
            if SEQSCORE[idx][len_sent - 1] > optimal_prob:
                optimal_prob = SEQSCORE[idx][len_sent - 1]
                best_last = idx

        # Follow the back-pointers from the last position to the first.
        pred_tags = [best_last for _ in range(len_sent)]
        for i in reversed(range(len_sent - 1)):
            pred_tags[i] = BACKPTR[pred_tags[i + 1]][i + 1]

        return [self.num_to_tags[idx] for idx in pred_tags]


# #Load tag corpus and add the sentence beginner tag
# def process_load():
#     tagged_sentences = brown.tagged_sents(tagset='universal')
#     tagged_data = []
#     for i in range(len(tagged_sentences)):
#         sentence = []
#         sentence.append(("^", "^"))
#         sentence = sentence + tagged_sentences[i]
#         tagged_data.append(sentence)
#     return tagged_data

# #Return the emission and transition probabilities
# def extract_prob(tagged_data):
    
#     #All tags in the set
#     all_tags = []
#     for sentence in tagged_data:
#         for (word, tag) in sentence:
#             if tag not in all_tags:
#                 all_tags.append(tag)
    
#     #Count frequency for each tag
#     tags_freq = {}
#     for tag in all_tags:
#         tags_freq[tag] = 0
#     for sentence in tagged_data:
#         for (word, tag) in sentence:
#             tags_freq[tag] += 1
    
#     #tag to number and vice-versa
#     tags_to_num = {}
#     num_to_tags = {}
#     for i in range(len(all_tags)):
#         tags_to_num[all_tags[i]] = i
#         num_to_tags[i] = all_tags[i]
        
#     #All possible words in the set
#     all_words = set()
#     for sentence in tagged_data:
#         for (word, tag) in sentence:
#             all_words.add(word)
            
#     #Count frequency for each (word, tag)
#     word_tag_freq = {}
#     for w in all_words:
#         for t in all_tags:
#             word_tag_freq[(w, t)] = 0
#     for sentence in tagged_data:
#         for (w, t) in sentence:
#             word_tag_freq[(w, t)] += 1
    
#     #evaluate transition  counts
#     transition = {}
#     for t1 in all_tags:
#         for t2 in all_tags:
#             transition[(t1, t2)] = 0
#     for sentence in tagged_data:
#         for i in range(len(sentence)-1):
#             transition[(sentence[i][1], sentence[i+1][1])] += 1
    
#     #evaluate transition and emission probabilities
#     emission = {}
#     for (t1, t2) in transition.keys():
#         transition[(t1, t2)] /= tags_freq[t1]
#     for (w, t) in word_tag_freq.keys():
#         emission[(w, t)] = word_tag_freq[(w, t)]/tags_freq[t]
#     return transition, emission, all_words, all_tags, tags_to_num, num_to_tags

# #Implements Viterbi Algorithm
# def viterbi(sentence, transition, emission, all_words, all_tags, tags_to_num, num_to_tags):
#     #returns predicted tag sequence
#     len_sent = len(sentence)
#     len_tagset = len(all_tags)
    
#     #SEQSCORE and BACKPTR arrays
#     SEQSCORE = [[0 for i in range(len_sent)] for j in range(len_tagset)]
#     BACKPTR =  [[0 for i in range(len_sent)] for j in range(len_tagset)]
    
#     null_tag = tags_to_num["^"]
#     #initialise the null tag
#     SEQSCORE[null_tag][0] = 1
    
#     for i in range(1, len_sent):#Corresponds to a given word sentence[i]
#         for cidx, ctag in enumerate(all_tags):#Ending at current tag all_tags[j]
            
#             optimal_prob = 0 #Includes transitional probabilites
#             optimal_tag = 0
            
#             for pidx, ptag in enumerate(all_tags):#Previous tag all_tags[k]
#                 prob_k_j_i = SEQSCORE[pidx][i-1]*transition[(ptag, ctag)]
#                 if prob_k_j_i > optimal_prob:
#                     optimal_prob = prob_k_j_i
#                     optimal_tag = pidx
            
#             if sentence[i] in all_words:
#                 SEQSCORE[cidx][i] = optimal_prob*emission[(sentence[i], ctag)]
#             else:
#                 SEQSCORE[cidx][i] = optimal_prob
#             BACKPTR[cidx][i] = optimal_tag
                
#     #Sequence identification step
#     CT = 0
#     optimal_prob = 0
#     for i in range(len_tagset):
#         if SEQSCORE[i][len_sent-1]>optimal_prob:
#             optimal_prob = SEQSCORE[i][len_sent-1]
#             CT = i
    
#     pred_tags = [CT for i in range(len_sent)]
#     for i in reversed(range(len_sent-1)):
#         pred_tags[i] = BACKPTR[pred_tags[i+1]][i+1]
    
#     pred_tags = [num_to_tags[idx] for idx in pred_tags]
#     return pred_tags
    
    
# tagged_corpus = process_load()
# train_corpus, test_corpus = sklearn.model_selection.train_test_split(tagged_corpus, test_size = 0.2)
# transition, emission, all_words, all_tags, tags_to_num, num_to_tags = extract_prob(train_corpus)

# Per-token tagging accuracy on the held-out sentences.
# Expects `test_corpus`, `viterbi` and the trained tables (`transition`,
# `emission`, `all_words`, `all_tags`, `tags_to_num`, `num_to_tags`) to be
# defined already in the session.
# NOTE(review): every token of each sentence is scored, so a leading
# ("^", "^") start marker, if the sentences carry one, counts as a hit.
count_accurate = 0
count_total = 0
for sentence in test_corpus:
    untagged_sentence = [w for (w, t) in sentence]
    predicted_tags = viterbi(untagged_sentence, transition, emission, all_words, all_tags, tags_to_num, num_to_tags)
    for predicted, (_, gold) in zip(predicted_tags, sentence):
        if predicted == gold:
            count_accurate += 1
        count_total += 1
# An empty test set would otherwise divide by zero.
print(count_accurate / count_total if count_total else 0.0)

0.9595177670194431


In [None]:
# Load tagged corpora
def load_data(tagged_sentences=None):
    """Flatten tagged sentences into one list of ``(word, tag)`` pairs.

    A ``("^", "^")`` pair is inserted before the tokens of every sentence
    so sentence boundaries survive the flattening.

    Args:
        tagged_sentences: optional iterable of sentences, each an iterable
            of ``(word, tag)`` pairs. When omitted, the Brown corpus with
            the ``universal`` tagset is loaded.

    Returns:
        A flat list of ``(word, tag)`` pairs.
    """
    if tagged_sentences is None:
        # `brown` is what the file imports (`from nltk.corpus import brown`);
        # the bare name `nltk` is never bound.
        tagged_sentences = brown.tagged_sents(tagset='universal')
    tagged_data = []
    for sentence in tagged_sentences:
        tagged_data.append(("^", "^"))
        tagged_data.extend(sentence)
    return tagged_data
# Find all unique tags and count for each tag
def extract_prob(tagged_data):
    """Estimate HMM transition and emission tables from a flat tagged corpus.

    Args:
        tagged_data: sequence of ``(word, tag)`` pairs.

    Returns:
        ``(transitions, emission)`` where ``transitions[(t1, t2)]`` is the
        number of times ``t2`` directly follows ``t1`` divided by the count
        of ``t1``, and ``emission[(word, tag)]`` is the number of times
        ``word`` carries ``tag`` divided by the count of ``tag``. Both
        tables hold an entry for every combination, 0 where unseen.

    Side effect: prints the sorted array of distinct tags.
    """
    # Distinct tags together with how often each one occurs.
    tagset, count = np.unique([tag for (_, tag) in tagged_data],
                              return_counts=True)
    print(tagset)
    tags_dict = dict(zip(tagset, count))

    # Distinct words.
    words = np.unique([word for (word, _) in tagged_data])

    # (word, tag) co-occurrence counts, zero-initialised for every pair.
    word_tag_freq = {(w, t): 0 for w in words for t in tagset}
    for (w, t) in tagged_data:
        word_tag_freq[(w, t)] += 1

    # Counts of one tag (first) being directly followed by another (second).
    transitions = {(first, second): 0 for first in tagset for second in tagset}
    for pos in range(1, len(tagged_data)):
        first = tagged_data[pos - 1][1]
        second = tagged_data[pos][1]
        transitions[(first, second)] += 1

    # Turn both count tables into relative frequencies per conditioning tag.
    emission = {pair: freq / tags_dict[pair[1]]
                for pair, freq in word_tag_freq.items()}
    transitions = {pair: freq / tags_dict[pair[0]]
                   for pair, freq in transitions.items()}
    return transitions, emission