In [11]:
import os
import xml.etree.ElementTree as ET

file_append = []

for subdir, dirs, files in os.walk(r'../Train-corpus'):
    for filename in files:
        filepath = subdir + os.sep + filename
        mytree = ET.parse(filepath)
        root = mytree.getroot()

        # Handled mw tags
        for element in root.iter('mw'):
            mw_words = ""

        # strip the text field
            for words in element.iter('w'):
                mw_words += words.text
            tag = element.attrib.get('c5')
            file_append.append(mw_words.strip() + "_" + tag)

        # Handled w tags
        for element in root.iter('w'):
            word = element.text
            tag = element.attrib.get('c5')
            tag_list = tag.split('-')
            for tags in tag_list:
                file_append.append(word.strip() + "_" + tags)

        # Handled c tags
        for element in root.iter('c'):
            file_append.append(element.text.strip() + "_" + element.attrib.get('c5'))


# mw tags i.e. multiword there are tags init
# c tags i.e. puctuations
# if tag1-tag2 then word_tag1, word_tag2
# Handle the spacing 
# Don't take words from hw tags

In [12]:
myDict = {}

for word in file_append:
    if (word in myDict):
        myDict[word] += 1
    else:
        myDict[word] = 1


In [13]:
import operator

word_dict = {}
tag_dict = {}

for element in file_append:
    temp = element.split("_")
    word = temp[0].strip()
    tag = temp[1]

    if (word in word_dict):
        word_dict[word] += 1
    else: 
        word_dict[word] = 1

    if (tag in tag_dict):
        tag_dict[tag] += 1
    else: 
        tag_dict[tag] = 1

word_dict_ordered = dict( sorted(word_dict.items(), key=operator.itemgetter(1),reverse=True))
tag_dict_ordered = dict( sorted(tag_dict.items(), key=operator.itemgetter(1),reverse=True))

In [14]:
probability_of_word_given_tag = {}

# P(Word|Tag) = P(Word_Tag)/P(Tag) = Count(Word_Tag)/Count(Tag)

for word in word_dict.keys():
    for tag in tag_dict.keys():
        temp = word+"_"+tag
        temp1 = word+"|"+tag
        if (temp in myDict):
            probability_of_word_given_tag[temp1] = myDict[temp] / tag_dict[tag]
        else:
            probability_of_word_given_tag[temp1] = 0

In [15]:
def predict( word ):
    
    if word not in word_dict:
        return "UNC"
    mx_p = -1
    tag_gen = ""
    for tags in tag_dict.keys():
        word_given_tag = word + "|" + tags
        P = 0
        if word in word_dict:
            P = (float(probability_of_word_given_tag[word_given_tag]) * int(
                tag_dict[tags])) / int(word_dict[word])
        if P > mx_p:
            mx_p = P
            tag_gen = tags
    return tag_gen


In [21]:
import glob
test_files = glob.glob("../Test-corpus/*/*.xml")
print(test_files[0])

In [17]:
ref = []
tagged = []

correct = 0
total = 0
mw_correct = 0
mw_total = 0
p_correct = 0
p_total = 0

for test_file in test_files:
    mytree1 = ET.parse(test_file)
    root1 = mytree1.getroot()

    """Normal words tag"""
    for element in root1.iter('w'):
        word = element.text.strip()
        tag = element.attrib.get('c5')

        tag_gen = predict(word)
        if tag == tag_gen:
            correct += 1

        if tag.find('-') != -1:
            if tag.split('-')[0] == tag_gen or tag.split('-')[1] == tag_gen:
                correct += 1
                ref.append(tag_gen)
            else:
                ref.append(tag)
        else:
            ref.append(tag)
        total += 1
        tagged.append(tag_gen)


    """Multi-words"""
    for element in root1.iter("mw"):
        word = ""
        for word_mw in element.iter('w'):
            word += word_mw.text
        if word[-1] == " ":
            word = word[0:-1]
        tag = element.attrib.get('c5')
        tag_gen = predict(word)
        if tag == tag_gen:
            mw_correct += 1

        if tag.find('-') != -1:
            if tag.split('-')[0] == tag_gen or tag.split('-')[1] == tag_gen:
                mw_correct += 1
                ref.append(tag_gen)
            else:
                ref.append(tag)
        else:
            ref.append(tag)
        mw_total += 1
        tagged.append(tag_gen)

    """Punctuations"""
    for element in root1.iter("c"):
        try:
            word = element.text.strip()
            tag = element.attrib.get('c5')
            tag_gen = predict(word)
            if tag == tag_gen:
                p_correct += 1

            if tag.find('-') != -1:
                if tag.split('-')[0] == tag_gen or tag.split('-')[1] == tag_gen:
                    p_correct += 1
                    ref.append(tag_gen)
                else:
                    ref.append(tag)
            else:
                ref.append(tag)
            p_total += 1
            tagged.append(tag_gen)
        except:
            print("")

correct += mw_correct + p_correct
total += mw_total + p_total
print(correct,total)
print(correct/total)

# Correct words: 3818421
# Total words: 4159247
# 0.9180558403961102

In [18]:
from nltk.metrics import ConfusionMatrix
import sys 

stdoutOrigin=sys.stdout 
sys.stdout = open("../generatedFiles/Confusion_matrix.txt", "w")
cm = ConfusionMatrix(ref, tagged)
print(cm)
sys.stdout.close()
sys.stdout=stdoutOrigin

In [19]:
from collections import Counter  
labels = list(tag_dict.keys())
labels.append('UNC')

true_positives = Counter()
false_negatives = Counter()
false_positives = Counter()

for i in labels:
    for j in labels:
        if i == j:
            true_positives[i] += cm[i,j]
        else:
            false_negatives[i] += cm[i,j]
            false_positives[j] += cm[i,j]

print ("TP:", sum(true_positives.values()))
print ("FN:", sum(false_negatives.values()))
print ("FP:", sum(false_positives.values()))
TP=sum(true_positives.values())
FP=sum(false_positives.values())
FN=sum(false_negatives.values())


precision = TP / float(TP+FP)
recall = TP / float(TP+FN)
fscore = 2 * (precision * recall) / float(precision + recall)
print( fscore , precision , recall)

In [19]:
import csv
tag_csv=csv.reader(open("../generatedFiles/tag_freq.csv",'r',encoding="utf8"))
tag_list=[]
for tags in tag_csv:
    tag_list.append(tags[0])
# print(len(tag_list))
tag_number={}
tag_number['<S>']=0
tag_number['<E>']=0
i=1
for tags in tag_list:
    tag_number[tags]=i
    i+=1
# print(temp_dict)

In [6]:
import numpy as np 
transition_matrix=np.zeros((62,62))


In [12]:
def increment(curr,prev):
    if '-' in prev:
        compound_prev=prev.split('-')
        if '-' in curr:
            compound_curr=curr.split('-')
            transition_matrix[tag_number[compound_prev[0]]][tag_number[compound_curr[0]]]+=1
            transition_matrix[tag_number[compound_prev[0]]][tag_number[compound_curr[1]]]+=1
            transition_matrix[tag_number[compound_prev[1]]][tag_number[compound_curr[0]]]+=1
            transition_matrix[tag_number[compound_prev[1]]][tag_number[compound_curr[1]]]+=1
        else:
            transition_matrix[tag_number[compound_prev[0]]][tag_number[curr]]+=1
            transition_matrix[tag_number[compound_prev[1]]][tag_number[curr]]+=1
    else:
        if '-' in curr:
            compound_curr=curr.split('-')
            transition_matrix[tag_number[prev]][tag_number[compound_curr[0]]]+=1
            transition_matrix[tag_number[prev]][tag_number[compound_curr[1]]]+=1
        else:
            transition_matrix[tag_number[prev]][tag_number[curr]]+=1
            

In [13]:
import glob
import xml.etree.ElementTree as ET 
files = glob.glob("../Train-corpus/*/*.xml")
for filepath in files:
    # print(filepath)
    root=ET.parse(filepath).getroot()
    for s_tag in root.iter('s'):
        prev='<S>'
        curr=''
        end='<E>'
        for tags in s_tag:
            if('c5' not in tags.attrib):
                for u_tag in tags:
                    curr=u_tag.attrib.get('c5')
                    increment(curr,prev)
                    prev=curr
            elif tags.tag == 'mw':
                for w in tags.iter('w'):
                    curr = w.attrib.get('c5')
                    increment(curr, prev)
                    prev = curr
            else:
                curr=tags.attrib.get('c5')
                increment(curr,prev)
                prev=curr
        increment(end,prev)
    break

In [6]:
print(transition_matrix[0])

[  0.  40.  10. 154. 127.  27.  10. 101. 124. 362.  74.   2.   4.   1.
   0.  59.  21.  16.  16.   2.  13.  84.  63.   4.   2.   2.   2.   0.
   2.   0.   3.   0.   0.  16.   0.   0.   2.   0.   0.  25.   9.   0.
   0.  12.   0.   0.  22.   0.   0.   1.   2.   4.   3.   0.  10.   0.
   0.   1.   1.   0.   0.   0.]


In [14]:
for row in range(len(transition_matrix)):
    s=np.sum(transition_matrix[row])
    if s!=0.0:
        for i in range(len(transition_matrix[row])):
            transition_matrix[row][i]/=s

In [21]:
print(tag_list)
print(tag_number)

['NN1', 'PUN', 'AT0', 'PRP', 'AJ0', 'NN2', 'NP0', 'AV0', 'PNP', 'CJC', 'PRF', 'VVN', 'VVD', 'VVI', 'DT0', 'PUQ', 'VVB', 'CRD', 'TO0', 'VVG', 'CJS', 'DPS', 'VM0', 'VBD', 'VBZ', 'AVP', 'VVZ', 'CJT', 'POS', 'XX0', 'NN0', 'VBI', 'DTQ', 'VBB', 'VHD', 'ORD', 'VHZ', 'PUR', 'PUL', 'PNI', 'UNC', 'VHB', 'AVQ', 'VBN', 'PNQ', 'EX0', 'AJC', 'VHI', 'PNX', 'AJS', 'VDB', 'VDD', 'VBG', 'ITJ', 'VDI', 'ZZ0', 'VDZ', 'VHG', 'VDN', 'VHN', 'VDG']
{'<S>': 0, '<E>': 0, 'NN1': 1, 'PUN': 2, 'AT0': 3, 'PRP': 4, 'AJ0': 5, 'NN2': 6, 'NP0': 7, 'AV0': 8, 'PNP': 9, 'CJC': 10, 'PRF': 11, 'VVN': 12, 'VVD': 13, 'VVI': 14, 'DT0': 15, 'PUQ': 16, 'VVB': 17, 'CRD': 18, 'TO0': 19, 'VVG': 20, 'CJS': 21, 'DPS': 22, 'VM0': 23, 'VBD': 24, 'VBZ': 25, 'AVP': 26, 'VVZ': 27, 'CJT': 28, 'POS': 29, 'XX0': 30, 'NN0': 31, 'VBI': 32, 'DTQ': 33, 'VBB': 34, 'VHD': 35, 'ORD': 36, 'VHZ': 37, 'PUR': 38, 'PUL': 39, 'PNI': 40, 'UNC': 41, 'VHB': 42, 'AVQ': 43, 'VBN': 44, 'PNQ': 45, 'EX0': 46, 'AJC': 47, 'VHI': 48, 'PNX': 49, 'AJS': 50, 'VDB': 51,

In [24]:
def Viterbi(word_list):
    pred_tag = []
    prev_prob_list = np.ones(61)
    for word in word_list:
        
    

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [17]:
import glob
import xml.etree.ElementTree as ET 
# files = glob.glob("../Train-corpus/*/*.xml")
# for filepath in files:

filepath = "../Train-corpus/A1/A1A.xml"
root=ET.parse(filepath).getroot()
for s_tag in root.iter('s'):
    word_list = []
    tag_list = []
    for tags in s_tag:
        if('c5' not in tags.attrib):
            for u_tag in tags:
                curr_tag = u_tag.attrib.get('c5')
                curr_word = u_tag.text 
                tag_list.append(curr_tag)
                word_list.append(curr_word)
        else:
            curr_tag = u_tag.attrib.get('c5')
            curr_word = u_tag.text 
            tag_list.append(curr_tag)
            word_list.append(curr_word)
    pred_tag_list = Viterbi(word)

        


['THEORY ', 'AMONG ', 'THE ', 'ENGLISH ']
['NN1', 'PRP', 'AT0', 'NN1']
