In [13]:
import numpy as np

In [14]:
def computeEmissions(counts_path='ner_rare.counts'):
    """Compute the emission probabilities e(word | tag) from a counts file.

    Only lines of the form ``<count> WORDTAG <tag> <word>`` are used; every
    other line (for example n-gram count lines or blank lines) is ignored.

    Args:
        counts_path: path of the counts file to read. Defaults to
            'ner_rare.counts' in the current working directory.

    Returns:
        A dict of dicts where ``emissions[word][tag]`` is
        count(word, tag) / count(tag). Every word found in the file gets an
        entry for every tag found in the file; a (word, tag) pair that never
        occurs maps to 0.0.

    Raises:
        IOError / OSError: if the counts file cannot be opened.
        ValueError: if the count field of a WORDTAG line is not an integer.
    """
    # count(word, tag), stored only for the pairs present in the file
    pair_counts = dict()
    # count(tag), summed over every word emitted by that tag
    count_of_tags = dict()

    # a single pass over the file gathers both kinds of counts
    with open(counts_path) as f:
        for line in f:
            tokens = line.strip().split()

            # blank or truncated lines are skipped instead of raising IndexError
            if len(tokens) < 4 or tokens[1] != 'WORDTAG':
                continue

            pair_count = int(tokens[0])
            tag = tokens[2]
            word = tokens[3]

            count_of_tags[tag] = count_of_tags.get(tag, 0) + pair_count

            # accumulate so that a repeated (word, tag) line stays consistent
            # with the tag total computed just above
            counts_for_word = pair_counts.setdefault(word, dict())
            counts_for_word[tag] = counts_for_word.get(tag, 0) + pair_count

    # emissions[word][tag] -> e(word | tag)
    emissions = dict()
    for word, counts_for_word in pair_counts.items():
        emissions[word] = dict()
        for tag, tag_total in count_of_tags.items():
            # by default, if a tag is never assigned to a word, count(x, y) = 0
            emissions[word][tag] = float(counts_for_word.get(tag, 0)) / float(tag_total)

    return emissions


In [15]:
def isRare(emissions, x):
    """Return True when word x has no entry in the emissions table."""
    return x not in emissions

In [16]:
def findTagAndMaxEmission(emissions, x):
    """Return the (tag, emission) pair with the largest emission for word x.

    Args:
        emissions: dict of dicts where ``emissions[word][tag]`` holds
            e(word | tag).
        x: the word to look up; it must be a key of ``emissions``.

    Returns:
        A ``(tag, emission)`` tuple taken from ``emissions[x]``. When several
        tags share the largest emission, the first one in the dict's
        iteration order is returned.

    Raises:
        KeyError: if ``x`` is not a key of ``emissions``.
        ValueError: if ``emissions[x]`` has no tags at all.
    """
    emissions_for_x = emissions[x]
    # items() exists on both Python 2 and Python 3, whereas iteritems() is
    # Python 2 only and raises AttributeError on Python 3
    return max(emissions_for_x.items(),
               key=lambda tag_and_emission: tag_and_emission[1])

In [17]:
def writeTagsForDev(emissions, input_path='ner_dev.dat', output_path='4_2.txt'):
    """Tag each word of the input file with its highest-emission tag.

    For every non-blank input line the first whitespace-separated token is
    taken as the word, and one line ``<word> <tag> <log emission>`` is
    written. A blank input line is reproduced as a blank output line.

    Args:
        emissions: dict of dicts where ``emissions[word][tag]`` holds
            e(word | tag). Words that are not keys of this dict are looked up
            under the '_RARE_' key instead.
        input_path: file to read the words from. Defaults to 'ner_dev.dat'.
        output_path: file to write the tagged words to; it is overwritten.
            Defaults to '4_2.txt'.

    Raises:
        KeyError: if an unseen word is met and ``emissions`` has no '_RARE_'
            entry.
    """
    with open(input_path) as f_input, open(output_path, 'w') as f_output:
        for line in f_input:
            tokens = line.strip().split()

            # blank line in the input -> blank line in the output
            if not tokens:
                f_output.write('\n')
                continue

            # word is written to the output unchanged
            word = tokens[0]

            # x is the key used to index emissions: the word itself, or
            # _RARE_ when the word has no entry of its own
            x = '_RARE_' if isRare(emissions, word) else word

            tag, max_emission = findTagAndMaxEmission(emissions, x)
            log_max_emission = np.log(max_emission)

            f_output.write(' '.join([word,
                                     tag,
                                     '{}'.format(log_max_emission)]) + '\n')
                    

In [18]:
# Build the emission table from the counts file, then write the tagged
# dev set using the highest-emission tag for each word.
emissions = computeEmissions()
writeTagsForDev(emissions)