# Part 2

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

### Get Data

In [2]:
# Root folder of the datasets; every dataset lives in its own sub-folder.
ds_fd = Path('./dataset/')

# Training file of each dataset, expressed relative to the root folder.
AL_train = ds_fd.joinpath('AL/train')
EN_train = ds_fd.joinpath('EN/train')
SG_train = ds_fd.joinpath('SG/train')
CN_train = ds_fd.joinpath('CN/train')

In [3]:
def read_data(path):
    """Read a labelled file into parallel lists of word and tag sequences.

    Every non-blank line holds a token and its tag separated by a space;
    a blank line ends the current sentence.

    Args:
        path: location of the labelled file (anything ``open`` accepts).

    Returns:
        A tuple ``(X, Y)`` where ``X[i]`` is the list of tokens of sentence
        ``i`` and ``Y[i]`` is the list of tags aligned with it. Empty
        sentences (consecutive blank lines) are not emitted.

    Raises:
        ValueError: if a non-blank line contains no space, i.e. no tag.
    """
    X, Y = [], []
    words, tags = [], []
    # The encoding is pinned so that non-ASCII corpora are read the same way
    # on every platform instead of depending on the locale default.
    with open(path, encoding='utf-8') as f:
        for line_no, raw in enumerate(f, 1):
            line = raw.rstrip('\r\n')
            if line:
                # Split on the LAST space so a token that itself contains a
                # space is kept whole (assumes tags never contain spaces).
                word, sep, tag = line.rpartition(' ')
                if not sep:
                    raise ValueError(
                        'line %d of %s has no tag: %r' % (line_no, path, line))
                words.append(word)
                tags.append(tag)
            elif words:
                X.append(words)
                Y.append(tags)
                words, tags = [], []
    # Flush the last sentence when the file does not end with a blank line.
    if words:
        X.append(words)
        Y.append(tags)
    return X, Y

In [4]:
def get_unique_tags(Y):
    """Return the distinct tags found in ``Y`` as a sorted list.

    Args:
        Y: iterable of tag sequences (one sequence per sentence).

    Returns:
        A list with each tag exactly once, in sorted order. Sorting makes
        the order, and therefore any index mapping built from it,
        reproducible across interpreter runs; a plain ``list(set(...))``
        can change order between runs because of string hash randomisation.
    """
    return sorted({tag for sentence in Y for tag in sentence})

In [5]:
def get_unique_words(X):
    """Return the distinct words found in ``X`` as a sorted list.

    Args:
        X: iterable of word sequences (one sequence per sentence).

    Returns:
        A list with each word exactly once, in sorted order. Sorting makes
        the order, and therefore any index mapping built from it,
        reproducible across interpreter runs; a plain ``list(set(...))``
        can change order between runs because of string hash randomisation.
    """
    return sorted({word for sentence in X for word in sentence})

### Find Emission Parameters

The emission matrix has shape (number of tags, vocabulary size): each row corresponds to a tag and each column to a word.

In [6]:
def get_emission_para(X, Y):
    """Estimate emission probabilities from tagged sentences.

    Args:
        X: list of word sequences.
        Y: list of tag sequences aligned with ``X``.

    Returns:
        A 2-D array with one row per tag and one column per word, where
        entry ``[t, w]`` is count(t emits w) / count(t). Rows follow the
        order of ``get_unique_tags(Y)`` and columns the order of
        ``get_unique_words(X)``.
    """
    flat_tags = [t for sublist in Y for t in sublist]
    flat_words = [w for sublist in X for w in sublist]

    words = get_unique_words(X)
    tags = get_unique_tags(Y)

    word2index = {w: i for i, w in enumerate(words)}
    tag2index = {t: i for i, t in enumerate(tags)}

    counts = np.zeros((len(tags), len(words)))

    # Pair each word with the tag at the same position. The previous version
    # indexed the words with an undefined name and raised NameError.
    for word, tag in zip(flat_words, flat_tags):
        counts[tag2index[tag], word2index[word]] += 1

    # Normalise every row by the total number of emissions of that tag.
    emission_prob = counts / np.sum(counts, 1)[:, None]

    return emission_prob

### Find Emission Parameters by Smoothing

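This builds on `get_emission_para()`.

First, sum the counts of all words that appear fewer than K times in the training set and use that sum as the counts for `#UNK#`.

Second, delete the columns of those rare words from the count matrix.

Third, update the word-to-index dictionary so that it matches the remaining columns.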

In [7]:
def get_emission_para_smoothing(X, Y, K):
    """Estimate emission probabilities with rare words folded into '#UNK#'.

    Words whose total count in the training data is below ``K`` are removed
    from the vocabulary and their counts are accumulated in a single
    '#UNK#' column placed last.

    Args:
        X: list of word sequences.
        Y: list of tag sequences aligned with ``X``.
        K: minimum number of occurrences for a word to keep its own column.

    Returns:
        A tuple ``(emission_prob, word2index, tag2index, index2tag)``:
        ``emission_prob`` has one row per tag and one column per kept word
        plus the final '#UNK#' column; ``word2index`` maps each kept word
        and '#UNK#' to its column; ``tag2index`` / ``index2tag`` map tags to
        rows and back.
    """
    flat_tags = [t for sublist in Y for t in sublist]
    flat_words = [w for sublist in X for w in sublist]

    words = get_unique_words(X)
    tags = get_unique_tags(Y)

    word2index = {w: i for i, w in enumerate(words)}
    tag2index = {t: i for i, t in enumerate(tags)}
    index2tag = {i: t for i, t in enumerate(tags)}

    counts = np.zeros((len(tags), len(words)))

    for word, tag in zip(flat_words, flat_tags):
        counts[tag2index[tag], word2index[word]] += 1

    # Boolean mask over the columns: True for words seen fewer than K times.
    rare = np.sum(counts, 0) < K

    # The '#UNK#' column is always created, even when no word is rare (it is
    # then all zeros), because the decoders look up word2index['#UNK#']
    # for every unseen word and would otherwise fail with KeyError.
    unk_counts = np.sum(counts[:, rare], 1)[:, None]
    counts = np.hstack([counts[:, ~rare], unk_counts])

    kept_words = [w for w, is_rare in zip(words, rare) if not is_rare]
    word2index = {w: i for i, w in enumerate(kept_words)}
    word2index['#UNK#'] = len(kept_words)

    # Normalise every row by the total number of emissions of that tag.
    emission_prob = counts / np.sum(counts, 1)[:, None]

    return emission_prob, word2index, tag2index, index2tag

### Naive Approach

In [8]:
def naive_evaluation(word, emission_prob, word2index, index2tag):
    """Tag a single word with the tag that has the highest emission probability.

    Args:
        word: the token to tag.
        emission_prob: 2-D array indexed as ``[tag_index, word_index]``.
        word2index: mapping from known words (and '#UNK#') to column index.
        index2tag: mapping from row index to tag.

    Returns:
        The tag whose row holds the largest value in the column of ``word``,
        or in the '#UNK#' column when ``word`` is not in ``word2index``.
    """
    # Membership is tested against word2index itself; the previous version
    # consulted an undefined global named `vocab`.
    if word in word2index:
        column = word2index[word]
    else:
        column = word2index['#UNK#']
    return index2tag[int(np.argmax(emission_prob[:, column]))]

In [9]:
# Unlabelled development input of each dataset, relative to the dataset root.
AL_dev_in = ds_fd.joinpath('AL/dev.in')
EN_dev_in = ds_fd.joinpath('EN/dev.in')
SG_dev_in = ds_fd.joinpath('SG/dev.in')
CN_dev_in = ds_fd.joinpath('CN/dev.in')

In [10]:
def naive_prediction_output(ds, k=3):
    """Tag a dataset's dev.in word by word and write the result to dev.p2.out.

    Emission parameters are estimated from ``<ds>/train`` with rare-word
    smoothing, then every word of ``<ds>/dev.in`` is tagged independently
    with ``naive_evaluation``.

    Args:
        ds: name of the dataset sub-folder under ``ds_fd`` (e.g. 'EN').
        k: rare-word threshold passed to ``get_emission_para_smoothing``;
            defaults to 3, the value that used to be hard-coded.

    Returns:
        The full text written to ``<ds>/dev.p2.out``: one "word tag" line
        per input word, with the blank lines of the input preserved.
    """
    train = ds_fd/(ds + '/train')
    dev_in = ds_fd/(ds + '/dev.in')

    X, Y = read_data(train)
    emission_prob, word2index, tag2index, index2tag = get_emission_para_smoothing(X, Y, k)

    # The encoding is pinned so non-ASCII corpora do not depend on the
    # platform's locale default.
    with open(dev_in, encoding='utf-8') as f:
        input_data = f.readlines()

    # Collect the output lines and join once instead of growing a string.
    pieces = []
    for instance in input_data:
        if instance != '\n':
            word = instance.strip('\n')
            tag = naive_evaluation(word, emission_prob, word2index, index2tag)
            pieces.append(word + ' ' + tag + '\n')
        else:
            pieces.append('\n')
    output_string = ''.join(pieces)

    dev_out = ds_fd/(ds + '/dev.p2.out')
    with open(dev_out, 'w', encoding='utf-8') as f:
        f.write(output_string)

    print('Done with writing predictions')

    return output_string

### Find Transition Parameters

In [11]:
def get_transition_para(Y):
    """Estimate tag-to-tag transition probabilities from tag sequences.

    Args:
        Y: list of tag sequences (one per sentence).

    Returns:
        A square array with ``len(tags) + 1`` rows and columns. Rows and
        columns ``0 .. len(tags) - 1`` follow the order of
        ``get_unique_tags(Y)``. The extra last ROW counts transitions from
        the start of a sentence into its first tag; the extra last COLUMN
        counts transitions from the last tag of a sentence to its end.
        Every row is normalised by its own total.
    """
    tags = get_unique_tags(Y)
    tag2index = {t: i for i, t in enumerate(tags)}

    n_tags = len(tags)
    start_row = n_tags   # same position as index -1 of the rows
    stop_col = n_tags    # same position as index -1 of the columns
    counts = np.zeros((n_tags + 1, n_tags + 1))

    for instance in Y:
        # An empty sequence has no first/last tag; skip it rather than
        # failing with IndexError.
        if not instance:
            continue
        indices = [tag2index[t] for t in instance]
        counts[start_row, indices[0]] += 1
        for prev_tag, next_tag in zip(indices, indices[1:]):
            counts[prev_tag, next_tag] += 1
        counts[indices[-1], stop_col] += 1

    transition_prob = counts / np.sum(counts, 1)[:, None]

    return transition_prob

In [12]:
def viterbi_algo(sentence, emission_prob, transition_prob, word2index, tag2index, index2tag):
    """Decode the most likely tag sequence with Viterbi (probability space).

    NOTE(review): a later cell of this notebook defines ``viterbi_algo``
    again with the same signature (log-space scores); when the cells run in
    order that later definition replaces this one.

    Args:
        sentence: list of words to tag.
        emission_prob: array indexed as ``[tag_index, word_index]``.
        transition_prob: square array with one more row/column than there
            are tags; the last row is read as the transition from the
            sentence start, the last column as the transition to its end.
        word2index: mapping from known words (and "#UNK#") to column index.
        tag2index: mapping from tag to index; only its size is used here.
        index2tag: mapping from tag index to tag.

    Returns:
        A list with one tag per word of ``sentence``; ``[]`` for an empty
        sentence.
    """
    if len(sentence) == 0:
        return []

    n_tags = len(tag2index)
    n_words = len(sentence)

    score_matrix = np.zeros((n_tags, n_words))
    path_matrix = np.zeros((n_tags, n_words), dtype=int)

    # Emission column of every position; unseen words use the "#UNK#" column.
    columns = [word2index[w] if w in word2index else word2index["#UNK#"]
               for w in sentence]

    start_to_tag = transition_prob[-1, :n_tags]
    tag_to_tag = transition_prob[:-1, :n_tags]
    tag_to_stop = transition_prob[:-1, -1]

    score_matrix[:, 0] = start_to_tag * emission_prob[:, columns[0]]

    for i in range(1, n_words):
        # competitors[p, j]: best score ending in tag p at i-1, extended by
        # the transition p -> j and the emission of word i from tag j.
        competitors = (score_matrix[:, i-1][:, None] * tag_to_tag) * emission_prob[:, columns[i]][None, :]
        path_matrix[:, i] = np.argmax(competitors, axis=0)
        score_matrix[:, i] = np.max(competitors, axis=0)

    # Termination is evaluated once, after the last position is filled.
    last_parent_node = int(np.argmax(score_matrix[:, -1] * tag_to_stop))

    # Follow the back-pointers from the last position to the first.
    path = [last_parent_node]
    for m in range(n_words - 1, 0, -1):
        path.append(int(path_matrix[path[-1], m]))
    path.reverse()

    return [index2tag[n] for n in path]

In [13]:
def viterbi_algo(sentence, emission_prob, transition_prob, word2index, tag2index, index2tag):
    """Decode the most likely tag sequence with Viterbi (log space).

    Scores are sums of log-probabilities rather than products of
    probabilities.

    Args:
        sentence: list of words to tag.
        emission_prob: array of probabilities indexed as
            ``[tag_index, word_index]``.
        transition_prob: square array of probabilities with one more
            row/column than there are tags; the last row is read as the
            transition from the sentence start, the last column as the
            transition to its end.
        word2index: mapping from known words (and "#UNK#") to column index.
        tag2index: mapping from tag to index; only its size is used here.
        index2tag: mapping from tag index to tag.

    Returns:
        A list with one tag per word of ``sentence``; ``[]`` for an empty
        sentence.
    """
    if len(sentence) == 0:
        return []

    # Zero probabilities become -inf, which is the intended "impossible"
    # score; silence the divide-by-zero warning np.log emits for them.
    with np.errstate(divide='ignore'):
        transition_prob = np.log(transition_prob)
        emission_prob = np.log(emission_prob)

    n_tags = len(tag2index)
    n_words = len(sentence)

    score_matrix = np.zeros((n_tags, n_words))
    path_matrix = np.zeros((n_tags, n_words), dtype=int)

    # Emission column of every position; unseen words use the "#UNK#" column.
    columns = [word2index[w] if w in word2index else word2index["#UNK#"]
               for w in sentence]

    start_to_tag = transition_prob[-1, :n_tags]
    tag_to_tag = transition_prob[:-1, :n_tags]
    tag_to_stop = transition_prob[:-1, -1]

    score_matrix[:, 0] = start_to_tag + emission_prob[:, columns[0]]

    for i in range(1, n_words):
        # competitors[p, j]: best log-score ending in tag p at i-1, plus the
        # transition p -> j and the emission of word i from tag j.
        competitors = (score_matrix[:, i-1][:, None] + tag_to_tag) + emission_prob[:, columns[i]][None, :]
        path_matrix[:, i] = np.argmax(competitors, axis=0)
        score_matrix[:, i] = np.max(competitors, axis=0)

    # Termination is evaluated once, after the last position is filled.
    last_parent_node = int(np.argmax(score_matrix[:, -1] + tag_to_stop))

    # Follow the back-pointers from the last position to the first.
    path = [last_parent_node]
    for m in range(n_words - 1, 0, -1):
        path.append(int(path_matrix[path[-1], m]))
    path.reverse()

    return [index2tag[n] for n in path]

In [18]:
# Run the evaluation script on the AL predictions (second argument) against dataset/AL/dev.out (first argument).
# NOTE(review): dataset/AL/dev.p3.out is the file viterbi_output('AL') writes, so that call has to run before this cell.
! python EvalScript/evalResult.py dataset/AL/dev.out dataset/AL/dev.p3.out


#Entity in gold data: 8408
#Entity in prediction: 8455

#Correct Entity : 6708
Entity  precision: 0.7934
Entity  recall: 0.7978
Entity  F: 0.7956

#Correct Sentiment : 6059
Sentiment  precision: 0.7166
Sentiment  recall: 0.7206
Sentiment  F: 0.7186


In [19]:
# Run the evaluation script on the EN predictions (second argument) against dataset/EN/dev.out (first argument).
# NOTE(review): dataset/EN/dev.p3.out is the file viterbi_output('EN') writes, so that call has to run before this cell.
! python EvalScript/evalResult.py dataset/EN/dev.out dataset/EN/dev.p3.out


#Entity in gold data: 13179
#Entity in prediction: 12713

#Correct Entity : 10784
Entity  precision: 0.8483
Entity  recall: 0.8183
Entity  F: 0.8330

#Correct Sentiment : 10372
Sentiment  precision: 0.8159
Sentiment  recall: 0.7870
Sentiment  F: 0.8012


In [20]:
def viterbi_output(ds, k=3):
    """Tag a dataset's dev.in with Viterbi and write the result to dev.p3.out.

    Emission and transition parameters are estimated from ``<ds>/train``;
    every sentence of ``<ds>/dev.in`` is then decoded with ``viterbi_algo``.

    Args:
        ds: name of the dataset sub-folder under ``ds_fd`` (e.g. 'EN').
        k: rare-word threshold passed to ``get_emission_para_smoothing``;
            defaults to 3, the value that used to be hard-coded.

    Returns:
        The full text written to ``<ds>/dev.p3.out``: one "word tag" line
        per word, with a blank line after every sentence.
    """
    train = ds_fd/(ds + '/train')
    dev_in = ds_fd/(ds + '/dev.in')

    X, Y = read_data(train)
    emission_prob, word2index, tag2index, index2tag = get_emission_para_smoothing(X, Y, k)
    transition_prob = get_transition_para(Y)
    print(emission_prob.shape)
    print(transition_prob.shape)

    sentences = []
    sentence = []

    # The encoding is pinned so non-ASCII corpora do not depend on the
    # platform's locale default.
    with open(dev_in, encoding='utf-8') as f:
        for line in f:
            if line != '\n':
                sentence.append(line.strip('\n'))
            else:
                sentences.append(sentence)
                sentence = []
    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)

    # An empty sentence (two consecutive blank lines) has nothing to decode;
    # it still yields its blank output line below.
    tags = [
        viterbi_algo(words, emission_prob, transition_prob, word2index, tag2index, index2tag)
        if words else []
        for words in sentences
    ]

    # Collect the output lines and join once instead of growing a string.
    pieces = []
    for words, word_tags in zip(sentences, tags):
        for word, tag in zip(words, word_tags):
            pieces.append(word + ' ' + tag + '\n')
        pieces.append('\n')
    output_string = ''.join(pieces)

    dev_out = ds_fd/(ds + '/dev.p3.out')
    with open(dev_out, 'w', encoding='utf-8') as f:
        f.write(output_string)

    print('Done with writing predictions')

    return output_string

In [21]:
# Decode the AL dev set; this writes dataset/AL/dev.p3.out and binds the returned output text to `_`.
_ = viterbi_output('AL')

(42, 2698)
(43, 43)


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Done with writing predictions


'杭 B-CITY\n州 I-CITY\n市 I-CITY\n西 B-DISTRICT\n湖 I-DISTRICT\n区 I-DISTRICT\n古 B-TOWN\n荡 I-TOWN\n新 B-COMMUNITY\n村 I-COMMUNITY\n西 I-COMMUNITY\n34 B-HOUSENO\n幢 I-HOUSENO\n八 B-CELLNO\n单 I-CELLNO\n元 I-CELLNO\n1375 B-ROOMNO\n\n塘 B-TOWN\n雅 I-TOWN\n镇 I-TOWN\n顶 B-POI\n塘 I-POI\n工 I-POI\n业 I-POI\n园 I-POI\n鑫 I-POI\n龙 I-POI\n水 I-POI\n准 I-POI\n仪 I-POI\n器 I-POI\n有 I-POI\n限 I-POI\n公 I-POI\n司 I-POI\n\n中 B-COUNTRY\n国 I-COUNTRY\n浙 B-PROV\n江 I-PROV\n省 I-PROV\n温 B-CITY\n州 I-CITY\n市 I-CITY\n鹿 B-DISTRICT\n城 I-DISTRICT\n区 I-DISTRICT\n百 B-TOWN\n里 I-TOWN\n东 B-ROAD\n路 I-ROAD\n47 B-ROADNO\n号 I-ROADNO\n\n浙 B-PROV\n江 I-PROV\n省 I-PROV\n宁 B-CITY\n波 I-CITY\n市 I-CITY\n宁 B-DISTRICT\n海 I-DISTRICT\n县 I-DISTRICT\n宁 I-DISTRICT\n海 I-DISTRICT\n县 I-DISTRICT\n_ B-REDUNDANT\n长 B-TOWN\n街 I-TOWN\n镇 I-TOWN\n_ B-REDUNDANT\n手 B-PERSON\n机 I-PERSON\n号 I-PERSON\n\n浙 B-PROV\n江 I-PROV\n省 I-PROV\n杭 B-CITY\n州 I-CITY\n余 B-DISTRICT\n杭 I-DISTRICT\n区 I-DISTRICT\n乔 B-TOWN\n司 I-TOWN\n街 I-TOWN\n道 I-TOWN\n和 B-COMMUNITY\n睦 I-COMMUNITY\n村 I-COMMUNITY\n8

In [17]:
# Decode the EN dev set; this writes dataset/EN/dev.p3.out and binds the returned output text to `_`.
_ = viterbi_output('EN')

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Done with writing predictions


"HBO B-NP\nhas B-VP\nclose I-VP\nto B-PP\n24 B-NP\nmillion I-NP\nsubscribers I-NP\nto B-PP\nits B-NP\nHBO I-NP\nand O\nCinemax B-NP\nnetworks I-NP\n, O\nwhile B-SBAR\nShowtime B-NP\nand O\nits B-NP\nsister I-NP\nservice I-NP\n, O\nThe B-NP\nMovie I-NP\nChannel I-NP\n, O\nhave B-VP\nonly I-VP\nabout B-PP\n10 B-NP\nmillion I-NP\n, O\naccording B-PP\nto B-PP\nPaul B-NP\nKagan I-NP\nAssociates I-NP\n, O\na B-NP\nCarmel I-NP\n, O\nCalif. B-NP\n, O\nresearch B-NP\nfirm I-NP\n. O\n\nWASHINGTON B-NP\nLIES I-NP\nLOW I-NP\nafter B-PP\nthe B-NP\nstock I-NP\nmarket I-NP\n's B-NP\nroller-coaster I-NP\nride I-NP\n. O\n\nThis B-NP\nmay B-VP\nseem I-VP\nto I-VP\nbe I-VP\na B-NP\npreposterous I-NP\nand O\nutterly B-NP\nfutile I-NP\neffort I-NP\nin I-NP\nAfrica I-NP\n. O\n\nAmerican B-NP\nExpress I-NP\nBank I-NP\nearnings I-NP\nfell B-VP\n50 B-NP\n% I-NP\nto B-PP\n$ B-NP\n21.3 I-NP\nmillion I-NP\nfrom B-PP\n$ B-NP\n42.5 I-NP\nmillion I-NP\ndespite B-PP\na B-NP\n29 I-NP\n% I-NP\nrevenue I-NP\ngain I-NP\n