In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')

### Get data

In [2]:
DATA_FOLDER = Path('./dataset/')
AL = DATA_FOLDER/'AL'
AL_train = AL/'train'
AL_dev_x = AL/'dev.in'
AL_dev_y = AL/'dev.out'
AL_out_2 = AL/'dev.p2.out'
AL_out_3 = AL/'dev.p3.out'
AL_out_4 = AL/'dev.p4.out'

EN = DATA_FOLDER/'EN'
EN_train = EN/'train'
EN_dev_x = EN/'dev.in'
EN_dev_y = EN/'dev.out'
EN_out_2 = EN/'dev.p2.out'
EN_out_3 = EN/'dev.p3.out'
EN_out_4 = EN/'dev.p4.out'

CN = DATA_FOLDER/'CN'
CN_train = CN/'train'
CN_dev_x = CN/'dev.in'
CN_dev_y = CN/'dev.out'
CN_out_2 = CN/'dev.p2.out'
CN_out_3 = CN/'dev.p3.out'

SG = DATA_FOLDER/'SG'
SG_train = SG/'train'
SG_dev_x = SG/'dev.in'
SG_dev_y = SG/'dev.out'
SG_out_2 = SG/'dev.p2.out'
SG_out_3 = SG/'dev.p3.out'


EVAL_script = './EvalScript/evalResult.py'

In [3]:
def get_train_data(filename):
    with open(filename) as f:
        lines = f.readlines()
    x, y = [], []
    temp_x, temp_y = [], []
    for l in lines:
        if len(l) == 1:
            assert(len(temp_x) == len(temp_y))
            x.append(temp_x)
            y.append(temp_y)
            temp_x, temp_y = [], []
            continue
        xx, yy = l.split()
        temp_x.append(xx)
        temp_y.append(yy)
    if len(temp_x) != 0:
        x.append(temp_x)
        y.append(temp_y)
    assert(len(x) == len(y))
    
    return x, y

def get_test_data(filename, word2index):
    """Return:
                x: nested list of string
                x_int: nested list of integer"""
    with open(filename) as f:
        lines = f.readlines()
    x = []
    temp_x = []
    for l in lines:
        if len(l.strip()) == 0:
            x.append(temp_x)
            temp_x = []
            continue
        xx = l.split()
        temp_x.append(xx[0])
    if len(temp_x) != 0:
        x.append(temp_x)
    x_int = [[word2index[oo] for oo in o] for o in x ]
    return x, x_int

In [4]:
words, labels = get_train_data(AL_train)
vocab = list(set([oo for o in words for oo in o]))
tags = list(set([oo for o in labels for oo in o])) + ['SOS', 'EOS']
word2index = {o:i for i,o in enumerate(vocab)}
index2word = {i:o for i,o in enumerate(vocab)}
tag2index = {o:i for i,o in enumerate(tags)}
x = [[word2index[oo] for oo in o] for o in words]
y = [[tag2index[oo] for oo in o] for o in labels]

### part 1 Emission features

In [5]:
def emission(x, y, vocab, tags):
    emission = np.zeros((len(vocab), len(tags)))
    flat_y = [oo for o in y for oo in o]
    flat_x = [oo for o in x for oo in o]
    for xx, yy in zip(flat_x,flat_y):
        emission[xx, yy] += 1
    
    y_count = np.zeros(len(tags))
    for yy in flat_y:
        y_count[yy] += 1
    emission = emission/ y_count[None, :]
    np.nan_to_num(emission, 0)
    return emission

### Part 2 Adding smoothing parameters

In [6]:
from collections import Counter, defaultdict
vocab_count = Counter([oo for o in words for oo in o])
vocab = [o for o, v in dict(vocab_count).items() if v>=3] + ['#UNK#']
word2index = defaultdict(int)
for i,o in enumerate(vocab): word2index[o] = i+1
x = [ [word2index[oo] for oo in o] for o in words]

### Emission decoding

In [7]:
def decoding(x, emission_matrix):
    """emission matrix: (vocab_size, tag_size)
    x: converted to integer arrays"""
    return emission_matrix[x].argmax(axis=1)

def batch_decoding(output_filename, dev_x_filename, word2index, emission_matrix, tags):
    with open(output_filename, 'w') as f:
        words, dev_x = get_test_data(dev_x_filename, word2index)
        for ws,o in zip(words, dev_x):
            path = decoding(o, emission_matrix)
            for w, p in zip(ws, path):
                f.write(w + ' ' + tags[p] + '\n')
            f.write('\n')

### Grab everythings together

In [8]:
def main(train_file, output_filename, dev_x_filename):
    # read data
    words, labels = get_train_data(train_file)
    # create vocab
    tags = list(set([oo for o in labels for oo in o])) + ['SOS', 'EOS']
    tag2index = {o:i for i,o in enumerate(tags)}
    vocab_count = Counter([oo for o in words for oo in o])
    vocab = [o for o, v in dict(vocab_count).items() if v>=3] + ['#UNK#']
    word2index = defaultdict(int)
    for i,o in enumerate(vocab): word2index[o] = i+1
    # text to int
    x = [[word2index[oo] for oo in o] for o in words]
    y = [[tag2index[oo] for oo in o] for o in labels]
    # training
    emission_matrix = emission(x, y, vocab, tags)
    # decoding
    batch_decoding(output_filename, dev_x_filename, word2index, emission_matrix, tags)

## test model performance

##### AL

In [9]:
main(AL_train, AL_out_2, AL_dev_x)
! python {EVAL_script} {AL_dev_y} {AL_out_2}


#Entity in gold data: 8408
#Entity in prediction: 19484

#Correct Entity : 2898
Entity  precision: 0.1487
Entity  recall: 0.3447
Entity  F: 0.2078

#Correct Sentiment : 2457
Sentiment  precision: 0.1261
Sentiment  recall: 0.2922
Sentiment  F: 0.1762


#### SG

In [10]:
main(SG_train, SG_out_2, SG_dev_x)
! python {EVAL_script} {SG_dev_y} {SG_out_2}


#Entity in gold data: 4537
#Entity in prediction: 18451

#Correct Entity : 2632
Entity  precision: 0.1426
Entity  recall: 0.5801
Entity  F: 0.2290

#Correct Sentiment : 1239
Sentiment  precision: 0.0672
Sentiment  recall: 0.2731
Sentiment  F: 0.1078


#### EN

In [11]:
main(EN_train, EN_out_2, EN_dev_x)
! python {EVAL_script} {EN_dev_y} {EN_out_2}


#Entity in gold data: 13179
#Entity in prediction: 19406

#Correct Entity : 9152
Entity  precision: 0.4716
Entity  recall: 0.6944
Entity  F: 0.5617

#Correct Sentiment : 7644
Sentiment  precision: 0.3939
Sentiment  recall: 0.5800
Sentiment  F: 0.4692


#### CN

In [12]:
main(CN_train, CN_out_2, CN_dev_x)
! python {EVAL_script} {CN_dev_y} {CN_out_2}


#Entity in gold data: 1478
#Entity in prediction: 9373

#Correct Entity : 765
Entity  precision: 0.0816
Entity  recall: 0.5176
Entity  F: 0.1410

#Correct Sentiment : 285
Sentiment  precision: 0.0304
Sentiment  recall: 0.1928
Sentiment  F: 0.0525


## Part 3

### Transition

In [93]:
def transition(y, tags, tag2index):
    """
    tags: included 'SOS' and 'EOS'
    transition from: (v1, v2, ..., 'SOS')
    transition to:   (v1, v2, ..., 'EOS')
    rows are transition_from
    cols are transition_to
    """
    transition = np.zeros((len(tags)-1, len(tags)-1))
    for yy in y: 
        transition[-1, yy[0]] += 1 # START transition
        for i in range(len(yy)-1): # tags transition from position 0 to len(yy)-2
            transition[yy[i], yy[i+1]] += 1
        transition[yy[-1], -1] += 1 # STOP transition
    transition = transition/np.sum(transition, axis=1)
#     np.nan_to_num(transition, 0)
    return transition

transition_matrix = transition(y, tags, tag2index)
print("transition_matrix shape (transition_from, transition_to): ", transition_matrix.shape)

transition_matrix shape (transition_from, transition_to):  (43, 43)


### viterbi decode

In [148]:
def viterbi(x, transition_matrix, emission_matrix, tags):
    """transition_matrix: before log
    x: [1, 2, 4, 19, ...]
    transition: after log
    return: 
            path: (len(x), )
            log(max_score)
    """

    score = np.zeros( (len(x)+2, len(tags)-2) )
    argmax = np.zeros( (len(x)+2, len(tags)-2), dtype=np.int)
    transition, emission = np.log(transition_matrix), np.log(emission_matrix)
    score[1, :] = transition[-1, :-1] + emission[x[0], :-2] # initialization at j=1
    for j in range(2, len(x)+1): 
        for t in range(len(tags)-2):
            pi = score[j-1, :]  # (num_of_tags-2,)
            a = transition[:-1, t] # (num_of_tags-2,)
            b = emission[x[j-1], t] # (1,)
            top1 = (pi + a).argsort()[-1]
            argmax[j, t] = top1
            score[j, t] = (pi + a)[top1] + b
    # j=n+1 step
    pi = score[len(x)]
    a = transition[:-1, -1]
    argmax_stop = int( (pi+a).argsort()[-1] )
    max_stop = (pi+a)[argmax_stop]
    argmax = argmax[2:-1]
    # decoding
    path = [argmax_stop]
    temp_index = argmax_stop
    for i in range(len(argmax)-1, -1, -1):
        temp_index = argmax[i, temp_index]
        path.append(temp_index)
    return path[::-1], max_stop


def viterbi_decode_batch(x, transition_matrix, emission_matrix,
                         output_filename, dev_x_filename, word2index, tags):
    with open(output_filename, 'w') as f:
        words, dev_x = get_test_data(dev_x_filename, word2index)
        for i, (ws,o) in enumerate(zip(words, dev_x)):
            path, log_max_score = viterbi(o, transition_matrix, emission_matrix, tags)
            for w, p in zip(ws, path):
                f.write(w + ' ' + tags[p] + '\n')
            f.write('\n')
            
    return 

In [167]:
def q3(train_file, output_filename, dev_x_filename):
    # read data
    words, labels = get_train_data(train_file)
    # create vocab
    tags = list(set([oo for o in labels for oo in o])) + ['SOS', 'EOS']
    tag2index = {o:i for i,o in enumerate(tags)}
    vocab_count = Counter([oo for o in words for oo in o])
    vocab = [o for o, v in dict(vocab_count).items() if v>=3] + ['#UNK#']
    word2index = defaultdict(int)
    for i,o in enumerate(vocab): word2index[o] = i+1
    # text to int
    x = [[word2index[oo] for oo in o] for o in words]
    y = [[tag2index[oo] for oo in o] for o in labels]
    # training emission
    emission_matrix = emission(x, y, vocab, tags)
    # emission_matrix += 1e-5 Adding this smoothing will increase performance
    print("emission_matrix shape:", emission_matrix.shape)
    # training transition
    transition_matrix = transition(y, tags, tag2index)
    print("transition_matrix shape", transition_matrix.shape)
    # decoding
    viterbi_decode_batch(x, transition_matrix, emission_matrix,
                         output_filename, dev_x_filename, word2index, tags)
    return 

## Test model performance

In [168]:
q3(AL_train, AL_out_3, AL_dev_x)
! python {EVAL_script} {AL_dev_y} {AL_out_3}

emission_matrix shape: (2698, 44)
transition_matrix shape (43, 43)

#Entity in gold data: 8408
#Entity in prediction: 8520

#Correct Entity : 6731
Entity  precision: 0.7900
Entity  recall: 0.8005
Entity  F: 0.7953

#Correct Sentiment : 6077
Sentiment  precision: 0.7133
Sentiment  recall: 0.7228
Sentiment  F: 0.7180


In [169]:
q3(EN_train, EN_out_3, EN_dev_x)
! python {EVAL_script} {EN_dev_y} {EN_out_3}

emission_matrix shape: (6187, 23)
transition_matrix shape (22, 22)

#Entity in gold data: 13179
#Entity in prediction: 12724

#Correct Entity : 10785
Entity  precision: 0.8476
Entity  recall: 0.8183
Entity  F: 0.8327

#Correct Sentiment : 10370
Sentiment  precision: 0.8150
Sentiment  recall: 0.7869
Sentiment  F: 0.8007


In [170]:
q3(SG_train, SG_out_3, SG_dev_x)
! python {EVAL_script} {SG_dev_y} {SG_out_3}

emission_matrix shape: (10733, 9)
transition_matrix shape (8, 8)

#Entity in gold data: 4537
#Entity in prediction: 3036

#Correct Entity : 1662
Entity  precision: 0.5474
Entity  recall: 0.3663
Entity  F: 0.4389

#Correct Sentiment : 1035
Sentiment  precision: 0.3409
Sentiment  recall: 0.2281
Sentiment  F: 0.2733


In [171]:
q3(CN_train, CN_out_3, CN_dev_x)
! python {EVAL_script} {CN_dev_y} {CN_out_3}

emission_matrix shape: (7364, 9)
transition_matrix shape (8, 8)

#Entity in gold data: 1478
#Entity in prediction: 769

#Correct Entity : 309
Entity  precision: 0.4018
Entity  recall: 0.2091
Entity  F: 0.2750

#Correct Sentiment : 210
Sentiment  precision: 0.2731
Sentiment  recall: 0.1421
Sentiment  F: 0.1869


## Part 4 Top 7th sequences

In [182]:
def viterbi_top_k(x, transition_matrix, emission_matrix, tags, k=7):
    """transition_matrix: before log
    x: [1, 2, 4, 19, ...]
    transition: after log
    time complexity: O(knt^2)
    return: 
            path: (len(x), )
            log(max_score)
    """

    score = np.zeros( (len(x)+2, len(tags)-2, 7) )
    argmax = np.zeros( (len(x)+2, len(tags)-2, 7), dtype=np.int)
    transition, emission = np.log(transition_matrix), np.log(emission_matrix)
    # initialization at j=1
    score[1, :] = (transition[-1, :-1] + emission[x[0], :-2])[:, None] 
    for j in range(2, len(x)+1): 
        for t in range(len(tags)-2):
            pi = score[j-1, :]  # (num_of_tags-2, 7)
            a = transition[:-1, t] # (num_of_tags-2,)
            b = emission[x[j-1], t] # (1,)
            previous_all_scores = (pi + a[:,None]).flatten()
            topk = previous_all_scores.argsort()[-k:][::-1] # big to small
            argmax[j, t] = topk // 7
            score[j, t] = previous_all_scores[topk] + b
            
    # j=n+1 step
    pi = score[len(x)] # (num_of_tags-2, 7)
    a = transition[:-1, -1]
    argmax_stop_k = (pi + a[:,None]).flatten().argsort()[-k:][::-1][-1]//7 # big to small
    max_stop = np.max(pi+a[:,None])
    argmax = argmax[2:-1] # (len(x)-1, num_of_tags-2, 7)
    
    # decoding
    path = [argmax_stop_k]
    temp_index = argmax_stop_k
    for i in range(len(argmax)-1, -1, -1):
        temp_index = argmax[i, temp_index, 0]
        path.append(temp_index)
    return path[::-1], max_stop

def viterbi_decode_batch(x, transition_matrix, emission_matrix,
                         output_filename, dev_x_filename, word2index, tags):
    with open(output_filename, 'w') as f:
        words, dev_x = get_test_data(dev_x_filename, word2index)
        for i, (ws,o) in enumerate(zip(words, dev_x)):
            path, log_max_score = viterbi_top_k(o, transition_matrix, emission_matrix, tags)
            for w, p in zip(ws, path):
                f.write(w + ' ' + tags[p] + '\n')
            f.write('\n')
            
    return 

### Grab everything together

In [183]:
def q4(train_file, output_filename, dev_x_filename):
    # read data
    words, labels = get_train_data(train_file)
    # create vocab
    tags = list(set([oo for o in labels for oo in o])) + ['SOS', 'EOS']
    tag2index = {o:i for i,o in enumerate(tags)}
    vocab_count = Counter([oo for o in words for oo in o])
    vocab = [o for o, v in dict(vocab_count).items() if v>=3] + ['#UNK#']
    word2index = defaultdict(int)
    for i,o in enumerate(vocab): word2index[o] = i+1
    # text to int
    x = [[word2index[oo] for oo in o] for o in words]
    y = [[tag2index[oo] for oo in o] for o in labels]
    # training emission
    emission_matrix = emission(x, y, vocab, tags)
    # emission_matrix += 1e-5 Adding this smoothing will increase performance
    print("emission_matrix shape:", emission_matrix.shape)
    # training transition
    transition_matrix = transition(y, tags, tag2index)
    print("transition_matrix shape", transition_matrix.shape)
    # decoding
    viterbi_decode_batch(x, transition_matrix, emission_matrix,
                         output_filename, dev_x_filename, word2index, tags)
    return 

## Test model performance

In [184]:
q4(AL_train, AL_out_4, AL_dev_x)
! python {EVAL_script} {AL_dev_y} {AL_out_4}

emission_matrix shape: (2698, 44)
transition_matrix shape (43, 43)

#Entity in gold data: 8408
#Entity in prediction: 8520

#Correct Entity : 6731
Entity  precision: 0.7900
Entity  recall: 0.8005
Entity  F: 0.7953

#Correct Sentiment : 6077
Sentiment  precision: 0.7133
Sentiment  recall: 0.7228
Sentiment  F: 0.7180


In [185]:
q4(EN_train, EN_out_4, EN_dev_x)
! python {EVAL_script} {EN_dev_y} {EN_out_4}

emission_matrix shape: (6187, 23)
transition_matrix shape (22, 22)

#Entity in gold data: 13179
#Entity in prediction: 12761

#Correct Entity : 10785
Entity  precision: 0.8452
Entity  recall: 0.8183
Entity  F: 0.8315

#Correct Sentiment : 10370
Sentiment  precision: 0.8126
Sentiment  recall: 0.7869
Sentiment  F: 0.7995
