In [4]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')

### Get data

In [7]:
DATA_FOLDER = Path('./dataset/')
AL = DATA_FOLDER/'AL'
AL_train = AL/'train'
AL_dev_x = AL/'dev.in'
AL_dev_y = AL/'dev.out'
AL_out_2 = AL/'dev.p2.out'
AL_out_3 = AL/'dev.p3.out'
AL_out_4 = AL/'dev.p4.out'

EN = DATA_FOLDER/'EN'
EN_train = EN/'train'
EN_dev_x = EN/'dev.in'
EN_dev_y = EN/'dev.out'
EN_out_2 = EN/'dev.p2.out'
EN_out_3 = EN/'dev.p3.out'
EN_out_4 = EN/'dev.p4.out'

CN = DATA_FOLDER/'CN'
CN_train = CN/'train'
CN_dev_x = CN/'dev.in'
CN_dev_y = CN/'dev.out'
CN_out_2 = CN/'dev.p2.out'
CN_out_3 = CN/'dev.p3.out'

SG = DATA_FOLDER/'SG'
SG_train = SG/'train'
SG_dev_x = SG/'dev.in'
SG_dev_y = SG/'dev.out'
SG_out_2 = SG/'dev.p2.out'
SG_out_3 = SG/'dev.p3.out'


EVAL_script = './EvalScript/evalResult.py'

In [8]:
def get_train_data(filename):
    with open(filename) as f:
        lines = f.readlines()
    x, y = [], []
    temp_x, temp_y = [], []
    for l in lines:
        if len(l) == 1:
            assert(len(temp_x) == len(temp_y))
            x.append(temp_x)
            y.append(temp_y)
            temp_x, temp_y = [], []
            continue
        xx, yy = l.split()
        temp_x.append(xx)
        temp_y.append(yy)
    if len(temp_x) != 0:
        x.append(temp_x)
        y.append(temp_y)
    assert(len(x) == len(y))
    
    return x, y

def get_test_data(filename, word2index):
    """Return:
                x: nested list of string
                x_int: nested list of integer"""
    with open(filename) as f:
        lines = f.readlines()
    x = []
    temp_x = []
    for l in lines:
        if len(l.strip()) == 0:
            x.append(temp_x)
            temp_x = []
            continue
        xx = l.split()
        temp_x.append(xx[0])
    if len(temp_x) != 0:
        x.append(temp_x)
    x_int = [[word2index[oo] for oo in o] for o in x ]
    return x, x_int

In [9]:
words, labels = get_train_data(AL_train)
vocab = list(set([oo for o in words for oo in o]))
tags = list(set([oo for o in labels for oo in o])) + ['SOS', 'EOS']
word2index = {o:i for i,o in enumerate(vocab)}
index2word = {i:o for i,o in enumerate(vocab)}
tag2index = {o:i for i,o in enumerate(tags)}
x = [[word2index[oo] for oo in o] for o in words]
y = [[tag2index[oo] for oo in o] for o in labels]

### part 1 Emission features

In [10]:
def emission(x, y, vocab, tags):
    emission = np.zeros((len(vocab), len(tags)))
    flat_y = [oo for o in y for oo in o]
    flat_x = [oo for o in x for oo in o]
    for xx, yy in zip(flat_x,flat_y):
        emission[xx, yy] += 1
    
    y_count = np.zeros(len(tags))
    for yy in flat_y:
        y_count[yy] += 1
    emission = emission/ y_count[None, :]
    np.nan_to_num(emission, 0)
    return emission

### Part 2 Adding smoothing parameters

In [11]:
from collections import Counter, defaultdict
vocab_count = Counter([oo for o in words for oo in o])
vocab = [o for o, v in dict(vocab_count).items() if v>=3] + ['#UNK#']
word2index = defaultdict(int)
for i,o in enumerate(vocab): word2index[o] = i+1
x = [ [word2index[oo] for oo in o] for o in words]

### Emission decoding

In [12]:
def decoding(x, emission_matrix):
    """emission matrix: (vocab_size, tag_size)
    x: converted to integer arrays"""
    return emission_matrix[x].argmax(axis=1)

def batch_decoding(output_filename, dev_x_filename, word2index, emission_matrix, tags):
    with open(output_filename, 'w') as f:
        words, dev_x = get_test_data(dev_x_filename, word2index)
        for ws,o in zip(words, dev_x):
            path = decoding(o, emission_matrix)
            for w, p in zip(ws, path):
                f.write(w + ' ' + tags[p] + '\n')
            f.write('\n')

### Grab everythings together

In [13]:
def main(train_file, output_filename, dev_x_filename):
    # read data
    words, labels = get_train_data(train_file)
    # create vocab
    tags = list(set([oo for o in labels for oo in o])) + ['SOS', 'EOS']
    tag2index = {o:i for i,o in enumerate(tags)}
    vocab_count = Counter([oo for o in words for oo in o])
    vocab = [o for o, v in dict(vocab_count).items() if v>=3] + ['#UNK#']
    word2index = defaultdict(int)
    for i,o in enumerate(vocab): word2index[o] = i+1
    # text to int
    x = [[word2index[oo] for oo in o] for o in words]
    y = [[tag2index[oo] for oo in o] for o in labels]
    # training
    emission_matrix = emission(x, y, vocab, tags)
    # decoding
    batch_decoding(output_filename, dev_x_filename, word2index, emission_matrix, tags)

## test model performance

##### AL

In [9]:
main(AL_train, AL_out_2, AL_dev_x)
! python {EVAL_script} {AL_dev_y} {AL_out_2}


#Entity in gold data: 8408
#Entity in prediction: 19484

#Correct Entity : 2898
Entity  precision: 0.1487
Entity  recall: 0.3447
Entity  F: 0.2078

#Correct Sentiment : 2457
Sentiment  precision: 0.1261
Sentiment  recall: 0.2922
Sentiment  F: 0.1762


#### SG

In [10]:
main(SG_train, SG_out_2, SG_dev_x)
! python {EVAL_script} {SG_dev_y} {SG_out_2}


#Entity in gold data: 4537
#Entity in prediction: 18451

#Correct Entity : 2632
Entity  precision: 0.1426
Entity  recall: 0.5801
Entity  F: 0.2290

#Correct Sentiment : 1239
Sentiment  precision: 0.0672
Sentiment  recall: 0.2731
Sentiment  F: 0.1078


#### EN

In [11]:
main(EN_train, EN_out_2, EN_dev_x)
! python {EVAL_script} {EN_dev_y} {EN_out_2}


#Entity in gold data: 13179
#Entity in prediction: 19406

#Correct Entity : 9152
Entity  precision: 0.4716
Entity  recall: 0.6944
Entity  F: 0.5617

#Correct Sentiment : 7644
Sentiment  precision: 0.3939
Sentiment  recall: 0.5800
Sentiment  F: 0.4692


#### CN

In [12]:
main(CN_train, CN_out_2, CN_dev_x)
! python {EVAL_script} {CN_dev_y} {CN_out_2}


#Entity in gold data: 1478
#Entity in prediction: 9373

#Correct Entity : 765
Entity  precision: 0.0816
Entity  recall: 0.5176
Entity  F: 0.1410

#Correct Sentiment : 285
Sentiment  precision: 0.0304
Sentiment  recall: 0.1928
Sentiment  F: 0.0525


## Part 3

### Transition

In [82]:
def transition(y, tags, tag2index):
    SOS = tag2index['SOS']
    EOS = tag2index['EOS']
    y = [[SOS]+o+[EOS] for o in y]
    transition = np.zeros((len(tags), len(tags)))
    
    for yy in y:
        for i in range(len(yy)-1):
            start = yy[i]
            end = yy[i+1]
            transition[start, end] += 1
    
    transition = transition/np.sum(transition, axis=1)
    np.nan_to_num(transition, 0)
    return transition
transition_matrix = transition(y, tags, tag2index)
print("transition_matrix shape (transition_from, transition_to): ", transition_matrix.shape)

transition_matrix shape (transition_from, transition_to):  (44, 44)


### viterbi decode

In [86]:
def viterbi_score(x, transition_matrix, emission_matrix, sos_index, tags):
    """transition_matrix: before log
    x: [1, 2, 4, 19, ...]
    transition: after log
    score: （len(x)+1, #tags)
    """
    # correct way of initialization
    score = np.zeros( (len(x)+2, len(tags)) )
    score[0, :] = -np.inf
    score[0, sos_index] = 0

    argmax = np.zeros( (len(x)+1, len(tags)), dtype=np.int)
    transition, emission = np.log(transition_matrix), np.log(emission_matrix)
    for step in range(1, len(score)):
        for state in range(len(score[step])):
            previous_score = score[step-1]
            t_score = transition[:, state]
            top1 = (previous_score + t_score).argsort()[-1]
            argmax[step-1, state] = top1
            if step==len(x)+1:
                score[step, state] = (previous_score + t_score)[top1]
            else:
                x_word_index = x[step-1]
                e_score = emission[x_word_index, state]
                score[step, state] = (previous_score + t_score)[top1] + e_score
#     print(score[-1, -1])
    return score[1:,:], argmax

def viterbi_decode_path(argmax, eos_index):
    """
    argmax: (len(x)+1, #tags)
    eos_Index: EOS tag index in tags
    Returns:
            path: (len(x), )"""
    path = []
    temp_arg = eos_index
    for i in range(len(argmax)-1, -1, -1):
        temp_arg = argmax[i, temp_arg]
        path.append(temp_arg)
    return path

def viterbi(x, transition_matrix, emission_matrix, sos_index, eos_index, tags):
    """
    x: list of int
    transition_matrix: (#tags(with eos,sos), #tags(with eos,sos))
    emission_matrix: (#vocab_size,#tags(with eos,sos) )"""
    score, argmax = viterbi_score(x, transition_matrix, emission_matrix, sos_index, tags)
    path = viterbi_decode_path(argmax, eos_index)
    return path[:-1][::-1]

def viterbi_decode_batch(x, transition_matrix, emission_matrix, sos_index, eos_index, 
                         output_filename, dev_x_filename, word2index, tags):
    with open(output_filename, 'w') as f:
        words, dev_x = get_test_data(dev_x_filename, word2index)
        for i, (ws,o) in enumerate(zip(words, dev_x)):
            path = viterbi(o, transition_matrix, emission_matrix, sos_index, eos_index, tags)
            for w, p in zip(ws, path):
                f.write(w + ' ' + tags[p] + '\n')
            f.write('\n')
            
    return 

#### Grab everything together

In [98]:


def q3(train_file, output_filename, dev_x_filename):
    # read data
    words, labels = get_train_data(train_file)
    # create vocab
    tags = list(set([oo for o in labels for oo in o])) + ['SOS', 'EOS']
    tag2index = {o:i for i,o in enumerate(tags)}
    vocab_count = Counter([oo for o in words for oo in o])
    vocab = [o for o, v in dict(vocab_count).items() if v>=3] + ['#UNK#']
    word2index = defaultdict(int)
    for i,o in enumerate(vocab): word2index[o] = i+1
    # text to int
    x = [[word2index[oo] for oo in o] for o in words]
    y = [[tag2index[oo] for oo in o] for o in labels]
    # training emission
    emission_matrix = emission(x, y, vocab, tags)
    emission_matrix += 1e-9
    print("emission_matrix shape:", emission_matrix.shape)
    # training transition
    transition_matrix = transition(y, tags, tag2index)
    print("transition_matrix shape", transition_matrix.shape)
    # decoding
    viterbi_decode_batch(x, transition_matrix, emission_matrix,  tags.index('SOS'), tags.index('EOS'), 
                         output_filename, dev_x_filename, word2index, tags)
    return 

## Test model performance

In [88]:
q3(AL_train, AL_out_3, AL_dev_x)
! python {EVAL_script} {AL_dev_y} {AL_out_3}

emission_matrix shape: (2698, 44)
transition_matrix shape (44, 44)

#Entity in gold data: 8408
#Entity in prediction: 8588

#Correct Entity : 6703
Entity  precision: 0.7805
Entity  recall: 0.7972
Entity  F: 0.7888

#Correct Sentiment : 5993
Sentiment  precision: 0.6978
Sentiment  recall: 0.7128
Sentiment  F: 0.7052


In [99]:
q3(EN_train, EN_out_3, EN_dev_x)
! python {EVAL_script} {EN_dev_y} {EN_out_3}

emission_matrix shape: (6187, 23)
transition_matrix shape (23, 23)

#Entity in gold data: 13179
#Entity in prediction: 13056

#Correct Entity : 11091
Entity  precision: 0.8495
Entity  recall: 0.8416
Entity  F: 0.8455

#Correct Sentiment : 10667
Sentiment  precision: 0.8170
Sentiment  recall: 0.8094
Sentiment  F: 0.8132


In [18]:
q3(SG_train, SG_out_3, SG_dev_x)
! python {EVAL_script} {SG_dev_y} {SG_out_3}


#Entity in gold data: 4537
#Entity in prediction: 4335

#Correct Entity : 1387
Entity  precision: 0.3200
Entity  recall: 0.3057
Entity  F: 0.3127

#Correct Sentiment : 801
Sentiment  precision: 0.1848
Sentiment  recall: 0.1765
Sentiment  F: 0.1806


In [20]:
q3(CN_train, CN_out_3, CN_dev_x)
! python {EVAL_script} {CN_dev_y} {CN_out_3}


#Entity in gold data: 1478
#Entity in prediction: 923

#Correct Entity : 315
Entity  precision: 0.3413
Entity  recall: 0.2131
Entity  F: 0.2624

#Correct Sentiment : 210
Sentiment  precision: 0.2275
Sentiment  recall: 0.1421
Sentiment  F: 0.1749


## Part 4 Top 7th sequences

In [35]:
def viterbi_score_top_k(x, transition_matrix, emission_matrix, sos_index, tags, k=7):
    """transition_matrix: before log
    transition: after log
    score: （len(x)+1, #tags, 7)
    argmax: (len(x)+1, #tags, 7)
    time complexity: O(knt^2)
    """
    # correct way of initialization
    score = np.zeros( (len(x)+2, len(tags), k) )
    score[0, :, :] = -np.inf
    score[0, sos_index, :] = 0
    argmax = np.zeros( (len(x)+1, len(tags), k), dtype=np.int64)
    transition, emission = np.log(transition_matrix), np.log(emission_matrix)
    
    for step in range(1, len(score)):
        for state in range(len(score[step])): # state means tag
            previous_score = score[step-1] # (num_of_tags, 7)
            t_score = transition[:, state] # (num_of_tags,)
            
            previous_all_scores = (previous_score + t_score[:,np.newaxis]).flatten() # (7*num_of_tags, )
            top_7_arg = previous_all_scores.argsort()[-k:][::-1] # (7, )
            argmax[step-1, state] = top_7_arg // 7
            top_7_scores = previous_all_scores[top_7_arg] # (7,)
            if step-1==len(x):
                # last step
                score[step, state] = top_7_scores
            else:
                # all steps before last step
                x_word_index = x[step-1]
                e_score = emission[x_word_index, state] # (1,)
                score[step, state] = top_7_scores + e_score
    return score[1:,:, :], argmax



def viterbi_decode_path_top_k(argmax, eos_index, k=7):
    """
    argmax: (len(x)+1, #tags, 7)
    eos_Index: EOS tag index in tags
    Returns:
            path: (len(x),)
    """
    path = []
    temp_state = eos_index
    
    for i in range(len(argmax)-1, -1, -1):
        temp_state = argmax[i, temp_state][-1] if i == len(argmax)-1 else argmax[i, temp_state][0] # (1,)
        path.append(temp_state)
    return path


def viterbi_top_k(x, transition_matrix, emission_matrix, sos_index, eos_index, tags):
    """
    x: list of int
    transition_matrix: (#tags(with eos,sos), #tags(with eos,sos))
    emission_matrix: (#vocab_size,#tags(with eos,sos) )"""
    score, argmax = viterbi_score_top_k(x, transition_matrix, emission_matrix, sos_index, tags, 7)
    path = viterbi_decode_path_top_k(argmax, eos_index, 7)
    return path[:-1][::-1]

def viterbi_decode_batch_top_k(x, transition_matrix, emission_matrix, sos_index, eos_index, 
                         output_filename, dev_x_filename, word2index, tags):
    with open(output_filename, 'w') as f:
        words, dev_x = get_test_data(dev_x_filename, word2index)
        for i, (ws,o) in enumerate(zip(words, dev_x)):
            path = viterbi_top_k(o, transition_matrix, emission_matrix, sos_index, eos_index, tags)
            for w, p in zip(ws, path):
                f.write(w + ' ' + tags[p] + '\n')
            f.write('\n')   
    return 

### Grab everything together

In [79]:
def q4(train_file, output_filename, dev_x_filename):
    # read data
    words, labels = get_train_data(train_file)
    # create vocab
    tags = list(set([oo for o in labels for oo in o])) + ['SOS', 'EOS']
    tag2index = {o:i for i,o in enumerate(tags)}
    vocab_count = Counter([oo for o in words for oo in o])
    vocab = [o for o, v in dict(vocab_count).items() if v>=3] + ['#UNK#']
    word2index = defaultdict(int)
    for i,o in enumerate(vocab): word2index[o] = i+1
    # text to int
    x = [[word2index[oo] for oo in o] for o in words]
    y = [[tag2index[oo] for oo in o] for o in labels]
    # training emission
    emission_matrix = emission(x, y, vocab, tags)
    emission_matrix += 1e-9
    # training transition
    transition_matrix = transition(y, tags, tag2index)
    # decoding
    viterbi_decode_batch_top_k(x, transition_matrix, emission_matrix,  tags.index('SOS'), tags.index('EOS'), 
                         output_filename, dev_x_filename, word2index, tags)
    return 

## Test model performance

In [80]:
q4(AL_train, AL_out_4, AL_dev_x)
! python {EVAL_script} {AL_dev_y} {AL_out_4}


#Entity in gold data: 8408
#Entity in prediction: 8588

#Correct Entity : 6703
Entity  precision: 0.7805
Entity  recall: 0.7972
Entity  F: 0.7888

#Correct Sentiment : 5993
Sentiment  precision: 0.6978
Sentiment  recall: 0.7128
Sentiment  F: 0.7052


In [81]:
q4(EN_train, EN_out_4, EN_dev_x)
! python {EVAL_script} {EN_dev_y} {EN_out_4}


#Entity in gold data: 13179
#Entity in prediction: 13056

#Correct Entity : 11091
Entity  precision: 0.8495
Entity  recall: 0.8416
Entity  F: 0.8455

#Correct Sentiment : 10667
Sentiment  precision: 0.8170
Sentiment  recall: 0.8094
Sentiment  F: 0.8132
