In [75]:
import os
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np
import pandas as pd

### Get data

In [254]:
# Root directory that holds one sub-folder per dataset (AL, SG, CN, EN).
DATA_FOLDER = Path('./dataset/')

# For every dataset: training file, dev input, dev reference output and the
# file the Part 2 predictions are written to.
AL = DATA_FOLDER.joinpath('AL')
AL_train = AL.joinpath('train')
AL_dev_x = AL.joinpath('dev.in')
AL_dev_y = AL.joinpath('dev.out')
AL_out_2 = AL.joinpath('dev.p2.out')

SG = DATA_FOLDER.joinpath('SG')
SG_train = SG.joinpath('train')
SG_dev_x = SG.joinpath('dev.in')
SG_dev_y = SG.joinpath('dev.out')
SG_out_2 = SG.joinpath('dev.p2.out')

CN = DATA_FOLDER.joinpath('CN')
CN_train = CN.joinpath('train')
CN_dev_x = CN.joinpath('dev.in')
CN_dev_y = CN.joinpath('dev.out')
CN_out_2 = CN.joinpath('dev.p2.out')

EN = DATA_FOLDER.joinpath('EN')
EN_train = EN.joinpath('train')
EN_dev_x = EN.joinpath('dev.in')
EN_dev_y = EN.joinpath('dev.out')
EN_out_2 = EN.joinpath('dev.p2.out')

# Script invoked from the shell to score a prediction file against dev.out.
EVAL_script = './EvalScript/evalResult.py'

In [199]:
def get_train_data(filename):
    """Read a labelled training file into parallel word/tag sequences.

    Each non-blank line holds a word followed by its tag; the tag is the last
    whitespace-separated token, so a word may itself contain spaces. Blank
    (or whitespace-only) lines separate sentences.

    Args:
        filename: path of the training file (read as UTF-8).

    Returns:
        x: nested list of str, one inner list of words per sentence.
        y: nested list of str, the tags aligned with ``x``.

    Raises:
        ValueError: if a non-blank line does not contain both a word and a tag.
    """
    x, y = [], []
    temp_x, temp_y = [], []
    # Explicit encoding: the platform default is locale dependent and is not
    # guaranteed to be able to decode non-ASCII data.
    with open(filename, encoding='utf-8') as f:
        for line_no, l in enumerate(f, 1):
            stripped = l.strip()
            if not stripped:
                # Sentence boundary (same blank-line test as get_test_data).
                x.append(temp_x)
                y.append(temp_y)
                temp_x, temp_y = [], []
                continue
            # Split from the right so that only the final token is the tag.
            parts = stripped.rsplit(None, 1)
            if len(parts) != 2:
                raise ValueError(
                    'line %d of %s: expected "<word> <tag>", got %r'
                    % (line_no, filename, stripped)
                )
            xx, yy = parts
            temp_x.append(xx)
            temp_y.append(yy)
    # The last sentence may not be followed by a blank line.
    if len(temp_x) != 0:
        x.append(temp_x)
        y.append(temp_y)
    assert len(x) == len(y)

    return x, y

def get_test_data(filename, word2index):
    """Read an unlabelled file and convert its words to vocabulary indices.

    Blank (or whitespace-only) lines separate sentences. Only the first
    whitespace-separated token of each non-blank line is used as the word.

    Args:
        filename: path of the input file (read as UTF-8).
        word2index: mapping from word to integer index. If it is a
            ``defaultdict``, unseen words get its default value, but they are
            NOT inserted into the mapping. With a plain ``dict`` an unseen
            word raises ``KeyError``.

    Return:
                x: nested list of string
                x_int: nested list of integer
    """
    x = []
    temp_x = []
    # Explicit encoding: the platform default is locale dependent.
    with open(filename, encoding='utf-8') as f:
        for l in f:
            xx = l.split()
            if not xx:
                # Sentence boundary; empty sentences are kept so the output
                # stays aligned with the input file.
                x.append(temp_x)
                temp_x = []
                continue
            temp_x.append(xx[0])
    if len(temp_x) != 0:
        x.append(temp_x)

    default_factory = getattr(word2index, 'default_factory', None)

    def to_index(word):
        # `defaultdict[word]` would insert every unseen word as a side
        # effect; produce the default value without touching the mapping.
        if default_factory is not None and word not in word2index:
            return default_factory()
        return word2index[word]

    x_int = [[to_index(oo) for oo in o] for o in x]
    return x, x_int

In [201]:
# Build the vocabulary and the tag set from the AL training data.
words, labels = get_train_data(AL_train)
# Sort before assigning indices: iterating a set of strings can yield a
# different order in every Python process, which would make the word/tag
# indices (and argmax tie-breaking downstream) change between kernel restarts.
vocab = sorted({oo for o in words for oo in o})
# SOS/EOS are appended after the real tags, so they take the last two indices.
tags = sorted({oo for o in labels for oo in o}) + ['SOS', 'EOS']
word2index = {o:i for i,o in enumerate(vocab)}
index2word = {i:o for i,o in enumerate(vocab)}
tag2index = {o:i for i,o in enumerate(tags)}
# Integer-encoded corpus: one list of indices per sentence.
x = [[word2index[oo] for oo in o] for o in words]
y = [[tag2index[oo] for oo in o] for o in labels]

### part 1 Emission features

In [297]:
def emission(x, y, vocab, tags):
    """Estimate emission probabilities by relative frequency.

    Entry ``[w, t]`` of the result is count(word w tagged t) / count(tag t).
    Columns of tags that never occur in ``y`` are all zero.

    Args:
        x: nested list of int, word indices per sentence.
        y: nested list of int, tag indices per sentence, aligned with ``x``.
        vocab: sequence whose length gives the number of rows.
        tags: sequence whose length gives the number of columns.

    Returns:
        numpy array of shape ``(len(vocab), len(tags))``.
    """
    counts = np.zeros((len(vocab), len(tags)))
    tag_count = np.zeros(len(tags))
    for sent_x, sent_y in zip(x, y):
        for xx, yy in zip(sent_x, sent_y):
            counts[xx, yy] += 1
            tag_count[yy] += 1

    # Divide only where the tag was observed. This avoids the 0/0 -> NaN
    # RuntimeWarning and the need to patch NaNs afterwards (the previous
    # `np.nan_to_num(emission, 0)` only worked because the positional `0` was
    # taken as `copy=False`).
    probs = np.zeros_like(counts)
    seen = tag_count[None, :] > 0
    np.divide(counts, tag_count[None, :], out=probs, where=seen)
    return probs

In [94]:
# `emission` needs the vocabulary and the tag list to size its matrix.
emission_matrix = emission(x, y, vocab, tags)
print("emission_matrix shape (vocab, tags): ", emission_matrix.shape)
# NOTE(review): `transition` is defined in the "Transition" cell further down;
# that cell has to be executed before this one on a fresh kernel.
transition_matrix = transition(y)
print("transition_matrix shape (transition_from, transition_to): ", transition_matrix.shape)

emission_matrix shape (vocab, tags):  (5321, 44)
transition_matrix shape (transition_from, transition_to):  (44, 44)


  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


### Part 2 Adding smoothing parameters

In [204]:
# Words seen fewer than 3 times in training are treated as unknown.
vocab_count = Counter(oo for o in words for oo in o)
# '#UNK#' is placed first so that it owns index 0, the same index the
# defaultdict hands out for any word that is not in the vocabulary. Every
# index therefore stays inside the (len(vocab), len(tags)) emission matrix.
vocab = ['#UNK#'] + [o for o, v in vocab_count.items() if v >= 3]
word2index = defaultdict(int, {o: i for i, o in enumerate(vocab)})
# `.get` avoids inserting every rare word into the defaultdict as a side effect.
x = [[word2index.get(oo, 0) for oo in o] for o in words]

In [206]:
# `emission` needs the vocabulary and the tag list to size its matrix.
emission_matrix = emission(x, y, vocab, tags)
print("emission_matrix shape (vocab, tags): ", emission_matrix.shape)

emission_matrix shape (vocab, tags):  (2698, 44)


  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


### Emission decoding

In [304]:
def decoding(x, emission_matrix):
    """Pick, for every word, the tag with the highest emission score.

    x: sequence of integer word indices (rows of ``emission_matrix``)
    emission_matrix: array of shape (vocab_size, tag_size)

    Returns an integer array with one tag index per element of ``x``.
    """
    # One row of emission scores per word in the sentence.
    scores_per_word = emission_matrix[x]
    best_tag = np.argmax(scores_per_word, axis=1)
    return best_tag

def batch_decoding(output_filename, dev_x_filename, word2index, emission_matrix, tags):
    """Tag every sentence of a dev file and write the predictions to disk.

    The output has one ``word tag`` pair per line and an empty line after
    each sentence.

    Args:
        output_filename: path of the prediction file to write (UTF-8).
        dev_x_filename: path of the unlabelled input file.
        word2index: mapping from word to row index of ``emission_matrix``.
        emission_matrix: array of shape (vocab_size, tag_size).
        tags: sequence of tag names, indexed by tag index.
    """
    # Read the input first: if it fails, an existing prediction file is not
    # truncated by opening it for writing.
    words, dev_x = get_test_data(dev_x_filename, word2index)
    # Explicit encoding so non-ASCII words can be written on any platform.
    with open(output_filename, 'w', encoding='utf-8') as f:
        for ws,o in zip(words, dev_x):
            path = decoding(o, emission_matrix)
            for w, p in zip(ws, path):
                f.write(w + ' ' + tags[p] + '\n')
            f.write('\n')

### Putting everything together

In [305]:
def main(train_file, output_filename, dev_x_filename, min_count=3):
    """Train the emission-only tagger and write predictions for a dev file.

    Args:
        train_file: path of the labelled training file.
        output_filename: path the predictions are written to.
        dev_x_filename: path of the unlabelled dev input.
        min_count: words seen fewer than this many times in training are
            treated as unknown.
    """
    # read data
    words, labels = get_train_data(train_file)
    # create vocab
    # Sorted so tag indices do not depend on per-process set iteration order;
    # SOS/EOS take the last two indices.
    tags = sorted({oo for o in labels for oo in o}) + ['SOS', 'EOS']
    tag2index = {o:i for i,o in enumerate(tags)}
    vocab_count = Counter(oo for o in words for oo in o)
    # '#UNK#' owns index 0, the value the defaultdict returns for any word
    # outside the vocabulary, so every index is a valid emission-matrix row.
    vocab = ['#UNK#'] + [o for o, v in vocab_count.items() if v >= min_count]
    word2index = defaultdict(int, {o: i for i, o in enumerate(vocab)})
    # text to int (`.get` does not insert rare words into the mapping)
    x = [[word2index.get(oo, 0) for oo in o] for o in words]
    y = [[tag2index[oo] for oo in o] for o in labels]
    # training
    emission_matrix = emission(x, y, vocab, tags)
    # decoding
    batch_decoding(output_filename, dev_x_filename, word2index, emission_matrix, tags)

## Test model performance

##### AL

In [306]:
# Run `main` on the AL data, then score the file at AL_out_2 against
# AL_dev_y with the evaluation script (shell escape).
main(AL_train, AL_out_2, AL_dev_x)
! python {EVAL_script} {AL_dev_y} {AL_out_2}


#Entity in gold data: 8408
#Entity in prediction: 19484

#Correct Entity : 2898
Entity  precision: 0.1487
Entity  recall: 0.3447
Entity  F: 0.2078

#Correct Sentiment : 2457
Sentiment  precision: 0.1261
Sentiment  recall: 0.2922
Sentiment  F: 0.1762


  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


#### SG

In [307]:
# Run `main` on the SG data, then score the file at SG_out_2 against
# SG_dev_y with the evaluation script (shell escape).
main(SG_train, SG_out_2, SG_dev_x)
! python {EVAL_script} {SG_dev_y} {SG_out_2}

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()



#Entity in gold data: 4537
#Entity in prediction: 18451

#Correct Entity : 2632
Entity  precision: 0.1426
Entity  recall: 0.5801
Entity  F: 0.2290

#Correct Sentiment : 1239
Sentiment  precision: 0.0672
Sentiment  recall: 0.2731
Sentiment  F: 0.1078


#### EN

In [308]:
# Run `main` on the EN data, then score the file at EN_out_2 against
# EN_dev_y with the evaluation script (shell escape).
main(EN_train, EN_out_2, EN_dev_x)
! python {EVAL_script} {EN_dev_y} {EN_out_2}

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()



#Entity in gold data: 13179
#Entity in prediction: 19406

#Correct Entity : 9152
Entity  precision: 0.4716
Entity  recall: 0.6944
Entity  F: 0.5617

#Correct Sentiment : 7644
Sentiment  precision: 0.3939
Sentiment  recall: 0.5800
Sentiment  F: 0.4692


#### CN

In [309]:
# Run `main` on the CN data, then score the file at CN_out_2 against
# CN_dev_y with the evaluation script (shell escape).
main(CN_train, CN_out_2, CN_dev_x)
! python {EVAL_script} {CN_dev_y} {CN_out_2}


#Entity in gold data: 1478
#Entity in prediction: 9373

#Correct Entity : 765
Entity  precision: 0.0816
Entity  recall: 0.5176
Entity  F: 0.1410

#Correct Sentiment : 285
Sentiment  precision: 0.0304
Sentiment  recall: 0.1928
Sentiment  F: 0.0525


  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


## Part 3

In [259]:
def main(train_file, output_filename, dev_x_filename):
    """Load a training file and return it as integer-encoded sequences.

    NOTE(review): this re-uses the name `main` of the Part 2 pipeline, so
    running this cell replaces that function in the kernel.

    Args:
        train_file: path of the labelled training file.
        output_filename: accepted for signature compatibility; not used here.
        dev_x_filename: accepted for signature compatibility; not used here.

    Returns:
        x: nested list of int, word indices per sentence (0 = unknown word).
        y: nested list of int, tag indices per sentence.
    """
    # read data (from the file that was passed in, not a fixed dataset)
    words, labels = get_train_data(train_file)
    # create vocab
    # Sorted so tag indices do not depend on per-process set iteration order;
    # SOS/EOS take the last two indices.
    tags = sorted({oo for o in labels for oo in o}) + ['SOS', 'EOS']
    tag2index = {o:i for i,o in enumerate(tags)}
    vocab_count = Counter(oo for o in words for oo in o)
    # '#UNK#' owns index 0, the value used for every out-of-vocabulary word.
    vocab = ['#UNK#'] + [o for o, v in vocab_count.items() if v >= 3]
    word2index = defaultdict(int, {o: i for i, o in enumerate(vocab)})
    # text to int (`.get` does not insert rare words into the mapping)
    x = [[word2index.get(oo, 0) for oo in o] for o in words]
    y = [[tag2index[oo] for oo in o] for o in labels]
    return x, y

### Transition

In [None]:
def transition(y, tag2index=None, tags=None):
    """Estimate tag-to-tag transition probabilities by relative frequency.

    Every sentence is wrapped as ``SOS, ..., EOS`` before counting. Entry
    ``[i, j]`` of the result is count(i followed by j) / count(i followed by
    anything). Rows with no outgoing transition (e.g. EOS) are all zero.

    Args:
        y: nested list of int, tag indices per sentence.
        tag2index: mapping from tag name to index; must contain 'SOS' and
            'EOS'. Defaults to the notebook-level ``tag2index``.
        tags: sequence whose length gives the matrix size. Defaults to the
            notebook-level ``tags``.

    Returns:
        numpy array of shape ``(len(tags), len(tags))``, indexed
        ``[transition_from, transition_to]``.
    """
    # Fall back to the notebook globals so `transition(y)` keeps working.
    if tag2index is None:
        tag2index = globals()['tag2index']
    if tags is None:
        tags = globals()['tags']

    SOS = tag2index['SOS']
    EOS = tag2index['EOS']
    counts = np.zeros((len(tags), len(tags)))

    for sentence in y:
        yy = [SOS] + list(sentence) + [EOS]
        for start, end in zip(yy[:-1], yy[1:]):
            counts[start, end] += 1

    # keepdims makes the row totals a column vector, so row i is divided by
    # its own total. Without it the division broadcasts along the wrong axis.
    row_total = counts.sum(axis=1, keepdims=True)
    probs = np.zeros_like(counts)
    # Skip rows without any outgoing transition instead of producing NaN.
    np.divide(counts, row_total, out=probs, where=row_total > 0)
    return probs

### Inference

In [158]:
def viterbi_decoding(x, transition_matrix, emission_matrix, sos=None, eos=None):
    """Find the most likely tag sequence for one sentence (Viterbi, log space).

    Args:
        x: sequence of int, the word indices of ONE sentence.
        transition_matrix: array (n_tags, n_tags), ``[from, to]`` probabilities.
        emission_matrix: array (vocab_size, n_tags), ``[word, tag]`` probabilities.
        sos: index of the start tag. Defaults to ``n_tags - 2``, matching tag
            lists built as ``real_tags + ['SOS', 'EOS']``.
        eos: index of the end tag. Defaults to ``n_tags - 1``.

    Returns:
        list of int: the best tag index for every position of ``x``
        (empty list for an empty sentence). If every path has zero
        probability the result is an arbitrary path.
    """
    n_tags = transition_matrix.shape[0]
    if sos is None:
        sos = n_tags - 2
    if eos is None:
        eos = n_tags - 1

    n_steps = len(x)
    if n_steps == 0:
        return []

    # log(0) = -inf is the intended "impossible" score, so silence the
    # warning. NaN entries (unobserved rows) are treated as probability 0.
    with np.errstate(divide='ignore'):
        log_t = np.log(np.where(np.isnan(transition_matrix), 0.0, transition_matrix))
        log_e = np.log(np.where(np.isnan(emission_matrix), 0.0, emission_matrix))

    score = np.full((n_steps, n_tags), -np.inf)
    backptr = np.zeros((n_steps, n_tags), dtype=int)

    # First word: transition out of SOS, then emit x[0].
    score[0] = log_t[sos] + log_e[x[0]]
    for step in range(1, n_steps):
        # candidate[i, j]: best score ending in tag i, then moving to tag j.
        candidate = score[step - 1][:, None] + log_t
        backptr[step] = candidate.argmax(axis=0)
        score[step] = candidate.max(axis=0) + log_e[x[step]]

    # Close the sentence with the transition into EOS.
    best = int(np.argmax(score[-1] + log_t[:, eos]))
    path = [best]
    # Follow the back-pointers from the last word to the first.
    for step in range(n_steps - 1, 0, -1):
        best = int(backptr[step, best])
        path.append(best)
    path.reverse()
    return path

In [159]:
# `x` holds every sentence, while viterbi_decoding indexes the emission matrix
# with one word index per step, so decode one sentence at a time.
decoding_path = [viterbi_decoding(o, transition_matrix, emission_matrix) for o in x]