In [1]:
import numpy as np
from collections import defaultdict, Counter
import conlleval
from pathlib import Path
from scipy.optimize import fmin_l_bfgs_b
import copy

save_dir = Path('save')

In [2]:
def get_full_train(filename):
    file_object = open(filename, 'r')
    X = []
    Y1 = []
    Y2 = []

    x_sent = []
    y1_sent = []
    y2_sent = []
    try:
        for line in file_object:
            if len(line) != 1:
                word, pos, ner = line.split()
                x_sent.append(word)
                y1_sent.append(pos)
                y2_sent.append(ner)

            else:
                X.append(x_sent)
                x_sent = []

                Y1.append(y1_sent)
                y1_sent = []

                Y2.append(y2_sent)
                y2_sent = []
    finally:
        file_object.close()
    return X, Y1, Y2

In [3]:
partial_train = "../data/partial/train"
full_train = "../data/full/train"

ninf = -1e5

X, Y_POS, Y_NER = get_full_train(full_train)
print("X length: {}, Y length: {}".format(len(X), len(Y_NER)))

X length: 700, Y length: 700


## Part 1

In [4]:
def get_set(mlist):
    '''
    Assmue mlist is a 2-d list with STRING elements
    '''
    output = set()
    for line in mlist:
        output.update(line)
    return sorted(output)

In [5]:
def emission(Y, X, Y_NER_set, X_set, smooth=0.01):
    '''
    emission: Y -> X
    '''
    count_y = dict()
    count_yx = dict()
    
    for y_token in Y_NER_set:
        count_y[y_token] = len(X_set) * smooth
    
    for i in range(len(X)):
        for y_token, x_token in zip(Y[i], X[i]):
            if (y_token, x_token) in count_yx:
                count_y[y_token] += 1
                count_yx[(y_token, x_token)] += 1
            else:
                count_yx[(y_token, x_token)] = 1
        
    emission_prob = dict()

    for y_token in Y_NER_set:
        for x_token in X_set:
            key = "emission:" + y_token + "+" + x_token
            if (y_token, x_token) in count_yx:
                emission_prob[key] = np.log(count_yx[(y_token, x_token)] / count_y[y_token])
            else:
#                 emission_prob[key] = np.log(smooth / count_y[y_token])
                emission_prob[key] = -1e8
    return emission_prob

def calc_e(x_data, y_data, x_vocab, y_vocab):
    count_emission = Counter([(x,y) for x_instance, y_instance in zip(x_data, y_data) for x, y in zip(x_instance, y_instance)])
    count_label = Counter([oo for o in y_data for oo in o])
    
    e_score = {}
    for y in y_vocab:
        for x in x_vocab:
            feature = f"emission:{y}+{x}"
            
            if (x,y) not in count_emission:
                e_score[feature] = ninf
            else:
                score = np.log(count_emission[(x,y)]  /  count_label[y])
                e_score[feature] = score
    
    return e_score

In [6]:
def transition(Y, Y_NER_set, smooth=0.01):
    count_y = dict()
    count_yy = dict()

    for y_token in Y_NER_set+['START']:
        count_y[y_token] = (len(Y_NER_set) + 1) * smooth
            
    for y_sent in Y:
        y_p = ['START'] + y_sent + ['STOP']
        for i in range(len(y_p)-1):
            if (y_p[i], y_p[i+1]) in count_yy:
                count_y[y_p[i]] += 1
                count_yy[(y_p[i], y_p[i+1])] += 1
            else:
                count_yy[(y_p[i], y_p[i+1])] = 1
    
    transition_prob = dict()
    
    for y1 in ['START']+Y_NER_set:
        for y2 in Y_NER_set+['STOP']:
            if y1 == 'START'and y2 == 'STOP':
                break
            key = "transition:" + y1 + "+" + y2
            if (y1, y2) in count_yy:
                transition_prob[key] = np.log(count_yy[(y1, y2)] / count_y[y1])
            else:
#                 transition_prob[key] = np.log(smooth / count_y[y1])
                transition_prob[key] = -1e8
                
    return transition_prob

def calc_t(y_data, y_vocab):
    count_transition = Counter([ (y_prev, y) for y_instance in y_data for y_prev, y in zip(['START'] + y_instance, y_instance + ['STOP'])])
    count_label = Counter([y for y_instance in y_data for y in ['START'] + y_instance])
    
    f_score = {}
    for y_prev in ['START'] + y_vocab:
        for y in y_vocab + ['STOP']:
            feature = f"transition:{y_prev}+{y}"
            
            if (y_prev,y) not in count_transition:
                f_score[feature] = ninf
            else:
                score = np.log(count_transition[(y_prev,y)]  /  count_label[y_prev])
                f_score[feature] = score
    
    return f_score

In [7]:
X_set = get_set(X)
Y_NER_set = get_set(Y_NER)

# e = emission(Y_NER, X, Y_NER_set, X_set, smooth=0.001)
# t = transition(Y_NER, Y_NER_set, smooth=0.001)
e = calc_e(X, Y_NER, X_set, Y_NER_set)
t = calc_t(Y_NER, Y_NER_set)

W = {**e, **t}


## Part 2

In [8]:
def compute_score(x_sent, y_sent, f_weights):

    score = 0
    y_sent_p = ['START'] + y_sent + ['STOP']
    x_sent_p = ['START'] + x_sent
    for i in range(1, len(x_sent_p)):
        word, tag1, tag2 = x_sent_p[i], y_sent_p[i-1], y_sent_p[i]
        t_key = "transition:" + tag1 + "+" + tag2
        score += f_weights[t_key]
        e_key = "emission:" + tag2 + "+" + word
        score += f_weights[e_key]
        if i == len(x_sent_p) - 1:
            tag1 = y_sent_p[i]
            tag2 = y_sent_p[i+1]
            t_key = "transition:" + tag1 + "+" + tag2
            score += t[t_key]
        
    return score

# def compute_score(x_instance, y_instance, feature_dict):
#     feature_count = defaultdict(int)
    
#     for x, y in zip(x_instance, y_instance): feature_count[f"emission:{y}+{x}"] += 1
    
#     for y_prev, y in zip(['START'] + y_instance, y_instance + ['STOP']):
#         feature_count[f"transition:{y_prev}+{y}"] += 1
        
#     score = sum([feature_dict[feat]*count for feat, count in feature_count.items()])
#     return score

In [9]:
x_instance = "This is the second U.N.-Congolese offensive against militias in the region since the DRC 's constitutional referendum a week ago .".split()
y_instance = 'O O O O O O O O O O O O O B-geo O O O O O O O'.split()
compute_score(x_instance, y_instance, W)

-139.57175855522826

In [10]:
def viterbi(x_sent, f_weights, tags):

    # initialize
    q1 = list()
    for i in range(len(tags)):
        score = 0
        t_key = "transition:START+" + tags[i]
#         print("test 1: ", t[t_key])
        score += f_weights[t_key]
        e_key = "emission:" + tags[i] + "+" + x_sent[0]
#         print("test 2: ", e[e_key])
        score += f_weights[e_key]
        q1.append((i,score))

    argmax = list()
    q2 = []
    for i in range(1, len(x_sent)):
        argmax_sub = list()
        for j in range(len(tags)):
            scores = list()
            for (k, last_score) in q1:
                score = 0
                t_key = "transition:" + tags[k] + "+" + tags[j]
                score += f_weights[t_key]
                e_key = "emission:" + tags[j] + "+" + x_sent[i]
                score += f_weights[e_key]
                scores.append(score+last_score)
            best_score = np.max(scores)
            best_tag = np.argmax(scores)
            q2.append((best_tag, best_score))
            argmax_sub.append(best_tag)
        argmax.append(argmax_sub)
        q1 = q2
        q2 = []

    scores = list()
    for (k, last_score) in q1:
        score = 0
        t_key = "transition:" + tags[k] + "+STOP"
        score += f_weights[t_key]
        scores.append(score+last_score)
    final_best_score = np.max(scores)
    final_best_tag = np.argmax(scores)

    path = [final_best_tag]
    pointer = final_best_tag
    for i in range(len(argmax)-1, -1, -1):
        pointer = argmax[i][pointer]
        path.append(pointer)

    return path[::-1], final_best_score

In [11]:
evals = []
# for i in range(len(X)):
for i in range(len(X)-600):
    path_idx, score = viterbi(X[i], W, Y_NER_set)
    for j in range(len(path_idx)):
        line = "{} {} {} {}".format(X[i][j], Y_POS[i][j], Y_NER[i][j], Y_NER_set[path_idx[j]])
        evals.append(line)
res = conlleval.evaluate(evals)
print(conlleval.report(res))

processed 2202 tokens with 238 phrases; found: 255 phrases; correct: 155.
accuracy:  95.10%; precision:  60.78%; recall:  65.13%; FB1:  62.88
              art: precision: 100.00%; recall: 100.00%; FB1: 100.00  1
              geo: precision:  65.06%; recall:  73.97%; FB1:  69.23  83
              gpe: precision:  93.94%; recall:  96.88%; FB1:  95.38  33
              nat: precision: 100.00%; recall: 100.00%; FB1: 100.00  2
              org: precision:  41.46%; recall:  36.96%; FB1:  39.08  41
              per: precision:  36.96%; recall:  44.74%; FB1:  40.48  46
              tim: precision:  67.35%; recall:  71.74%; FB1:  69.47  49



## Part 3 (i): Calculate Loss by  forward algorithm

In [12]:
# import copy
# np.set_printoptions(precision=2)

# def f1(y1, y2, x):
#     return "transition:"+y1+"+"+y2
# def f2(y1, y2, x):
#     if y2 == "STOP":
#         return 0
# #     print("emission WTD:"+y2+"+"+x)
#     return "emission:"+y2+"+"+x

# def M(y1, y2, x, W, f_ls):
#     output = 0
#     for f in f_ls:
#         output += W[f(y1, y2, x)]
#     return output
# def forward_end(alpha, W, f_ls, Y_NER_set):
#     m = len(Y_NER_set)
        
#     M_stop = np.zeros(m)
#     for i in range(m):
#         M_stop[i] = M(y1=Y_NER_set[i], y2='STOP', x=None, W=W, f_ls=f_ls[:1])
#     mM_stop = M_stop.reshape(m)
    
# #     M_max = np.max(mM_stop)
# #     mM_stop -= M_max    
# #     mM_stop = np.exp(mM_stop)
        
# #     A_max = np.max(alpha)
# #     alpha -= A_max
# #     alpha = np.exp(alpha)
    
# #     alpha = np.log(np.dot(mM_stop, alpha)) + M_max + A_max

#     alpha = alpha.reshape(m)
#     alpha = mM_stop + alpha
#     l_max = np.max(alpha)
#     alpha -= l_max
#     alpha = np.exp(alpha)
#     alpha = np.sum(alpha)
#     alpha = np.log(alpha)
#     alpha = alpha + l_max
    
#     return alpha.item()  

# def f11(mM_mid, alpha, m):
#     a = alpha.reshape(1, -1)
#     a = mM_mid + a
#     l_max = np.max(a, axis=1).reshape(-1, 1)
#     a -= l_max
#     a = np.exp(a)
#     a = np.log(np.sum(a,axis=1).reshape(-1, 1))
#     a = a + l_max
#     return a

# def f12(mM_mid, alpha, m):
#     a = alpha.reshape((m, 1))
#     M_max = np.max(mM_mid, axis=1).reshape((m, 1))
#     mM_mid -= M_max
#     mM_mid = np.exp(mM_mid)
        
#     A_max = np.max(a)
#     a -= A_max
#     a = np.exp(a)
#     a = np.matmul(mM_mid, a)
#     a[a == 0] = 1
#     a = np.log(a) + M_max + A_max
#     return a

# def forward(x_sent, W, f_ls, Y_NER_set, idx):
#     # build M (start M->m*1; mid M->m*m; stop M->m*1)
#     m = len(Y_NER_set)
#     M_start = np.zeros(m)
#     for i in range(m):
#         M_start[i] = M(y1='START', y2=Y_NER_set[i], x=x_sent[0], W=W, f_ls=f_ls[:])
    
#     alpha = M_start.reshape((m,1))
    
#     M_mid = np.zeros((m, m))
#     for i in range(m):
#         for j in range(m):
#             t_score = W.get( f"transition:{Y_NER_set[j]}+{Y_NER_set[i]}", ninf)
# #             M_mid[i][j] = M(y1=Y_NER_set[j], y2=Y_NER_set[i], x=None, W=W, f_ls=f_ls[:1])
#             M_mid[i][j] = t_score
    
#     for i in range(1, idx):
#         mM_mid = copy.deepcopy(M_mid)
#         for j in range(m):
#             e_score = W.get( f"emission:{Y_NER_set[j]}+{x_sent[i]}", ninf)
#             s = e_score
# # mM_mid[j,:] += M(y1=None, y2=Y_NER_set[j], x=x_sent[i], W=W, f_ls=f_ls[1:])
#             mM_mid[:,j] += s

# #         M_max = np.max(mM_mid, axis=1).reshape((m, 1))
# #         mM_mid -= M_max
# #         mM_mid = np.exp(mM_mid)
        
# #         alpha = alpha.reshape((m,1))
# #         A_max = np.max(alpha)
# #         alpha -= A_max
# #         alpha = np.exp(alpha)
# #         alpha = np.log(np.matmul(mM_mid, alpha)+1) + M_max + A_max
        
# #         alpha = alpha.reshape(1, -1)
# #         alpha = mM_mid + alpha
# #         l_max = np.max(alpha, axis=1).reshape(-1, 1)
# #         alpha -= l_max
# #         alpha = np.exp(alpha)
# #         alpha = np.log(np.sum(alpha,axis=1).reshape(-1, 1))
# #         alpha = alpha + l_max
#             alpha = f11(mM_mid, alpha, m=m)

#     return alpha


# def get_loss(X, Y_NER, W, f_ls):
#     loss = 0
#     for x_sent, y_sent in zip(X, Y_NER):
#         score = compute_score(x_sent, y_sent, W)
#         alpha = forward(x_sent, W, f_ls, Y_NER_set, len(x_sent))
#         forward_score = forward_end(alpha, W, f_ls, Y_NER_set)
#         loss += forward_score - score
# #         print("score:{}, f:{}".format(score, forward_score))
# #         print("------------")
#     return loss

# get_loss(X[1:2], Y_NER[1:2], W, [f1, f2])
# # get_loss([x_instance], [y_instance], W, [f1, f2])

In [13]:
def logsumexp(a):
    b = a.max()
    return  b + np.log( (np.exp(a-b)).sum() )

def feature_emission(y_prev, y_now, x_sent, W, idx):
    if idx == 0:
        return 'NOT IN'
    else:
        key = f"emission:{y_prev}+{x_sent[idx-1]}"
        return key
    
def feature_transition(y_prev, y_now, x_sent, W, idx):
    key = f"transition:{y_prev}+{y_now}"
    return key

f_ls = [feature_emission, feature_transition]

def forward_(x_instance, y_vocab, W, feature_ls=f_ls):
    n, d = len(x_instance), len(y_vocab)
    scores = np.zeros( (n,d) )
    
    for i, y in enumerate(y_vocab):
        score = 0
        for f in feature_ls:
            score += W.get(f(y_prev='START', y_now=y, x_sent=x_instance, W=W, idx=0), 0)
        scores[0, i] = score
    
    for i in range(1, n):
        for y_i, y in enumerate(y_vocab):
            temp = copy.deepcopy(scores[i-1, :]) 
            for y_prev_i, y_prev in enumerate(y_vocab):
                score = 0
                for f in feature_ls:
                    score += W.get(f(y_prev=y_prev, y_now=y, x_sent=x_instance, W=W, idx=i), 0)
                temp[y_prev_i] += score
            scores[i, y_i] = logsumexp(temp)
    
    temp = copy.deepcopy(scores[-1, :])
    for i, y_prev in enumerate(y_vocab):
        score = 0
        for f in feature_ls:
            score += W.get(f(y_prev=y_prev, y_now='STOP', x_sent=x_instance, W=W, idx=n), 0)
        temp[i] += score
    alpha = logsumexp(np.array(temp))
    
    return scores, alpha



def loss_fn_instance_(x_instance, y_instance, feature_dict, y_vocab):
    first_term = compute_score(x_instance, y_instance, feature_dict)
    _, forward_score = forward_(x_instance, y_vocab, feature_dict, feature_ls=f_ls)
    return forward_score - first_term

In [14]:
def forward(x_instance, y_vocab, feature_dict):
    n, d = len(x_instance), len(y_vocab)
    scores = np.zeros( (n,d) )
    
    for i, y in enumerate(y_vocab):
        t_score = feature_dict.get( f"transition:START+{y}", ninf)
        scores[0, i] = t_score
    
    for i in range(1, n):
        for y_i, y in enumerate(y_vocab):
            temp = []
            for y_prev_i, y_prev in enumerate(y_vocab):
                t_score = feature_dict.get( f"transition:{y_prev}+{y}", ninf)
                e_score = feature_dict.get( f"emission:{y_prev}+{x_instance[i-1]}", ninf)
                temp.append(e_score + t_score + scores[i-1, y_prev_i])
            scores[i, y_i] = logsumexp(np.array(temp))
    
    temp = []
    for i, y_prev in enumerate(y_vocab):
        t_score = feature_dict.get( f"transition:{y_prev}+STOP", ninf)
        e_score = feature_dict.get( f"emission:{y_prev}+{x_instance[-1]}", ninf)
        temp.append(e_score + t_score + scores[-1, i])
    alpha = logsumexp(np.array(temp))
    
    return scores, alpha

def backward(x_instance, y_vocab, feature_dict, aggreg_fn=logsumexp):
    n, d = len(x_instance), len(y_vocab)
    scores = np.zeros( (n,d) )
    
    for i, y in enumerate(y_vocab):
        t_score = feature_dict.get( f"transition:{y}+STOP", ninf)
        e_score = feature_dict.get( f"emission:{y}+{x_instance[-1]}", ninf)
        scores[-1, i] = t_score + e_score
        
    for i in range(n-1, 0, -1):
        for y_i, y in enumerate(y_vocab):
            temp = []
            for y_next_i, y_next in enumerate(y_vocab):
                t_score = feature_dict.get( f"transition:{y}+{y_next}", ninf)
                e_score = feature_dict.get( f"emission:{y}+{x_instance[i-1]}")
                temp.append(e_score + t_score + scores[i, y_next_i])
            scores[i-1, y_i] = aggreg_fn(np.array(temp))
            
    temp = []
    for i, y_next in enumerate(y_vocab):
        t_score = feature_dict.get( f"transition:START+{y_next}")
        temp.append(t_score + scores[0, i])
    beta = aggreg_fn(np.array(temp))
    
    return scores, beta

## Part 3 (ii): Gradients by forward n backward

In [23]:

def backward_(x_instance, y_vocab, W, feature_ls=f_ls):
    n, d = len(x_instance), len(y_vocab)
    scores = np.zeros( (n,d) )
    
    for i, y in enumerate(y_vocab):
        s = 0
        for f in feature_ls:
            s += W.get(f(y_prev=y, y_now='STOP', x_sent=x_instance, W=W, idx=n), 0)
        scores[-1, i] = s
        
        
    for i in range(n-1, 0, -1):
        for y_i, y in enumerate(y_vocab):
            temp = copy.deepcopy(scores[i,:])
            for y_next_i, y_next in enumerate(y_vocab):
                s = 0
                for f in feature_ls:
                    s += W.get(f(y_prev=y, y_now=y_next, x_sent=x_instance, W=W, idx=i), 0)
                temp[y_next_i] += s
            scores[i-1, y_i] = logsumexp(np.array(temp))
            
    temp = copy.deepcopy(scores[0,:])
    for i, y_next in enumerate(y_vocab):
        s = 0
        for f in feature_ls:
            s += W.get(f(y_prev='START', y_now=y_next, x_sent=x_instance, W=W, idx=0), 0)
        temp[i] += s
    beta = logsumexp(np.array(temp))
    
    return scores, beta


def forward_backward(x_instance, y_vocab, feature_dict):
    n, d = len(x_instance), len(y_vocab)
    f_scores, alpha = forward_(x_instance, y_vocab, feature_dict)
    b_scores, beta = backward_(x_instance, y_vocab, feature_dict)
    
    feature_expected_count = defaultdict(float)
    
    for i in range(n):
        for y_i, y in enumerate(y_vocab):
            e_feature = f"emission:{y}+{x_instance[i]}"
            feature_expected_count[e_feature] += np.exp(f_scores[i, y_i] + b_scores[i, y_i] - alpha)
            
    for i, y_next in enumerate(y_vocab):
        t_feature = f"transition:START+{y_next}"
        feature_expected_count[t_feature] += np.exp(f_scores[0, i] + b_scores[0, i] - alpha)
        
        t_feature = f"transition:{y_next}+STOP"
        feature_expected_count[t_feature] += np.exp(f_scores[-1, i] + b_scores[-1, i] - alpha)
        
    for y_i, y in enumerate(y_vocab):
        for y_next_i, y_next in enumerate(y_vocab):
            t_feature = f"transition:{y}+{y_next}"
            t_score = feature_dict.get(t_feature, ninf)
            total = 0
            for i in range(n-1):
                e_score = feature_dict.get(f"emission:{y}+{x_instance[i]}", ninf)
                total += np.exp(f_scores[i, y_i] + b_scores[i+1, y_next_i] + t_score + e_score - alpha)
            feature_expected_count[t_feature] = total
            
    return feature_expected_count

def forward_backward_(x_instance, y_vocab, W, tf_ls=[feature_transition], sf_ls=[feature_emission]):
    n, d = len(x_instance), len(y_vocab)
    f_scores, alpha = forward_(x_instance, y_vocab, W, feature_ls=f_ls)
    b_scores, beta = backward_(x_instance, y_vocab, W, feature_ls=f_ls)
    
    feature_expected_count = defaultdict(float)
    
    for i in range(n):
        for y_i, y in enumerate(y_vocab):
            
            for sf in sf_ls:
                s_key = sf(y_prev=y, y_now=None, x_sent=x_instance, W=W, idx=i+1)
                feature_expected_count[s_key] += np.exp(f_scores[i, y_i] + b_scores[i, y_i] - alpha)
    
    for i in range(n+1):
        if i == 0:
            for yi, y in enumerate(y_vocab):
                for f in tf_ls:
                    key = f(y_prev='START', y_now=y, x_sent=x_instance, W=W, idx=0)
                    feature_expected_count[key] += np.exp(f_scores[i, yi] + b_scores[i, yi] - alpha)
        elif i == n:
            for yi, y in enumerate(y_vocab):
                for f in tf_ls:
                    key = f(y_prev=y, y_now='STOP', x_sent=x_instance, W=W, idx=n)
                    feature_expected_count[key] += np.exp(f_scores[n-1, yi] + b_scores[n-1, yi] - alpha)
        else:
            for y_prev_i, y_prev in enumerate(y_vocab):
                for y_now_i, y_now in enumerate(y_vocab):
                    
                    tkey_ls = []
                    t_score = 0
                    for tf in tf_ls:
                        t_key = tf(y_prev=y_prev, y_now=y_now, x_sent=x_instance, W=W, idx=i)
                        t_score += W.get(t_key, ninf)
                        tkey_ls.append(t_key)
                    s_score = 0
                    for sf in sf_ls:
                        s_key = sf(y_prev=y_prev, y_now=y_now, x_sent=x_instance, W=W, idx=i)
                        s_score += W.get(s_key, ninf)
                        
                    for t_key in tkey_ls:
                        feature_expected_count[t_key] += \
                            np.exp(f_scores[i-1, y_prev_i] + b_scores[i, y_now_i] + t_score + s_score - alpha)
            
    return feature_expected_count

# TEST
# y_vocab = Y_NER_set
# print(forward(x_instance, y_vocab, W)[1])
# print(forward_(x_instance, y_vocab, W)[1])
# print(backward(x_instance, y_vocab, W)[1])
# print(backward_(x_instance, y_vocab, W)[1])

l1 = forward_backward(X[43], Y_NER_set, W)
l2 = forward_backward_(X[43], Y_NER_set, W)
# print(l1)
# print(l2)
print(l1==l2)
l1 = list(set(forward_backward(X[23], Y_NER_set, W)))
l2 = list(set(forward_backward_(X[23], Y_NER_set, W)))
print(l1==l2)
l1 = list(set(forward_backward(X[22], Y_NER_set, W)))
l2 = list(set(forward_backward_(X[22], Y_NER_set, W)))
print(l1==l2)

def get_feature_count(x_instance, y_instance, feature_dict):
    feature_count = defaultdict(int)
    
    for x, y in zip(x_instance, y_instance): 
        feature_count[f"emission:{y}+{x}"] += 1
    
    for y_prev, y in zip(['START'] + y_instance, y_instance + ['STOP']):
        feature_count[f"transition:{y_prev}+{y}"] += 1
    
    return feature_count

True
True
True


## Part 4 (i): Gradient and training with regularization

In [24]:
def gradient_fn(x_data, y_data, feature_dict, y_vocab, eta=0.1):
    feature_grad = defaultdict(float)
    
    for x_instance, y_instance in zip(x_data, y_data):
        feature_expected_counts = forward_backward_(x_instance, y_vocab, feature_dict)
        feature_actual_counts = get_feature_count(x_instance, y_instance, feature_dict)
        for k, v in feature_expected_counts.items(): 
            feature_grad[k] += v
        for k, v in feature_actual_counts.items(): 
            feature_grad[k] -= v    
    
    if eta > 0: 
        for k, v in feature_dict.items(): 
            feature_grad[k] += 2*eta*v
    
    return feature_grad
        
    
def loss_fn(x_data, y_data, feature_dict, y_vocab, eta=0):
    loss = 0
    for x_instance, y_instance in zip(x_data, y_data):
        loss += loss_fn_instance_(x_instance, y_instance, feature_dict, y_vocab) 
    reg_loss = eta * sum([o**2 for o in feature_dict.values()]) if eta > 0 else 0
    return loss + reg_loss

In [25]:
# Gradient verification
feature_key_checks = ['emission:O+the', 'transition:START+O', 'transition:O+O', 'transition:I-per+I-per']
feature_gradients = gradient_fn(X, Y_NER, W, Y_NER_set, eta=0)
loss1 = loss_fn(X, Y_NER, W, Y_NER_set, eta=0)
delta = 1e-6

for feat_k in feature_key_checks:
    new_W = W.copy()
    new_W[feat_k] += delta
    loss2 = loss_fn(X, Y_NER, new_W, Y_NER_set, eta=0)
    numerical_grad = (loss2 - loss1) / delta
    analytic_grad = feature_gradients[feat_k]
    if abs(numerical_grad - analytic_grad) / max(abs(numerical_grad), 1e-8) < 1e-5: 
        print(f'{feat_k:>40} passed gradient checking!')
    else:
        print(f'{feat_k:>40} didnot pass gradient checking!')

                          emission:O+the passed gradient checking!
                      transition:START+O passed gradient checking!
                          transition:O+O passed gradient checking!
                  transition:I-per+I-per passed gradient checking!


In [None]:
def numpy_to_dict(weight, feature_dict):
    for i,k in enumerate(feature_dict.keys()):
        feature_dict[k] = weight[i]
    return feature_dict

def dict_to_numpy(grads, feature_dict):
    np_grads = np.zeros(len(feature_dict))
    for i, k in enumerate(feature_dict.keys()):
        np_grads[i] = grads[k]
    return np_grads

def get_loss_grad(weight, *args):
    x_data, y_data, feature_dict, y_vocab = args
    X, Y_NER, W, Y_NER_set = args
    feature_dict = numpy_to_dict(weight, W)
    loss = loss_fn(X, Y_NER, W, Y_NER_set, eta=0.1)
    grads = gradient_fn(X, Y_NER, W, Y_NER_set, eta=0.1)
    grads = dict_to_numpy(grads, W)
    return loss, grads


def callbackF(weight): print(f'Loss: \t {loss_fn(X, Y_NER, W, Y_NER_set, eta=0.1):.4f}')

# Initialization
init_weight = np.zeros(len(W))
feature_dict = numpy_to_dict(init_weight, W)


# Training
optimal_w, final_loss, _ = fmin_l_bfgs_b( 
    get_loss_grad, init_weight, pgtol=0.01, callback=callbackF,
    args=(X, Y_NER, W, Y_NER_set) 
)

# Save weights
feature_dict = numpy_to_dict(optimal_w, W)
weight_name = save_dir/'partial-part4-1.json'
with open(weight_name, 'w') as f: 
    json.dump(W, f)

## Part 4 (ii): write to file

In [None]:
# weight_name = save_dir/'partial-part4-1.json'
# with open(weight_name) as f: feature_dict = json.load(f)

# y_preds = inference(partial_dir/'dev.in', y_vocab, feature_dict, partial_dir/'dev.p4.out')
# y_label = read_data(partial_dir/'dev.out', column=1)

# y_label = [oo for o in y_label for oo in o+['O']]
# y_preds = [oo for o in y_preds for oo in o+['O']]

# prec, rec, f1 = evaluate(y_label, y_preds, verbose=False)
# print(f'precision: {prec:.3f} \t rec: {rec:.3f} \t f1 {f1:.3f}')

## Part 5 (i) & (ii): POS & Combined feature

In [26]:
def feature_POS(y_prev, y_now, x_sent, W, idx):
    return f"emission:{y_prev}+{x_sent[idx-1]}"

def feature_combine(y_prev, y_now, x_sent, W, idx):
    return f"combine:{y_prev}+{y_now}+{x_sent[idx]}"



## Part 6 (i): Design new features