In [1]:
from pathlib import Path
from collections import defaultdict, Counter
import numpy as np
from conlleval import evaluate
from scipy.optimize import fmin_l_bfgs_b
import json
ninf = -1e9

In [2]:
def read_data(path, column=0):
    """column=0 means input sequence, column=1 means label
    """
    with open(path) as f:
        lines = f.readlines()
    
    data = []
    sample = []
    
    for line in lines:
        formatted_line = line.strip()
        
        if len(formatted_line) > 0:
            split_data = formatted_line.split(" ")
            sample.append(split_data[column])

        else:
            data.append(sample)
            sample = []
            
    return data

In [3]:
full_dir = Path('full')
partial_dir = Path('partial')
save_dir = Path('save')
x_data, y_data = read_data(partial_dir/'train', 0), read_data(partial_dir/'train', 1)

In [4]:
# number of instances in the dataset
len(x_data), len(y_data)

(700, 700)

In [5]:
# y labels
y_vocab = sorted(list(set([oo for o in y_data for oo in o]))); y_vocab

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-org',
 'I-per',
 'I-tim',
 'O']

In [6]:
# x vocab
x_vocab = list(set([oo for o in x_data for oo in o])); len(x_vocab)

4068

## Part I (i): Emission scores

In [7]:
def calc_e(x_data, y_data, x_vocab, y_vocab):
    count_emission = Counter([(x,y) for x_instance, y_instance in zip(x_data, y_data) for x, y in zip(x_instance, y_instance)])
    count_label = Counter([oo for o in y_data for oo in o])
    
    e_score = {}
    for y in y_vocab:
        for x in x_vocab:
            feature = f"emission:{y}+{x}"
            
            if (x,y) not in count_emission:
                e_score[feature] = ninf
            else:
                score = np.log(count_emission[(x,y)]  /  count_label[y])
                e_score[feature] = score
    
    return e_score


emission_dict = calc_e(x_data, y_data, x_vocab, y_vocab)

## Part I (ii): Transition scores

In [8]:
def calc_t(y_data, y_vocab):
    count_transition = Counter([ (y_prev, y) for y_instance in y_data for y_prev, y in zip(['START'] + y_instance, y_instance + ['STOP'])])
    count_label = Counter([y for y_instance in y_data for y in ['START'] + y_instance])
    
    f_score = {}
    for y_prev in ['START'] + y_vocab:
        for y in y_vocab + ['STOP']:
            feature = f"transition:{y_prev}+{y}"
            
            if (y_prev,y) not in count_transition:
                f_score[feature] = ninf
            else:
                score = np.log(count_transition[(y_prev,y)]  /  count_label[y_prev])
                f_score[feature] = score
    
    return f_score

transition_dict = calc_t(y_data, y_vocab)
feature_dict = {**transition_dict, **emission_dict}

## Part II (i): Compute Score

In [9]:
def compute_score(x_instance, y_instance, feature_dict):
    feature_count = defaultdict(int)
    
    for x, y in zip(x_instance, y_instance): feature_count[f"emission:{y}+{x}"] += 1
    
    for y_prev, y in zip(['START'] + y_instance, y_instance + ['STOP']):
        feature_count[f"transition:{y_prev}+{y}"] += 1
        
    score = sum([feature_dict[feat]*count for feat, count in feature_count.items()])
    return score

x_instance = "This is the second U.N.-Congolese offensive against militias in the region since the DRC   's constitutional referendum a week ago .".split()
y_instance = 'O    O  O   O      O              O         O       O        O  O   O      O     O   B-geo O  O              O          O O    O   O'.split()
compute_score(x_instance, y_instance, feature_dict)

-139.57175855522826

## Part II (ii): Viterbi decoding

In [10]:
def viterbi(x_instance, y_vocab, feature_dict):
    n, d = len(x_instance), len(y_vocab)
    scores = np.full( (n,d), -np.inf) # initialize to be very negative
    bp = np.full( (n,d), 0, dtype=np.int)
    
    for i, y in enumerate(y_vocab):
        t_score = feature_dict.get( f"transition:START+{y}",  ninf)
        e_score = feature_dict.get( f"emission:{y}+{x_instance[0]}",  ninf)
        scores[0, i] = t_score + e_score
        
    for i in range(1, n):
        for y_i, y in enumerate(y_vocab):
            for y_prev_i, y_prev in enumerate(y_vocab):
                t_score = feature_dict.get( f"transition:{y_prev}+{y}", ninf)
                e_score = feature_dict.get( f"emission:{y}+{x_instance[i]}", ninf)
                score = t_score + e_score + scores[i-1, y_prev_i]
                if score > scores[i, y_i]:
                    scores[i, y_i] = score
                    bp[i, y_i] = y_prev_i
    
    final_score, final_bp = ninf, 0
    for i, y_prev in enumerate(y_vocab):
        t_score = feature_dict.get( f"transition:{y_prev}+STOP", ninf)
        score = t_score + scores[n-1, i]
        if score > final_score: 
            final_score = score
            final_bp = i
    decoded_sequence = [ y_vocab[final_bp], ]
    for i in range(n-1, 0, -1):
        final_bp = bp[i, final_bp]
        decoded_sequence = [ y_vocab[final_bp] ] + decoded_sequence
        
    return decoded_sequence

print("y_pred: ", " ".join(viterbi(x_instance, y_vocab, feature_dict)))
print("y_labl: ", " ".join(y_instance))

y_pred:  O O O O O O O O O O O O O B-geo O O O O O O O
y_labl:  O O O O O O O O O O O O O B-geo O O O O O O O


In [11]:
def inference(in_file_path, y_vocab, feature_dict, out_file_path):
    x_data = read_data(in_file_path, column=0)
    y_preds = []
    with open(out_file_path, 'w') as f:
        for x_instance in x_data:
            pred = viterbi(x_instance, y_vocab, feature_dict)
            for word, label in zip(x_instance, pred): f.write(f"{word} {label} \n")
            f.write('\n')
            y_preds.append(pred)
    return y_preds

y_preds = inference(partial_dir/'dev.in', y_vocab, feature_dict, partial_dir/'dev.p2.out')
y_label = read_data(partial_dir/'dev.out', column=1)

y_label = [oo for o in y_label for oo in o+['O']]
y_preds = [oo for o in y_preds for oo in o+['O']]

prec, rec, f1 = evaluate(y_label, y_preds, verbose=False)
print(f'precision: {prec:.3f} \t rec: {rec:.3f} \t f1 {f1:.3f}')

precision: 49.442 	 rec: 56.356 	 f1 52.673


## Part III (i): CRF loss
refer to [machine learning slides](https://drive.google.com/file/d/1RfPcnQigx4jdLtnTjjjI1Jgd2UufexhV/view?usp=sharing) for details

In [12]:
def logsumexp(a):
    b = a.max()
    return  b + np.log( (np.exp(a-b)).sum() )

def forward(x_instance, y_vocab, feature_dict):
    n, d = len(x_instance), len(y_vocab)
    scores = np.zeros( (n,d) )
    
    for i, y in enumerate(y_vocab):
        t_score = feature_dict.get( f"transition:START+{y}", ninf)
        scores[0, i] = t_score
    
    for i in range(1, n):
        for y_i, y in enumerate(y_vocab):
            temp = []
            for y_prev_i, y_prev in enumerate(y_vocab):
                t_score = feature_dict.get( f"transition:{y_prev}+{y}", ninf)
                e_score = feature_dict.get( f"emission:{y_prev}+{x_instance[i-1]}", ninf)
                temp.append(e_score + t_score + scores[i-1, y_prev_i])
            scores[i, y_i] = logsumexp(np.array(temp))
    
    temp = []
    for i, y_prev in enumerate(y_vocab):
        t_score = feature_dict.get( f"transition:{y_prev}+STOP", ninf)
        e_score = feature_dict.get( f"emission:{y_prev}+{x_instance[-1]}", ninf)
        temp.append(e_score + t_score + scores[-1, i])
    alpha = logsumexp(np.array(temp))
    
    return scores, alpha



def loss_fn_instance(x_instance, y_instance, feature_dict, y_vocab):
    first_term = compute_score(x_instance, y_instance, feature_dict)
    _, forward_score = forward(x_instance, y_vocab, feature_dict)
    return forward_score - first_term


loss_fn_instance(x_instance, y_instance, feature_dict, y_vocab)

1.3302877199753311

## Part III (ii): forward backward

In [13]:
def backward(x_instance, y_vocab, feature_dict, aggreg_fn=logsumexp):
    n, d = len(x_instance), len(y_vocab)
    scores = np.zeros( (n,d) )
    
    for i, y in enumerate(y_vocab):
        t_score = feature_dict.get( f"transition:{y}+STOP", ninf)
        e_score = feature_dict.get( f"emission:{y}+{x_instance[-1]}", ninf)
        scores[-1, i] = t_score + e_score
        
    for i in range(n-1, 0, -1):
        for y_i, y in enumerate(y_vocab):
            temp = []
            for y_next_i, y_next in enumerate(y_vocab):
                t_score = feature_dict.get( f"transition:{y}+{y_next}", ninf)
                e_score = feature_dict.get( f"emission:{y}+{x_instance[i-1]}")
                temp.append(e_score + t_score + scores[i, y_next_i])
            scores[i-1, y_i] = aggreg_fn(np.array(temp))
            
    temp = []
    for i, y_next in enumerate(y_vocab):
        t_score = feature_dict.get( f"transition:START+{y_next}")
        temp.append(t_score + scores[0, i])
    beta = aggreg_fn(np.array(temp))
    
    return scores, beta



def forward_backward(x_instance, y_vocab, feature_dict):
    n, d = len(x_instance), len(y_vocab)
    f_scores, alpha = forward(x_instance, y_vocab, feature_dict)
    b_scores, beta = backward(x_instance, y_vocab, feature_dict)
    
    feature_expected_count = defaultdict(float)
    
    for i in range(n):
        for y_i, y in enumerate(y_vocab):
            e_feature = f"emission:{y}+{x_instance[i]}"
            feature_expected_count[e_feature] += np.exp(f_scores[i, y_i] + b_scores[i, y_i] - alpha)
            
    for i, y_next in enumerate(y_vocab):
        t_feature = f"transition:START+{y_next}"
        feature_expected_count[t_feature] += np.exp(f_scores[0, i] + b_scores[0, i] - alpha)
        
        t_feature = f"transition:{y_next}+STOP"
        feature_expected_count[t_feature] += np.exp(f_scores[-1, i] + b_scores[-1, i] - alpha)
        
    for y_i, y in enumerate(y_vocab):
        for y_next_i, y_next in enumerate(y_vocab):
            t_feature = f"transition:{y}+{y_next}"
            t_score = feature_dict.get(t_feature, ninf)
            total = 0
            for i in range(n-1):
                e_score = feature_dict.get(f"emission:{y}+{x_instance[i]}", ninf)
                total += np.exp(f_scores[i, y_i] + b_scores[i+1, y_next_i] + t_score + e_score - alpha)
            feature_expected_count[t_feature] = total
            
    return feature_expected_count

def get_feature_count(x_instance, y_instance, feature_dict):
    feature_count = defaultdict(int)
    
    for x, y in zip(x_instance, y_instance): feature_count[f"emission:{y}+{x}"] += 1
    
    for y_prev, y in zip(['START'] + y_instance, y_instance + ['STOP']):
        feature_count[f"transition:{y_prev}+{y}"] += 1
    
    return feature_count


    

## Part IV (i): gradient and training with regularization

In [14]:
def gradient_fn(x_data, y_data, feature_dict, y_vocab, eta=0.1):
    feature_grad = defaultdict(float)
    
    for x_instance, y_instance in zip(x_data, y_data):
        feature_expected_counts = forward_backward(x_instance, y_vocab, feature_dict)
        feature_actual_counts = get_feature_count(x_instance, y_instance, feature_dict)
        for k, v in feature_expected_counts.items(): feature_grad[k] += v
        for k, v in feature_actual_counts.items(): feature_grad[k] -= v    
    
    if eta > 0: 
        for k, v in feature_dict.items(): feature_grad[k] += 2*eta*v
    
    return feature_grad
        
    
def loss_fn(x_data, y_data, feature_dict, y_vocab, eta=0):
    loss = 0
    for x_instance, y_instance in zip(x_data, y_data):
        loss += loss_fn_instance(x_instance, y_instance, feature_dict, y_vocab) 
    reg_loss = eta * sum([o**2 for o in feature_dict.values()]) if eta > 0 else 0
    return loss + reg_loss

In [15]:
# Gradient verification
feature_key_checks = ['emission:O+the', 'transition:START+O', 'transition:O+O', 'transition:I-per+I-per']
feature_gradients = gradient_fn(x_data, y_data, feature_dict, y_vocab, eta=0)
loss1 = loss_fn(x_data, y_data, feature_dict, y_vocab, eta=0)
delta = 1e-6

for feat_k in feature_key_checks:
    new_feature_dict = feature_dict.copy()
    new_feature_dict[feat_k] += delta
    loss2 = loss_fn(x_data, y_data, new_feature_dict, y_vocab, eta=0)
    numerical_grad = (loss2 - loss1) / delta
    analytic_grad = feature_gradients[feat_k]
    if abs(numerical_grad - analytic_grad) / max(abs(numerical_grad), 1e-8) < 1e-5: 
        print(f'{feat_k:>40} passed gradient checking!')
    else:
        print(f'{feat_k:>40} didnot pass gradient checking!')

      emission:O+the passed gradient checking!
  transition:START+O passed gradient checking!
      transition:O+O passed gradient checking!
transition:I-per+I-per passed gradient checking!


In [16]:
# Helper function
def numpy_to_dict(weight, feature_dict):
    for i,k in enumerate(feature_dict.keys()):
        feature_dict[k] = weight[i]
    return feature_dict

def dict_to_numpy(grads, feature_dict):
    np_grads = np.zeros(len(feature_dict))
    for i, k in enumerate(feature_dict.keys()):
        np_grads[i] = grads[k]
    return np_grads

def get_loss_grad(weight, *args):
    x_data, y_data, feature_dict, y_vocab = args
    feature_dict = numpy_to_dict(weight, feature_dict)
    loss = loss_fn(x_data, y_data, feature_dict, y_vocab, eta=0.1)
    grads = gradient_fn(x_data, y_data, feature_dict, y_vocab, eta=0.1)
    grads = dict_to_numpy(grads, feature_dict)
    return loss, grads

def callbackF(weight): print(f'Loss: \t {loss_fn(x_data, y_data, feature_dict, y_vocab, eta=0.1):.4f}')



# Initialization
init_weight = np.zeros(len(feature_dict))
feature_dict = numpy_to_dict(init_weight, feature_dict)


# Training
result = fmin_l_bfgs_b( 
    get_loss_grad, init_weight, pgtol=0.01, callback=callbackF,
    args=(x_data, y_data, feature_dict, y_vocab) 
)

# Save weights
feature_dict = numpy_to_dict(result[0], feature_dict)
weight_name = save_dir/'partial-part4-1.json'
with open(weight_name, 'w') as f: json.dump(feature_dict, f)

Loss: 	 18333.7955
Loss: 	 14132.2391
Loss: 	 13066.3711
Loss: 	 12663.9164
Loss: 	 12303.2443
Loss: 	 11099.0803
Loss: 	 10489.4525
Loss: 	 9614.8077
Loss: 	 9067.4347
Loss: 	 8394.3281
Loss: 	 8004.4294
Loss: 	 7673.0057
Loss: 	 7275.4810
Loss: 	 6911.1102
Loss: 	 6710.6507
Loss: 	 6469.9394
Loss: 	 6113.1164
Loss: 	 6011.9983
Loss: 	 5886.4793
Loss: 	 5688.9170
Loss: 	 5620.4006
Loss: 	 5526.2114
Loss: 	 5410.1797
Loss: 	 5263.5436
Loss: 	 5010.8256
Loss: 	 4891.2793
Loss: 	 4772.6344
Loss: 	 4602.0067
Loss: 	 4504.9576
Loss: 	 4438.6535
Loss: 	 4382.5379
Loss: 	 4312.2697
Loss: 	 4146.8862
Loss: 	 3931.2384
Loss: 	 3877.3722
Loss: 	 3638.9999
Loss: 	 3547.3763
Loss: 	 3424.4592
Loss: 	 3289.2315
Loss: 	 3263.2814
Loss: 	 3149.6003
Loss: 	 3129.3778
Loss: 	 3088.5620
Loss: 	 3031.8828
Loss: 	 2977.3570
Loss: 	 2917.4135
Loss: 	 2890.3171
Loss: 	 2833.4111
Loss: 	 2783.4258
Loss: 	 2686.1813
Loss: 	 2643.4129
Loss: 	 2600.7354
Loss: 	 2583.4919
Loss: 	 2549.4932
Loss: 	 2524.9203
Los

## Part 4 (ii): write to file

In [22]:
weight_name = save_dir/'partial-part4-1.json'
with open(weight_name) as f: feature_dict = json.load(f)

y_preds = inference(partial_dir/'dev.in', y_vocab, feature_dict, partial_dir/'dev.p4.out')
y_label = read_data(partial_dir/'dev.out', column=1)

y_label = [oo for o in y_label for oo in o+['O']]
y_preds = [oo for o in y_preds for oo in o+['O']]

prec, rec, f1 = evaluate(y_label, y_preds, verbose=False)
print(f'precision: {prec:.3f} \t rec: {rec:.3f} \t f1 {f1:.3f}')

precision: 54.321 	 rec: 55.932 	 f1 55.115


## Part 5 (i)

## Part 5 (ii)

## Part 5 (iii) Structured Perceptron

In [None]:
logsumexp = max