In [1]:
from collections import Counter
import numpy as np
import random
from random import shuffle
from sklearn.metrics import f1_score
import sys
from itertools  import product


In [2]:
def load_dataset_sents(file_path, as_zip=True, to_idx=False, token_vocab=None, target_vocab=None): 
    '''
    Function to load data from train/test files
    '''
    targets = []
    inputs = []
    zip_inps = []
    with open(file_path) as f:
        for line in f:
            sent, tags = line.split("\t")
            words = [token_vocab[w.strip()] if to_idx else w.strip() for w in sent.split()]
            ner_tags = [target_vocab[w.strip()] if to_idx else w.strip() for w in tags.split()] 
            inputs.append(words)
            targets.append(ner_tags)
            zip_inps.append(list(zip(words, ner_tags)))
    return zip_inps if as_zip else (inputs, targets)

In [3]:

def cw_cl_counter(corpus, min_counts = 3):
    '''
    Current word-current label dictionary
    It doesn't include features with a less tham min_counts frequency, default: 3
    '''
    cw_cl = []
    for s in corpus:
        cw_cl += [word+"_"+label for word,label in s]
    counter = Counter(cw_cl)
    return {k:v for k,v in counter.items() if v >= min_counts}

In [4]:

def phi_1(x, y, cw_cl_counts):
    '''
    Given a sentence x and a sequence y, returns the counts for each
    current word-current label feature in the sentence that is also in the corpus (cw_cl_counts)
    '''
    features = []
    for i in range(len(x)):
        f = x[i]+"_"+y[i]
        if f in cw_cl_counts:
            features.append(x[i]+"_"+y[i])
    return Counter(features)


In [6]:
def phi(x,y,phi_counts):
    '''
    Given a sentence x, a sequence y, and a dictionary phi_counts ({phi_functions:counter_functions})
    returns a combined dictionary of features
    '''
    features = {}
    for phi_func,counts in phi_counts.items():
        for feature, count in phi_func(x,y,counts).items():
            features[feature] = count
        
    return features


In [8]:

def perceptron(corpus,w,c,phi_counts):
    '''
    Given a copus, a w (dict), a number of updates c (int) and a phi_counts ({phi_functions:counter_functions})
    performs an iteration of the perceptron
    '''
    for k,s in enumerate(corpus):
        # initialize variables
        x = [i[0] for i in s]
        y_target = [i[1] for i in s]
        y_pred = predict(x,w,phi_counts)
        if y_pred != y_target:
            # w = w + phi(x,y ) - phi(x ,y_pred)
            phi_diff = Counter(phi(x,y_target,phi_counts))
            phi_diff.subtract(phi(x,y_pred,phi_counts))
            
            for feature,count in phi_diff.items():
                if feature in w:
                    w[feature] += count
                else:
                    w[feature] = count
        c += 1
    return w,c


In [9]:
def train(corpus, phi_counts, num_iter = 10, averaging = True):
    '''
    Perform the perceptron training algorithm given a corpus, a phi_counts dictionary ({phi_functions:counter_functions}),
    a number of iterations num_iter (int) with/without averaging (boolean)
    '''
    c = 0
    w_c = {}
    w_avg = {}
    for iteration in range(num_iter):
        print("Iteration {}/{}".format(iteration+1,num_iter))
        # randomize
        shuffle(corpus)
        w_c,c = perceptron(corpus,w_c,c,phi_counts)
        if averaging:
            for feature, v in w_c.items():
                if feature in w_avg:
                    w_avg[feature] += v
                else:
                    w_avg[feature] = v
    if averaging:
        for f,v in w_avg.items():
            w_avg[f] = v/c
        return w_avg
    else:
        return w_c

In [10]:
def predict(x,w,phi_counts):
    '''
    Predicts a sequence y_pred given a sentence x, weights w (dict),
    and phi_counts dictionary ({phi_functions:counter_functions})
    '''
    max_score = -1e10
    y_pred = []
    # all possible sequences
    all_possible_y = product(labels, repeat = len(x))
    for y in all_possible_y:
        score = 0
        # if len(y) == len(x): This should always be true
        phi_xy = phi(x,y,phi_counts)
        # for each feature that is in phi and w (features>0)
        for feature in set(phi_xy).intersection(set(w)):
            score += phi_xy[feature]*w[feature]
        if score > max_score:
            max_score = score
            y_pred = y
    return y_pred


In [11]:
corpus = load_dataset_sents("train.txt")
test_corpus = load_dataset_sents("test.txt")

In [12]:
cw_cl_counts = cw_cl_counter(corpus)

In [18]:
labels = ['O', 'ORG', 'MISC', 'PER', 'LOC']

In [19]:
print("Perceptron with 5 iterations and averaging")
# phi_1
print("Phi1")
phi_counts = {phi_1:cw_cl_counts}
w_phi1 = train(corpus,phi_counts, num_iter = 5)


Perceptron with 5 iterations and averaging
Phi1
Iteration 1/5
Iteration 2/5
Iteration 3/5
Iteration 4/5
Iteration 5/5


In [15]:
def test(w,corpus,phi_counts):
    '''
    Predicts a sequence for each sentence in the corpus and computes the f1score with
    the correct and predicted flatten sequences
    '''
    predicted, correct = [], []
    for s in corpus:
        x = [i[0] for i in s]
        correct += [i[1] for i in s]
        predicted += predict(x,w,phi_counts)
    
    f1_micro = f1_score(correct, predicted, average = "micro",
                        labels = ['ORG', 'MISC', 'PER', 'LOC'])

    return f1_micro

In [20]:
print("Testing...")
# print("F1 Score: train",round(test(w_phi1,corpus,phi_counts),3))
print("F1 Score: test",round(test(w_phi1,test_corpus,phi_counts),3))
print("Top 10 most positively-weighted features:")
print()

Testing...
F1 Score: test 0.628
Top 10 most positively-weighted features:

