# Sentence compressor

### Project on natural language processing

##### Reading the dataset

In [1]:
import theano
import theano.tensor as T
import lasagne
import numpy as np

Using gpu device 0: GeForce GTX 960M (CNMeM is enabled with initial size: 75.0% of memory, cuDNN 5103)
  "downsample module has been moved to the theano.tensor.signal.pool module.")


In [2]:
import compression_data

In [3]:
# Load the sentence-compression corpus through the project-local compression_data helper.
# Later cells treat `data` as a sequence of phrases whose tokens expose
# `form`, `tag`, `stem` and `label` attributes.
data = compression_data.load_compression_data("compression-data.json.gz")

Function that converts an array of tokens to a string, either the full phrase or only its compressed form

In [4]:
def prase_array_to_string(prase, compressed = False):
    """Join the surface forms of a phrase's tokens into one space-separated string.

    prase      -- iterable of tokens exposing `form` (and `label` when compressing).
    compressed -- when truthy, keep only the tokens whose `label` equals 0;
                  otherwise keep every token.

    Returns the selected forms joined by single spaces ('' for an empty phrase).
    """
    if compressed:
        kept_forms = [token.form for token in prase if token.label == 0]
    else:
        kept_forms = [token.form for token in prase]
    return ' '.join(kept_forms)
# Sanity check: display only the label-0 tokens of the third phrase in the corpus.
prase_array_to_string(data[2], True)

u'Luis Suarez was spotted in London'

#### Reading the word2vec embeddings

In [5]:
import gensim
# Load the pretrained GoogleNews word2vec vectors from their binary file.
# Newer gensim releases expose this loader on KeyedVectors and no longer accept
# Word2Vec.load_word2vec_format; older releases only have the Word2Vec spelling,
# so pick whichever class the installed gensim provides.
_w2v_loader_owner = getattr(gensim.models, 'KeyedVectors', gensim.models.Word2Vec)
model = _w2v_loader_owner.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

Make the embedding part of every token

In [6]:
from collections import namedtuple
# Vector shared by every token whose form is missing from the word2vec model.
# NOTE(review): 300 presumably matches the width of the loaded vectors — confirm.
unk = np.zeros(300)
# Same fields as the corpus tokens, plus `emb` holding the word vector.
TokenWithEmb = namedtuple("TokenWithEmb", "form tag stem label emb")
# Rebuild every phrase with the embedding attached to each token.
data_with_emb = [
    [TokenWithEmb(token.form, token.tag, token.stem, token.label,
                  model[token.form] if token.form in model else unk)
     for token in sentence]
    for sentence in data
]
# Every token now carries its own vector, so drop the reference to the embedding model.
del model

#### Assign number to every tag

In [23]:
# Every tag occurrence in the corpus, in corpus order (duplicates included).
all_tags = [token.tag for phrase in data_with_emb for token in phrase]
# Map each distinct tag to an integer id for the tag EmbeddingLayer.
# Ids are assigned in sorted tag order: iterating a bare set gives an arbitrary
# order, which would make the tag -> id mapping (and therefore any saved weights)
# differ between runs.
# NOTE(review): assumes all tags are mutually comparable (e.g. all strings) — confirm.
tags_dict = {tag: i for i, tag in enumerate(sorted(set(all_tags)))}

##### Split the dataset into train, test and dev sets

In [24]:
# train_test_split lives in sklearn.model_selection in current scikit-learn; the
# sklearn.cross_validation module used originally was deprecated and later removed.
# Fall back to the old location so very old installs keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Hold out 500 phrases for test, then 1000 of the remainder for dev.
# random_state is fixed so the split is repeatable.
train_with_dev, test = train_test_split(data_with_emb, test_size = 500, random_state = 0)
train, dev = train_test_split(train_with_dev, test_size = 1000, random_state = 0)

In [25]:
# Show the number of phrases in each split (test, dev, train).
len(test), len(dev), len(train)

(500, 1000, 8500)

#### Function to iterate over phrases

This generator yields one phrase at a time as a tuple: the first field is the matrix of token embeddings, the second is the vector of tag ids, and the third is the vector of labels (the answers).

In [27]:
def iterate(dataset, shuffle = True):
    """Yield the phrases of `dataset` one at a time as numpy arrays.

    dataset -- indexable sequence of phrases; each phrase is a sequence of tokens
               exposing `emb`, `tag` and `label`.
    shuffle -- when true, visit phrases in a random order drawn from numpy's
               global RNG; otherwise visit them in index order.

    Each yielded item is a tuple (embeddings, tag_ids, labels):
      embeddings -- the tokens' `emb` vectors stacked row-wise with np.vstack
      tag_ids    -- the tokens' tags looked up in the module-level `tags_dict`
      labels     -- the tokens' `label` values

    List comprehensions are used instead of map(): on Python 3 map() returns an
    iterator, which np.vstack / np.array do not turn into the intended arrays.
    """
    order = np.random.permutation(len(dataset)) if shuffle else range(len(dataset))
    for i in order:
        phrase = dataset[i]
        yield (np.vstack([token.emb for token in phrase]),
               np.array([tags_dict[token.tag] for token in phrase]),
               np.array([token.label for token in phrase]))

#### Defining the network

In [39]:
# Symbolic Theano inputs shared by the network definition and the compiled functions.
X_emb = T.matrix('input','float32')  # float32 matrix fed to the word-embedding input layer
X_tag = T.ivector('tag')             # integer vector of tag ids fed to the tag input layer
Y = T.ivector('target')              # integer vector of target labels used by the loss
# Layer sizes used by network_architecture().
embeding_size = 300  # width of the word-embedding input
tag_emb_size = 200   # output size of the tag EmbeddingLayer
rnn_size = 500       # units in each LSTM layer (per direction)
hid_size = 500       # units in the hidden dense layer
def network_architecture():
    """Build the per-token tagging network and return its output layer.

    The network reads the module-level symbolic inputs X_emb and X_tag and the
    size constants embeding_size, tag_emb_size, rnn_size and hid_size.
    Pipeline: word vectors and learned tag embeddings are concatenated per token,
    passed through two stacked bidirectional LSTM blocks (dropout in between),
    then a hidden dense layer with dropout, and finally a 2-unit dense layer
    followed by softmax.
    """
    layers = lasagne.layers

    def bidirectional_lstm(incoming):
        # Forward LSTM is created first, then the backward one; their outputs
        # are joined along the feature axis.
        forward = layers.LSTMLayer(incoming, grad_clipping = 100, num_units = rnn_size)
        backward = layers.LSTMLayer(incoming, grad_clipping = 100, num_units = rnn_size, backwards=True)
        return layers.ConcatLayer([forward, backward], axis = 2)

    # Input for the word2vec embedding of each token
    word_input = layers.InputLayer(shape=(None, embeding_size), input_var = X_emb)

    # Input for the tag id of each token
    tag_input = layers.InputLayer(shape=(None, ), input_var = X_tag)

    # Learned embedding of the tag ids
    tag_embedding = layers.EmbeddingLayer(tag_input, input_size=len(tags_dict), output_size = tag_emb_size)

    # Per-token features: word vector followed by tag embedding
    token_features = layers.ConcatLayer([word_input, tag_embedding], axis = 1)

    # Add a leading axis of size 1 so the phrase is a single sequence for the LSTMs
    as_sequence = layers.ReshapeLayer(token_features, shape = (1, -1, embeding_size + tag_emb_size))

    # First bidirectional recurrent block, followed by dropout
    first_birnn = bidirectional_lstm(as_sequence)
    first_dropout = layers.DropoutLayer(first_birnn, p=0.2)

    # Second bidirectional recurrent block
    second_birnn = bidirectional_lstm(first_dropout)

    # Flatten back to one row per token for the dense layers
    per_token = layers.ReshapeLayer(second_birnn, shape = (-1, 2 * rnn_size))

    # Hidden dense layer with dropout
    hidden = layers.DenseLayer(per_token, num_units = hid_size)
    hidden_dropout = layers.DropoutLayer(hidden, p=0.1)

    # Linear 2-unit layer, reshaped to (-1, 2), then softmax for the final answer
    logits = layers.DenseLayer(hidden_dropout, num_units = 2, nonlinearity=lasagne.nonlinearities.identity)
    logits_2d = layers.ReshapeLayer(logits, shape = (-1, 2))
    return layers.NonlinearityLayer(logits_2d, nonlinearity=lasagne.nonlinearities.softmax)

network = network_architecture()

Defining the functions for training and testing the network. The network will be trained using the Adam method.

In [40]:
def compile_network(l_out, learnig_rate):
    """Compile the Theano functions used to train and query the network.

    l_out        -- output layer of the network.
    learnig_rate -- learning rate passed to the Adam update rule.

    Returns a tuple (train, compute_cost, probs):
      train        -- (X_emb, X_tag, Y) -> mean cross-entropy of the stochastic
                      output; applies the Adam updates as a side effect
      compute_cost -- (X_emb, X_tag, Y) -> mean cross-entropy of the
                      deterministic output, no updates
      probs        -- (X_emb, X_tag) -> deterministic network output
    All three are compiled with allow_input_downcast=True.
    """
    def mean_cross_entropy(predicted):
        # Mean categorical cross-entropy against the module-level target Y.
        return lasagne.objectives.categorical_crossentropy(predicted, Y).mean()

    trainable_params = lasagne.layers.get_all_params(l_out, trainable=True)
    stochastic_out = lasagne.layers.get_output(l_out)
    deterministic_out = lasagne.layers.get_output(l_out, deterministic = True)

    train_loss = mean_cross_entropy(stochastic_out)
    eval_loss = mean_cross_entropy(deterministic_out)

    adam_updates = lasagne.updates.adam(train_loss, trainable_params, learning_rate = learnig_rate)

    # training function
    train_fn = theano.function([X_emb, X_tag, Y], train_loss, updates=adam_updates, allow_input_downcast=True)

    # loss function
    cost_fn = theano.function([X_emb, X_tag, Y], eval_loss, allow_input_downcast=True)

    # prediction function
    probs_fn = theano.function([X_emb, X_tag], deterministic_out, allow_input_downcast=True)

    return train_fn, cost_fn, probs_fn

train_func, compute_cost_func, probs_func = compile_network(network, 1e-3)

#### Training the network

Training loop. Training runs for `n_epochs` epochs. In each epoch the network is trained on every phrase in the train set, and then its loss is computed on the dev set. The weights from the epoch with the best (lowest) dev loss are kept.

In [41]:
def trainig_cycle(train_func, compute_cost_func, net, n_epochs):
    """Train for `n_epochs` epochs and remember the weights of the best epoch.

    train_func        -- callable (emb, tags, labels) -> training loss for one
                         phrase; expected to update the network as a side effect.
    compute_cost_func -- callable (emb, tags, labels) -> loss for one phrase.
    net               -- network whose parameter values are snapshotted with
                         lasagne.layers.get_all_param_values.
    n_epochs          -- number of passes over the train set.

    Reads the module-level `train` and `dev` datasets through `iterate`, and
    seeds numpy's global RNG with 0 before training.

    Returns (train_epoch_loss, dev_epoch_loss, best_weight): the per-epoch mean
    train and dev losses, and the parameter values from the epoch with the
    lowest mean dev loss (None when n_epochs is 0).
    """
    print("Training ...")
    np.random.seed(0)

    best_score = float("inf")
    best_weight = None
    train_epoch_loss = []
    dev_epoch_loss = []
    for epoch in range(n_epochs):
        train_losses = []
        dev_losses = []

        # Loop variables are named differently from the module-level symbolic
        # inputs (X_emb, X_tag, Y) to avoid confusing the two.
        for emb, tags, labels in iterate(train):
            train_losses.append(train_func(emb, tags, labels))

        for emb, tags, labels in iterate(dev):
            dev_losses.append(compute_cost_func(emb, tags, labels))

        mean_train_loss = np.mean(train_losses)
        mean_dev_loss = np.mean(dev_losses)

        # Model selection must use the loss over the whole dev set; comparing the
        # loss of whichever dev phrase happened to come last picks an epoch
        # essentially at random.
        if mean_dev_loss < best_score:
            best_weight = lasagne.layers.get_all_param_values(net)
            best_score = mean_dev_loss

        print("Epoch %d: train loss - %f, dev loss - %f" % (epoch, mean_train_loss, mean_dev_loss))
        train_epoch_loss.append(mean_train_loss)
        dev_epoch_loss.append(mean_dev_loss)
    return train_epoch_loss, dev_epoch_loss, best_weight

# Train for 6 epochs, then load the weights that trainig_cycle returned as best
# back into the network so later cells evaluate that snapshot.
train_epoch_loss, dev_epoch_loss, best_weight = trainig_cycle(train_func, compute_cost_func, network, 6)
lasagne.layers.set_all_param_values(network, best_weight)

Training ...
Epoch 0: train loss - 0.495334, dev loss - 0.435520
Epoch 1: train loss - 0.394671, dev loss - 0.402078
Epoch 2: train loss - 0.346510, dev loss - 0.409115
Epoch 3: train loss - 0.288222, dev loss - 0.435973
Epoch 4: train loss - 0.231198, dev loss - 0.467889
Epoch 5: train loss - 0.202118, dev loss - 0.551481


#### Evaluation

Compute the accuracy and AUC for every phrase and report the averages. Also count the number of phrases whose predicted compression matches exactly.

In [42]:
from sklearn.metrics import roc_auc_score, accuracy_score

# Per-phrase metrics on the test set, averaged at the end.
aucs = []
accuracys = []
count_total_match = 0

# The loop variables are deliberately NOT named X_emb / X_tag / Y: at notebook top
# level that would overwrite the symbolic Theano variables of the same names, and
# re-running the network or compile cells afterwards would then fail.
for emb, tags, labels in iterate(test, False):
    probs = probs_func(emb, tags)
    # A token is predicted as class 1 when its class-1 probability exceeds 0.5.
    predictions = (probs[:,1] > 0.5).astype(int)
    # ROC AUC is undefined when all labels of a phrase belong to one class
    # (roc_auc_score raises in that case), so such phrases are left out of the
    # AUC average only.
    if len(np.unique(labels)) > 1:
        aucs.append(roc_auc_score(labels, probs[:, 1]))
    accuracys.append(accuracy_score(labels, predictions))
    count_total_match += np.array_equal(labels, predictions)

print("Avearage auc %s, avearage accuracy %s" % (np.mean(aucs), np.mean(accuracys)))
print("Number of phrases that match %s/%s" % (count_total_match, len(test)))

Avearage auc 0.900689427125, avearage accuracy 0.834544672925
Number of phrases that match 103/500


Print the phrase along with its true compression and the predicted compression

In [45]:
from itertools import compress
def test_phrase(number):
    """Print test phrase `number` with its true and predicted compressions.

    The phrase is taken from the module-level `test` set as a one-element slice,
    so an index past the end of the set prints nothing instead of raising.
    Tokens whose predicted class is 0 form the predicted compression.
    """
    selected = test[number:(number+1)]
    for emb, tags, _ in iterate(selected, False):
        class_probs = probs_func(emb, tags)
        predicted = (class_probs[:,1] > 0.5).astype(int)
        full_text = prase_array_to_string(test[number])
        true_compression = prase_array_to_string(test[number], True)
        # compress() keeps the tokens whose selector (1 - prediction) is non-zero.
        kept_tokens = compress(test[number], 1 - predicted)
        predicted_compression = ' '.join(token.form for token in kept_tokens)
        print("Phrase: '%s'" % full_text)
        print("True compression: '%s'" % true_compression)
        print("Compression predicted: '%s'" % predicted_compression)

test_phrase(77)

Phrase: 'Montreal-based international flight training specialist CAE Inc. said Wednesday it has won $ 140 million in new military contracts including options'
True compression: 'CAE Inc. has won $ 140 million in military contracts'
Compression predicted: 'CAE Inc. has won $ 140 million in military contracts'
