# Sentence compressor

### Project on natural language processing

##### Reading the dataset

In [1]:
import theano
import theano.tensor as T
import lasagne
import numpy as np

Using gpu device 0: GeForce GTX 960M (CNMeM is enabled with initial size: 75.0% of memory, cuDNN 5103)
  "downsample module has been moved to the theano.tensor.signal.pool module.")


In [2]:
import compression_data

In [3]:
# Load the sentence-compression corpus through the project-local helper module.
# NOTE(review): judging by later cells, `data` is a list of phrases whose tokens
# expose .form, .tag, .stem and .label — confirm against compression_data.
data = compression_data.load_compression_data("compression-data.json.gz")

In [4]:
def prase_array_to_string(prase, compressed = False):
    """Join the word forms of a phrase into one space-separated string.

    prase      -- iterable of tokens exposing `.form` and `.label`.
    compressed -- when true, only tokens whose `label` equals 0 are kept;
                  otherwise every token is included.

    Returns the selected forms joined by single spaces.
    """
    kept_forms = []
    for token in prase:
        if not compressed or token.label == 0:
            kept_forms.append(token.form)
    return ' '.join(kept_forms)
# Quick look at one sample phrase reduced to the tokens labelled 0.
prase_array_to_string(data[2], True)

u'Luis Suarez was spotted in London'

#### Reading the word2vec embeddings

In [5]:
import gensim
# Load pre-trained vectors stored in the binary word2vec format.
# NOTE(review): the file name suggests 300-dimensional Google News vectors, which
# would match the 300-wide zero vector used for unknown words later — confirm.
model = gensim.models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [6]:
from collections import namedtuple

# A token carrying its original fields plus the embedding vector looked up for its form.
TokenWithEmb = namedtuple("TokenWithEmb", "form tag stem label emb")

# Every word missing from the word2vec vocabulary shares this single all-zero vector.
unk = np.zeros(300)

data_with_emb = []
for phrase in data:
    new_phrase = []
    for word in phrase:
        if word.form in model:
            embedding = model[word.form]
        else:
            embedding = unk
        new_phrase.append(TokenWithEmb(word.form, word.tag, word.stem, word.label, embedding))
    data_with_emb.append(new_phrase)

# Every token now holds its own vector, so the model object is dropped
# (presumably to free memory). Re-running this cell requires reloading the model first.
del model

##### Split the dataset into train, test and dev sets

In [7]:
from sklearn.cross_validation import train_test_split
# Hold out 2000 phrases for testing, then a further 2000 of the remainder for
# development; the fixed random_state makes both splits repeatable.
# NOTE(review): sklearn.cross_validation was superseded by sklearn.model_selection
# in newer scikit-learn releases — update the import if the environment is upgraded.
train_with_dev, test = train_test_split(data_with_emb, test_size = 2000, random_state = 0)
train, dev = train_test_split(train_with_dev, test_size = 2000, random_state = 0)

In [8]:
# Sanity-check the sizes of the three splits.
len(test), len(dev), len(train)

(2000, 2000, 6000)

#### Function to iterate over phrases

In [14]:
def iterate (dataset, shuffle = True):
    """Yield one (embeddings, labels) pair per phrase of `dataset`.

    dataset -- indexable collection of phrases; each phrase is a sequence of
               tokens exposing `.emb` (a vector) and `.label`.
    shuffle -- when true (default) phrases are visited in a fresh random
               permutation drawn from numpy's global RNG; otherwise in order.

    Yields a tuple of:
      * the token embeddings stacked vertically, one row per token;
      * a 1-D array with the corresponding token labels.
    """
    if shuffle:
        order = np.random.permutation(len(dataset))
    else:
        order = range(len(dataset))

    for i in order:
        phrase = dataset[i]
        # Real lists (not `map` objects) are handed to numpy so the result is
        # the same whether `map` returns a list or a lazy iterator.
        embeddings = np.vstack([token.emb for token in phrase])
        labels = np.array([token.label for token in phrase])
        yield embeddings, labels

#### Defining the network

In [20]:
# Symbolic inputs shared by the network definition and the compiled Theano functions.
X = T.matrix('input','float32')  # one row per token, one column per embedding dimension
Y = T.ivector('target')          # one integer label per token
embeding_size = 300
rnn_size = 500

def network_architecture():
    """Build the per-token tagger and return its output layer.

    The stack is: input -> reshape to a batch holding one sequence ->
    bidirectional GRU -> dropout -> bidirectional GRU -> reshape to one row
    per token -> dense layer with 2 linear units -> softmax.
    """
    layers = lasagne.layers

    def bidirectional_gru(incoming):
        # The forward layer is created before the backward one, then both
        # outputs are joined along the feature axis.
        forward = layers.GRULayer(incoming, grad_clipping = 100, num_units = rnn_size)
        backward = layers.GRULayer(incoming, grad_clipping = 100, num_units = rnn_size, backwards=True)
        return layers.ConcatLayer([forward, backward], axis = 2)

    l_in = layers.InputLayer(shape=(None, embeding_size), input_var = X)

    # A phrase arrives as (tokens, embedding); give it a leading batch axis of size 1.
    l_seq = layers.ReshapeLayer(l_in, shape = (1, -1, embeding_size))

    l_first = bidirectional_gru(l_seq)
    l_second = bidirectional_gru(layers.DropoutLayer(l_first))

    # Back to one row per token, each holding both directions' hidden units.
    l_flat = layers.ReshapeLayer(l_second, shape = (-1, 2 * rnn_size))

    l_scores = layers.DenseLayer(l_flat, num_units = 2, nonlinearity=lasagne.nonlinearities.identity)
    l_scores = layers.ReshapeLayer(l_scores, shape = (-1, 2))
    return layers.NonlinearityLayer(l_scores, nonlinearity=lasagne.nonlinearities.softmax)

network = network_architecture()

In [21]:
def compile_network(l_out, learnig_rate):
    """Compile the Theano functions for training, scoring and prediction.

    l_out        -- output layer of the network.
    learnig_rate -- learning rate handed to the Adam update rule.

    Returns a tuple (train, compute_cost, probs):
      * train(X, Y)        -- applies one Adam update, returns the training loss;
      * compute_cost(X, Y) -- returns the loss of the deterministic network;
      * probs(X)           -- returns the deterministic network output.
    """
    # Default (non-deterministic) output: this is the graph the updates are derived from.
    train_output = lasagne.layers.get_output(l_out)
    train_loss = lasagne.objectives.categorical_crossentropy(train_output, Y).mean()

    # Deterministic output, used for scoring and prediction.
    eval_output = lasagne.layers.get_output(l_out, deterministic = True)
    eval_loss = lasagne.objectives.categorical_crossentropy(eval_output, Y).mean()

    params = lasagne.layers.get_all_params(l_out, trainable=True)
    adam_updates = lasagne.updates.adam(train_loss, params, learning_rate = learnig_rate)

    # Training function
    train_fn = theano.function([X, Y], train_loss, updates=adam_updates, allow_input_downcast=True)

    # Loss function
    cost_fn = theano.function([X, Y], eval_loss, allow_input_downcast=True)

    # Prediction function
    probs_fn = theano.function([X], eval_output, allow_input_downcast=True)

    return train_fn, cost_fn, probs_fn

train_func, compute_cost_func, probs_func = compile_network(network, 1e-3)

#### Training the network

In [22]:
def trainig_cycle(train_func, compute_cost_func, net, n_epochs):
    """Train for `n_epochs` passes and remember the weights with the best dev loss.

    train_func        -- callable(inputs, targets) performing one update and
                         returning the training loss for that phrase.
    compute_cost_func -- callable(inputs, targets) returning the loss without updating.
    net               -- network whose parameter values are snapshotted.
    n_epochs          -- number of passes over the module-level `train` set.

    Reads the module-level `train` and `dev` datasets through `iterate` and
    reseeds numpy's global RNG with 0 before training.

    Returns (train_epoch_loss, dev_epoch_loss, best_weight): the per-epoch mean
    training and dev losses, and the parameter values captured at the epoch
    with the lowest mean dev loss (None if no epoch produced a comparable loss).
    """
    print("Training ...")
    np.random.seed(0)

    best_score = float("inf")
    best_weight = None
    train_epoch_loss = []
    dev_epoch_loss = []
    for epoch in range(n_epochs):
        train_losses = [train_func(inputs, targets) for inputs, targets in iterate(train)]
        dev_losses = [compute_cost_func(inputs, targets) for inputs, targets in iterate(dev)]

        train_mean = np.mean(train_losses)
        dev_mean = np.mean(dev_losses)

        # Select on the epoch's mean dev loss. Comparing the loss of whichever
        # phrase happened to come last in the shuffled dev pass would make the
        # choice of "best" weights essentially random.
        if dev_mean < best_score:
            best_weight = lasagne.layers.get_all_param_values(net)
            best_score = dev_mean

        print("Epoch %d: train loss - %f, dev loss - %f" % (epoch, train_mean, dev_mean))
        train_epoch_loss.append(train_mean)
        dev_epoch_loss.append(dev_mean)
    return train_epoch_loss, dev_epoch_loss, best_weight

# Train for 2 epochs, keeping the per-epoch losses and the weight snapshot
# returned by trainig_cycle.
# NOTE(review): best_weight is not loaded back into `network` in the cells shown
# here, so the evaluation below runs with the final-epoch weights — confirm intent.
train_epoch_loss, dev_epoch_loss, best_weight = trainig_cycle(train_func, compute_cost_func, network, 2)

Training ...
Epoch 0: train loss - 0.558829, dev loss - 0.513675
Epoch 1: train loss - 0.459067, dev loss - 0.446848


#### Evaluation

In [33]:
# accuracy_score is imported here: the cell uses it, and without the import it
# only ran when the name was left over from an earlier kernel state.
from sklearn.metrics import accuracy_score, auc, roc_auc_score

aucs = []
accuracys = []

# The loop variables are deliberately not called X / Y: at top level that would
# rebind the symbolic Theano variables and break any later re-run of compile_network.
# NOTE(review): roc_auc_score raises ValueError for a phrase whose labels are all
# the same class; the recorded run did not hit this, but other data might.
for embeddings, labels in iterate(test, False):
    probs = probs_func(embeddings)
    # Column 1 is the probability of label 1; threshold it at 0.5.
    predictions = (probs[:,1] > 0.5).astype(int)
    aucs.append(roc_auc_score(labels, probs[:, 1]))
    accuracys.append(accuracy_score(labels, predictions))

# Both metrics are computed per phrase and then averaged over phrases.
print("Avearage auc %s, avearage accuracy %s" % (np.mean(aucs), np.mean(accuracys)))

Avearage auc 0.86647055051, avearage accuracy 0.790368424812


In [39]:
from itertools import compress
def test_phrase(number):
    """Print a test phrase, its reference compression and the predicted one.

    number -- index into the module-level `test` set. The phrase is fetched
              through a one-element slice, so an index that selects nothing
              prints nothing.
    """
    for embeddings, _ in iterate(test[number:(number+1)], False):
        predicted_labels = (probs_func(embeddings)[:,1] > 0.5).astype(int)
        # Tokens predicted as label 0 are the ones kept in the compression.
        keep_mask = 1 - predicted_labels
        full_text = prase_array_to_string(test[number])
        reference_text = prase_array_to_string(test[number], True)
        predicted_text = ' '.join(token.form for token in compress(test[number], keep_mask))
        print("Phrase: '%s'" % full_text)
        print("True compression: '%s'" % reference_text)
        print("Compression predicted: '%s'" % predicted_text)

test_phrase(18)

Phrase: 'Abraham Lincoln captured its second PSAL football championship in three seasons with a 28 - 27 victory over Tottenville on Tuesday at Yankee Stadium'
True compression: 'Abraham Lincoln captured its PSAL football championship'
Compression predicted: 'Abraham Lincoln captured its second PSAL football championship'
