In [1]:
from __future__ import unicode_literals
import os
import nltk
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout
import numpy as np
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
np.random.seed(1337)  # For Reproducibility
import multiprocessing

Using Theano backend.


In [2]:
# reload necessary data set to pandas DataFrame.
train_df = pd.read_csv('dumps/stemmered_train_mrd.csv', encoding='utf8')
test_df = pd.read_csv('dumps/stemmered_test_mrd.csv', encoding='utf8')

In [3]:
print "Storing terms from training documents as list of lists"
terms_by_doc_train = [document.rstrip(']"').lstrip('"[').split(", ") for document in train_df.ix[:,0]]
terms_by_label_train = train_df.ix[:, 1]
n_terms_per_doc = [len(terms) for terms in terms_by_doc_train]
print "min, max and average number of terms per document:", min(n_terms_per_doc), max(n_terms_per_doc), sum(n_terms_per_doc)/len(n_terms_per_doc)
# print terms_by_doc_train[0][8]
print "Storing terms from test documents as list of lists"
terms_by_doc_test = [document.rstrip(']"').lstrip('"[').split(", ") for document in test_df.ix[:,0]]
terms_by_label_test = test_df.ix[:, 1]
n_terms_per_doc = [len(terms) for terms in terms_by_doc_test]
print "min, max and average number of terms per document:", min(n_terms_per_doc), max(n_terms_per_doc), sum(n_terms_per_doc)/len(n_terms_per_doc)
# print terms_by_doc_test[0][0]
# Store all terms in list
all_terms = [terms for sublist in terms_by_doc_train for terms in sublist]
# Compute average number of terms
avg_len = sum(n_terms_per_doc)/len(n_terms_per_doc)
print "the average number of terms:", avg_len
# Find unique terms
all_unique_terms = list(set(all_terms))
print "the number of unique terms:", len(all_unique_terms)
print "How many sentences in traing set:" , len(terms_by_doc_train)
print "How many sentences in test set:", len(terms_by_doc_test)


Storing terms from training documents as list of lists
min, max and average number of terms per document: 5 39 11
Storing terms from test documents as list of lists
min, max and average number of terms per document: 5 32 11
the average number of terms: 11
the number of unique terms: 11403
How many sentences in traing set: 5794
How many sentences in test set: 3873


In [33]:
# set parameters:
vocab_dim = 20
maxlen = 100
n_iterations = 1  # ideally more..
# Words that appear only once or twice in a billion-word corpus are probably uninteresting typos and garbage. 
n_exposures = 3
window_size = 5
batch_size = 32
n_epoch = 2
input_length = 100
cpu_count = multiprocessing.cpu_count()

In [34]:
combine_train_test_X = terms_by_doc_train + terms_by_doc_train


In [35]:
print('Training a Word2vec model...')
model = Word2Vec(size=vocab_dim,
                 min_count=n_exposures,
                 window=window_size,
                 workers=cpu_count,
                 iter=n_iterations)
model.build_vocab(combine_train_test_X)
model.train(combine_train_test_X)
# print len(model.vocab.viewkeys()) -> 11403

117475

Training a Word2vec model...


In [36]:
gensim_dict = Dictionary()
gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
# gensim_dict.items() returns [(0, u"'surpris"), (1, u'woodi'), (2, u'yellow'),...]
# K+1 aims at avoiding 0 as index.
w2indx = {v: k+1 for k, v in gensim_dict.items()}
w2vec = {word: model[word] for word in w2indx.keys()}
# print len(model["surpris"]) -> 100
print('Setting up Arrays for Keras Embedding Layer...')
n_symbols = len(w2indx) + 1  # adding 1 to account for 0th index
embedding_weights = np.zeros((n_symbols + 1, vocab_dim))
for word, index in w2indx.items():
    embedding_weights[index, :] = w2vec[word]
# print embedding_weights.shape -> (11405, 100)

Setting up Arrays for Keras Embedding Layer...


In [37]:
X_train = []
for doc in terms_by_doc_train:
    new_txt = []
    for word in doc:
        try:
            new_txt.append(w2indx[word])
        except:
            new_txt.append(0)
    X_train.append(new_txt)
X_test = []
for doc in terms_by_doc_test:
    new_txt = []
    for word in doc:
        try:
            new_txt.append(w2indx[word])
        except:
            new_txt.append(0)
    X_test.append(new_txt)

print("Pad sequences (samples x time)")
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
y_train = np.array(terms_by_label_train)
y_test = np.array(terms_by_label_test)
print('X_train shape:',  y_train.shape)
print('X_test shape:', y_test.shape)


Pad sequences (samples x time)
(u'X_train shape:', (5794, 100))
(u'X_test shape:', (3873, 100))
(u'X_train shape:', (5794,))
(u'X_test shape:', (3873,))


In [38]:
print('Defining a Simple Keras Model...')
model = Sequential()  # or Graph or whatever
model.add(Embedding(input_dim=n_symbols + 1,
                    output_dim=vocab_dim,
                    mask_zero=True,
                    weights=[embedding_weights],
                    input_length=input_length))  # Adding Input Length
model.add(LSTM(vocab_dim))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
#model.add(Dense(1, activation='relu'))

print('Compiling the Model...')
model.compile(loss='binary_crossentropy', optimizer='adam', class_mode='binary', metrics=['accuracy'])

print("Train...")
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=n_epoch,
          validation_data=(X_test, y_test))

print("Evaluate...")
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

(u'Test score:', 0.55627031720118592)
(u'Test accuracy:', 0.72579395817195969)
































































































 864/3873 [=====>........................] - ETA: 0s

 800/3873 [=====>........................] - ETA: 0s

 736/3873 [====>.........................] - ETA: 0s

 672/3873 [====>.........................] - ETA: 0s

 608/3873 [===>..........................] - ETA: 0s

 544/3873 [===>..........................] - ETA: 0s

 480/3873 [==>...........................] - ETA: 0s

 416/3873 [==>...........................] - ETA: 0s

 352/3873 [=>............................] - ETA: 0s

 288/3873 [=>............................] - ETA: 0s

 224/3873 [>.............................] - ETA: 0s

 160/3873 [>.............................] - ETA: 0s

  96/3873 [..............................] - ETA: 0s


Evaluate...
  32/3873 [..............................] - ETA: 0s

























































































































































































































































































1344/5794 [=====>........................] - ETA: 3s - loss: 0.5410 - acc: 0.7865

1312/5794 [=====>........................] - ETA: 3s - loss: 0.5445 - acc: 0.7851

1280/5794 [=====>........................] - ETA: 3s - loss: 0.5450 - acc: 0.7852

1248/5794 [=====>........................] - ETA: 3s - loss: 0.5425 - acc: 0.7877

1216/5794 [=====>........................] - ETA: 3s - loss: 0.5417 - acc: 0.7870

1184/5794 [=====>........................] - ETA: 3s - loss: 0.5432 - acc: 0.7855

1152/5794 [====>.........................] - ETA: 3s - loss: 0.5434 - acc: 0.7882

1120/5794 [====>.........................] - ETA: 3s - loss: 0.5451 - acc: 0.7875

1088/5794 [====>.........................] - ETA: 3s - loss: 0.5455 - acc: 0.7849

1056/5794 [====>.........................] - ETA: 3s - loss: 0.5468 - acc: 0.7869

1024/5794 [====>.........................] - ETA: 3s - loss: 0.5451 - acc: 0.7881

 - ETA: 3s - loss: 0.5483 - acc: 0.7853

 992/5794 [====>.........................]

 960/5794 [===>..........................] - ETA: 3s - loss: 0.5463 - acc: 0.7854

 928/5794 [===>..........................] - ETA: 3s - loss: 0.5439 - acc: 0.7866

 896/5794 [===>..........................] - ETA: 3s - loss: 0.5392 - acc: 0.7913

 864/5794 [===>..........................] - ETA: 3s - loss: 0.5379 - acc: 0.7928

 832/5794 [===>..........................] - ETA: 3s - loss: 0.5396 - acc: 0.7909

 800/5794 [===>..........................] - ETA: 4s - loss: 0.5439 - acc: 0.7863

 768/5794 [==>...........................] - ETA: 4s - loss: 0.5412 - acc: 0.7865

 736/5794 [==>...........................] - ETA: 4s - loss: 0.5422 - acc: 0.7826

 704/5794 [==>...........................] - ETA: 4s - loss: 0.5446 - acc: 0.7798

 672/5794 [==>...........................] - ETA: 4s - loss: 0.5424 - acc: 0.7812

 640/5794 [==>...........................] - ETA: 4s - loss: 0.5415 - acc: 0.7812

 608/5794 [==>...........................] - ETA: 4s - loss: 0.5436 - acc: 0.7796

 576/5794 [=>............................] - ETA: 4s - loss: 0.5381 - acc: 0.7882

 544/5794 [=>............................] - ETA: 4s - loss: 0.5401 - acc: 0.7831

 512/5794 [=>............................] - ETA: 4s - loss: 0.5420 - acc: 0.7812

 480/5794 [=>............................] - ETA: 4s - loss: 0.5475 - acc: 0.7750

 448/5794 [=>............................] - ETA: 4s - loss: 0.5468 - acc: 0.7768

 416/5794 [=>............................] - ETA: 4s - loss: 0.5516 - acc: 0.7692

 384/5794 [>.............................] - ETA: 4s - loss: 0.5447 - acc: 0.7760

 352/5794 [>.............................] - ETA: 4s - loss: 0.5480 - acc: 0.7784

 320/5794 [>.............................] - ETA: 4s - loss: 0.5469 - acc: 0.7781

 288/5794 [>.............................] - ETA: 4s - loss: 0.5531 - acc: 0.7743

 256/5794 [>.............................] - ETA: 4s - loss: 0.5538 - acc: 0.7656

 224/5794 [>.............................] - ETA: 4s - loss: 0.5576 - acc: 0.7634

 192/5794 [..............................] - ETA: 4s - loss: 0.5564 - acc: 0.7552

 160/5794 [..............................] - ETA: 4s - loss: 0.5573 - acc: 0.7625

 128/5794 [..............................] - ETA: 4s - loss: 0.5442 - acc: 0.7891

  96/5794 [..............................] - ETA: 4s - loss: 0.5655 - acc: 0.7708

  64/5794 [..............................] - ETA: 4s - loss: 0.5954 - acc: 0.7344


Epoch 2/2
  32/5794 [..............................] - ETA: 4s - loss: 0.5842 - acc: 0.7188

























































































































































































































































































1344/5794 [=====>........................] - ETA: 3s - loss: 0.6926 - acc: 0.5216

1312/5794 [=====>........................] - ETA: 3s - loss: 0.6925 - acc: 0.5259

1280/5794 [=====>........................] - ETA: 3s - loss: 0.6923 - acc: 0.5266

1248/5794 [=====>........................] - ETA: 4s - loss: 0.6922 - acc: 0.5264

1216/5794 [=====>........................] - ETA: 4s - loss: 0.6926 - acc: 0.5238

1184/5794 [=====>........................] - ETA: 4s - loss: 0.6927 - acc: 0.5228

1152/5794 [====>.........................] - ETA: 4s - loss: 0.6927 - acc: 0.5191

1120/5794 [====>.........................] - ETA: 4s - loss: 0.6929 - acc: 0.5161

1088/5794 [====>.........................] - ETA: 4s - loss: 0.6928 - acc: 0.5193

1056/5794 [====>.........................] - ETA: 4s - loss: 0.6927 - acc: 0.5170

1024/5794 [====>.........................] - ETA: 4s - loss: 0.6927 - acc: 0.5176

 992/5794 [====>.........................] - ETA: 4s - loss: 0.6927 - acc: 0.5171

 960/5794 [===>..........................] - ETA: 4s - loss: 0.6931 - acc: 0.5104

 928/5794 [===>..........................] - ETA: 4s - loss: 0.6934 - acc: 0.5022

 896/5794 [===>..........................] - ETA: 4s - loss: 0.6933 - acc: 0.5067

 864/5794 [===>..........................] - ETA: 4s - loss: 0.6939 - acc: 0.5012

 832/5794 [===>..........................] - ETA: 4s - loss: 0.6942 - acc: 0.4952

 800/5794 [===>..........................] - ETA: 4s - loss: 0.6944 - acc: 0.4938

 768/5794 [==>...........................] - ETA: 4s - loss: 0.6942 - acc: 0.4935

 736/5794 [==>...........................] - ETA: 4s - loss: 0.6942 - acc: 0.4946

 704/5794 [==>...........................] - ETA: 4s - loss: 0.6939 - acc: 0.4972

 672/5794 [==>...........................] - ETA: 4s - loss: 0.6940 - acc: 0.4985

 640/5794 [==>...........................] - ETA: 4s - loss: 0.6936 - acc: 0.5016

 608/5794 [==>...........................] - ETA: 4s - loss: 0.6936 - acc: 0.4984

 576/5794 [=>............................] - ETA: 4s - loss: 0.6937 - acc: 0.4983

 544/5794 [=>............................] - ETA: 4s - loss: 0.6937 - acc: 0.4982

 512/5794 [=>............................] - ETA: 5s - loss: 0.6935 - acc: 0.5059

 480/5794 [=>............................] - ETA: 5s - loss: 0.6933 - acc: 0.5125

 448/5794 [=>............................] - ETA: 5s - loss: 0.6937 - acc: 0.5067

 416/5794 [=>............................] - ETA: 5s - loss: 0.6938 - acc: 0.5024

 384/5794 [>.............................] - ETA: 5s - loss: 0.6944 - acc: 0.4896

 352/5794 [>.............................] - ETA: 5s - loss: 0.6942 - acc: 0.4886

 320/5794 [>.............................] - ETA: 5s - loss: 0.6946 - acc: 0.4906

 288/5794 [>.............................] - ETA: 4s - loss: 0.6948 - acc: 0.4965

 256/5794 [>.............................] - ETA: 4s - loss: 0.6955 - acc: 0.4805

 224/5794 [>.............................] - ETA: 4s - loss: 0.6963 - acc: 0.4732

 192/5794 [..............................] - ETA: 5s - loss: 0.6967 - acc: 0.4583

 160/5794 [..............................] - ETA: 5s - loss: 0.6974 - acc: 0.4750

 128/5794 [..............................] - ETA: 5s - loss: 0.6954 - acc: 0.4922

  96/5794 [..............................] - ETA: 5s - loss: 0.6951 - acc: 0.5208

  64/5794 [..............................] - ETA: 5s - loss: 0.6969 - acc: 0.5156

Train on 5794 samples, validate on 3873 samples
Epoch 1/2
  32/5794 [..............................] - ETA: 5s - loss: 0.6983 - acc: 0.4688

Defining a Simple Keras Model...
Compiling the Model...
Train...
