In [1]:
import pickle
import numpy as np

from keras import backend
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.constraints import unitnorm
from keras.regularizers import l2
from keras.initializers import random_uniform
from keras.callbacks import TensorBoard


from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


Load train, validation and test data

In [2]:
def get_idx_from_sent(sent, word_idx_map, max_l=51, kernel_size=5):
    """
    Transforms sentence into a fixed-length list of word indices.

    Layout of the result: ``kernel_size - 1`` leading zeros, then the index of
    every whitespace-separated token of ``sent`` that is a key of
    ``word_idx_map`` (unknown tokens are skipped), then zeros on the right up
    to a total of ``max_l + 2 * (kernel_size - 1)`` entries.

    Sentences with so many known words that they would exceed that total are
    truncated to it, so every call returns a list of the same length and the
    rows can be stacked into a rectangular array.

    Parameters
    ----------
    sent : str
        Sentence; tokens are obtained with ``str.split()``.
    word_idx_map : mapping
        Maps a token to its integer index.
    max_l : int
        Nominal maximum number of words per sentence.
    kernel_size : int
        Convolution kernel height; determines the zero padding.

    Returns
    -------
    list of int
        Exactly ``max_l + 2 * (kernel_size - 1)`` indices.
    """
    pad = kernel_size - 1
    target_len = max_l + 2 * pad

    x = [0] * pad
    x.extend(word_idx_map[word] for word in sent.split() if word in word_idx_map)

    # Over-long sentences would otherwise yield a longer row than the others.
    if len(x) > target_len:
        del x[target_len:]

    # Right-pad with zeros up to the fixed width.
    x.extend([0] * (target_len - len(x)))
    return x

def make_idx_data(revs, word_idx_map, max_l=51, kernel_size=5):
    """
    Transforms sentences into 2-d index matrices, one per data split.

    Each row is the padded index list produced by ``get_idx_from_sent`` for
    ``rev['text']`` with the label ``rev['y']`` appended as the last column.

    Parameters
    ----------
    revs : iterable of dict
        Records with the keys ``'text'``, ``'y'`` and ``'split'``.
    word_idx_map : mapping
        Maps a token to its integer index.
    max_l : int
        Forwarded to ``get_idx_from_sent``.
    kernel_size : int
        Forwarded to ``get_idx_from_sent``.

    Returns
    -------
    list of numpy.ndarray
        ``[train, val, test]`` integer arrays. ``split == 1`` rows go to
        train, ``split == 0`` rows go to validation, all remaining rows go to
        test.
    """
    train, val, test = [], [], []
    for rev in revs:
        sent = get_idx_from_sent(rev['text'], word_idx_map, max_l, kernel_size)
        sent.append(rev['y'])
        if rev['split'] == 1:
            train.append(sent)
        elif rev['split'] == 0:
            val.append(sent)
        else:
            # Later cells read datasets[2] as the test set, so the remaining
            # rows are collected instead of being dropped.
            # NOTE(review): the exact 'split' value used for test rows is not
            # visible here - confirm that "anything else" is the intended rule.
            test.append(sent)
    # Builtin int replaces the np.int alias, which newer NumPy releases removed.
    return [np.array(rows, dtype=int) for rows in (train, val, test)]


print("loading data...")
# SECURITY: unpickling can execute arbitrary code; only load a pickle file
# that comes from a trusted source.
with open("imdb-train-val-testN.pickle", mode='rb') as pickle_file:
    x = pickle.load(pickle_file, encoding='latin')
# The pickle holds, in order: the records, the embedding matrix, the
# word -> index map and the vocabulary.
(revs,
 W,
 word_idx_map,
 vocab) = x[0], x[1], x[2], x[3]
print("data loaded!")

# Convert every record into a padded row of word indices, grouped by split.
datasets = make_idx_data(
    revs,
    word_idx_map,
    max_l=2721,
    kernel_size=5,
)

loading data...
data loaded!


Put train data in separate NumPy arrays

In [3]:
# Train data preparation
N = datasets[0].shape[0]
# Width of one word vector (number of columns of the embedding matrix W).
conv_input_width = W.shape[1]
# Number of word positions per row; the last column of datasets[0] is the
# label appended by make_idx_data, hence the -1.
conv_input_height = int(datasets[0].shape[1]-1)

# For each word write a word index (not vector) to X tensor.
# A single slice copy replaces the element-by-element Python loop, and the
# builtin int replaces the np.int alias that newer NumPy releases removed.
train_X = np.array(datasets[0][:, :conv_input_height], dtype=int)
# Two-column label matrix, filled in by a later cell.
train_Y = np.zeros((N, 2), dtype=int)

print ('train_X.shape = {}'.format(train_X.shape))
print ('train_Y.shape = {}'.format(train_Y.shape))




train_X.shape = (2209, 2729)
train_Y.shape = (2209, 2)


In [4]:
import pandas as pd

data_train = pd.read_csv('essays.csv', encoding="latin")

# One-hot encode the training labels: the value in CSV column 3 of row `row`
# selects which of the two columns of train_Y is set to 1.
# NOTE(review): this assumes CSV row order matches the order of the training
# rows in datasets[0] - confirm.
label_col = 3
for row in range(N):
    label = data_train.iloc[row, label_col]
    train_Y[row, label] = 1

In [7]:
# Quick sanity check: shape of the inputs, then the one-hot label matrix.
print(train_X.shape, train_Y, sep='\n')


(2209, 2729)
[[0 1]
 [1 0]
 [0 1]
 ...
 [0 1]
 [1 0]
 [0 1]]


Put validation data in separate NumPy arrays

In [5]:
# Validation data preparation
Nv = datasets[1].shape[0]

# For each word write a word index (not vector) to X tensor.
# A single slice copy replaces the element-by-element Python loop, and the
# builtin int replaces the np.int alias that newer NumPy releases removed.
val_X = np.array(datasets[1][:, :conv_input_height], dtype=int)
val_Y = np.zeros((Nv, 2), dtype=int)

print ('val_X.shape = {}'.format(val_X.shape))
print ('val_Y.shape = {}'.format(val_Y.shape))

# One-hot encode the validation labels from CSV column 3.
# NOTE(review): this reads rows 0..Nv-1 of essays.csv, i.e. the same rows the
# training cell uses for its first Nv labels, so the validation labels are
# probably misaligned with val_X. The label column that make_idx_data appends
# (datasets[1][:, -1]) may be the intended source - confirm before trusting
# the validation metrics.
for i in range(Nv):
    val_Y[i,data_train.iloc[i,3]] = 1


val_X.shape = (258, 2729)
val_Y.shape = (258, 2)


In [24]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# log_device_placement: log on which device each operation ran
# (nothing gets printed in Jupyter, only if you run it standalone).
config = tf.ConfigProto(log_device_placement=True)
# Dynamically grow the memory used on the GPU.
config.gpu_options.allow_growth = True

# Make Keras use a session created with this configuration.
sess = tf.Session(config=config)
set_session(sess)

In [25]:
# TensorBoard callback writing its logs under 'logs/pp_cnn_d0.4_mse_200';
# it is passed to model.fit through `callbacks`.
# NOTE(review): the run name presumably encodes dropout 0.4, MSE loss and
# 200 feature maps - keep it in sync with the model settings.
tb = TensorBoard(log_dir = 'logs/pp_cnn_d0.4_mse_200')


Let's define and compile the CNN model with Keras

In [26]:
from keras.optimizers import RMSprop

# Channels-first tensors: (channels, rows, cols). This is the Keras 2 spelling
# of the legacy set_image_dim_ordering('th') call.
backend.set_image_data_format('channels_first')

# Number of feature maps (outputs of convolutional layer)
N_fm = 200
# kernel size of convolutional layer (number of consecutive words per filter)
kernel_size = 5

model = Sequential()
# Embedding layer (lookup table of trainable word vectors), initialised from
# the matrix W and constrained to unit norm.
model.add(Embedding(input_dim=W.shape[0],
                    output_dim=W.shape[1],
                    input_length=conv_input_height,
                    weights=[W],
                    embeddings_constraint=unitnorm(),
                    name='e_l'))
# Reshape word vectors from Embedding to tensor format suitable for
# Convolutional layer: one channel, rows = word positions, cols = vector dims.
model.add(Reshape((1, conv_input_height, conv_input_width)))

# First convolutional layer. Every filter spans kernel_size words and the
# full embedding width, so with 'valid' padding each feature map is a column
# of conv_input_height - kernel_size + 1 values.
model.add(Convolution2D(N_fm,
                        (kernel_size, conv_input_width),
                        kernel_initializer='random_uniform',
                        padding='valid',
                        kernel_regularizer=l2(0.001)))
# ReLU activation
model.add(Activation('relu'))

# aggregate data in every feature map to scalar using MAX operation
model.add(MaxPooling2D(pool_size=(conv_input_height-kernel_size+1, 1)))

model.add(Flatten())
model.add(Dropout(0.4))
model.add(Dense(128, kernel_initializer='random_uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.4))
# Inner Product layer (as in regular neural network, but without non-linear activation function)
model.add(Dense(2))
# SoftMax activation; actually, Dense+SoftMax works as Multinomial Logistic Regression
model.add(Activation('softmax'))

# RMSprop optimizer with explicit hyper-parameters.
opt = RMSprop(lr=0.001, rho=0.9, epsilon=None)
# NOTE(review): mean squared error on a softmax output is unusual;
# categorical cross-entropy is the conventional choice. The loss is left
# unchanged here - confirm it is deliberate.
model.compile(loss='mean_squared_error',
              optimizer=opt,
              metrics=['accuracy'])



Train the model for 10 epochs (this cell can be re-run as many times as needed to continue training)

In [27]:
# Train with mini-batches of 32 for 10 epochs, evaluating on the validation
# arrays after every epoch and logging through the TensorBoard callback.
model.fit(
    train_X,
    train_Y,
    batch_size=32,
    epochs=10,
    validation_data=(val_X, val_Y),
    verbose=1,
    callbacks=[tb],
)

Train on 2209 samples, validate on 258 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x177d45abeb8>

In [9]:
# Evaluate on the validation set; scores[1] is the first compiled metric.
scores = model.evaluate(val_X, val_Y, verbose=1)
accuracy_pct = scores[1] * 100
print("%s: %.2f%%" % (model.metrics_names[1], accuracy_pct))

# Mean and spread over the collected runs (a single run here, so std is 0).
cvscores = [accuracy_pct]
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

acc: 51.55%
51.55% (+/- 0.00%)


In [19]:
# Reset the Keras backend session.
# NOTE(review): in top-to-bottom order this cell runs BEFORE the cells that
# save the model and call model.predict. With the TensorFlow backend,
# clearing the session typically invalidates an already-built model - confirm
# whether this cell should run at this point (or only before building a new
# model).
backend.clear_session()

Save model

In [49]:
# Persist the model to the path 'cnn_10epochsN.model' (relative to the
# current working directory).
model.save('cnn_10epochsN.model')

Put test data in separate NumPy array

In [10]:
# Test data preparation; datasets[2] must hold the test split.
Nt = datasets[2].shape[0]

# Word indices only: the last column of each row is the label and is left
# out. A single slice copy replaces the element-by-element Python loop, and
# the builtin int replaces the np.int alias that newer NumPy releases removed.
test_X = np.array(datasets[2][:, :conv_input_height], dtype=int)

print ('test_X.shape = {}'.format(test_X.shape))
# Spot check: show one word index from the sixth test row.
test_X[5][232]

test_X.shape = (468, 2729)


11622

In [12]:
# Run the model on the test inputs. The final layer is a 2-unit softmax, so
# `a` presumably holds one row of two class probabilities per test row.
a = model.predict(test_X, verbose = 1)

