In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras import backend as K

from utils import TextLoader

# Directory handed to TextLoader as its second argument, followed by the CSV
# files used for training and for the final evaluation.
utils_dir = 'utils'
data_path = 'data/data_2w_arti_clean.csv'
test_data_path = 'data/data_test_arti_1_clean.csv'

# Model and training hyper-parameters:
max_features = 5000    # input dimension of the Embedding layer
maxlen = 400           # sequence length after pad_sequences; also the Embedding input_length
batch_size = 32        # passed to both TextLoader and model.fit
embedding_dims = 50    # output dimension of the Embedding layer
filters = 250          # number of Conv1D filters
kernel_size = 3        # Conv1D window width
hidden_dims = 250      # units in the hidden Dense layer
epochs = 100           # epochs passed to model.fit

def f1score(y_true, y_pred):
    """Return an F1 score tensor computed from labels and predictions.

    The true-positive, false-negative and false-positive counts are summed
    directly from the values of ``y_pred`` without thresholding, so with
    probabilistic predictions this is a "soft" F1 rather than the F1 of
    hard 0/1 decisions.

    Args:
        y_true: Tensor of ground-truth labels (assumed to be 0/1 valued --
            TODO confirm against the data loader).
        y_pred: Tensor of predictions with the same shape as ``y_true``
            (expected to lie in [0, 1]).

    Returns:
        Scalar tensor ``2*TP / (2*TP + FN + FP)``.
    """
    num_tp = K.sum(y_true * y_pred)
    num_fn = K.sum(y_true * (1.0 - y_pred))
    num_fp = K.sum((1.0 - y_true) * y_pred)
    # K.epsilon() keeps the ratio finite: when there are no positive labels
    # and no positive prediction mass the denominator would otherwise be
    # zero and the metric would evaluate to NaN.
    return 2.0 * num_tp / (2.0 * num_tp + num_fn + num_fp + K.epsilon())

print('Loading data...')

# NOTE(review): the meaning of the positional TextLoader arguments
# (True, ..., 20, None, None) is defined in the local utils module and is
# not visible here -- confirm against utils.TextLoader.
data_loader = TextLoader(True, utils_dir, data_path, batch_size, 20, None, None)
data_test_loader = TextLoader(True, utils_dir, test_data_path, batch_size, 20, None, None)

# Inputs come from the loaders' tensor_xa attribute, labels from tensor_y.
x_train = data_loader.tensor_xa
y_train = data_loader.tensor_y

x_test = data_test_loader.tensor_xa
y_test = data_test_loader.tensor_y

# Use %-formatting instead of multi-argument print(): without
# `from __future__ import print_function`, Python 2 treats print(a, b) as
# printing a tuple, e.g. "(19997, 'train sequences')". The formatted form
# yields the same text on Python 2 and Python 3.
print('%d train sequences' % len(x_train))
print('%d test sequences' % len(x_test))

print('Pad sequences (samples x time)')
# Bring every sequence to exactly `maxlen` steps so the batches are rectangular.
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
# The shape tuple is wrapped in a 1-tuple so that %s formats it as a whole.
print('x_train shape: %s' % (x_train.shape,))
print('x_test shape: %s' % (x_test.shape,))

print('Build model...')
model = Sequential()

# Map each integer token index to a dense vector of size `embedding_dims`.
# NOTE(review): indices in x_train/x_test must be < max_features; whether
# TextLoader guarantees this is not visible here -- confirm.
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# 1D convolution over the sequence axis; 'valid' padding shortens the
# sequence by kernel_size - 1 steps.
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))

# Keep the maximum response of each filter across all positions.
model.add(GlobalMaxPooling1D())

# Fully connected hidden layer.
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Single sigmoid unit for binary classification.
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=[f1score])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()
model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)

# evaluate() returns [loss, metric, ...]; index 1 is the f1score metric
# registered in compile() above.
scores = model.evaluate(x_test, y_test, verbose=0)
print("F1 Score: %.2f%%" % (scores[1]*100))

Loading data...
loading vocab and processing data
loading vocab and processing data
(19997, 'train sequences')
(104, 'test sequences')
Pad sequences (samples x time)
('x_train shape:', (19997, 400))
('x_test shape:', (104, 400))
Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 400, 50)           250000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 400, 50)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 398, 250)          37750     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 250)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               62750     
______________________________