In [3]:
# write all code in one cell 

#========================Load data=========================
import numpy as np
import pandas as pd

train_data_source = 'imdb_train.csv'
test_data_source = 'imdb_valid.csv'

train_df = pd.read_csv(train_data_source, encoding='latin-1')
test_df = pd.read_csv(test_data_source, encoding='latin-1')

In [4]:
# to change use .astype() 
test_df['text'] = test_df.text.astype(str)
train_df['text'] = train_df.text.astype(str)

In [5]:
train_df.head()
# test_df.dtypes

Unnamed: 0,label,text
0,2,Just after the end of WWII Powell & Pressburge...
1,1,How did this ever come into existence? I gener...
2,2,I loved this movie since I was 7 and I saw it ...
3,2,The film opens with Bill Coles (Melvyn Douglas...
4,1,"Brilliant actor as he is, Al Pacino completely..."


In [6]:
# convert string to lower case 
train_texts = train_df['text'].values 
train_texts = [s.lower() for s in train_texts] 

test_texts = test_df['text'].values 
test_texts = [s.lower() for s in test_texts] 

In [7]:
#=======================Convert string to index================
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenizer
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)

# Convert string to index 
train_sequences = tk.texts_to_sequences(train_texts)
test_texts = tk.texts_to_sequences(test_texts)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
# Padding
train_data = pad_sequences(train_sequences, maxlen=1014, padding='post')
test_data = pad_sequences(test_texts, maxlen=1014, padding='post')

# Convert to numpy array
train_data = np.array(train_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')

In [9]:
#=======================Get classes================
train_classes = train_df['label'].values
train_class_list = [x-1 for x in train_classes]

test_classes = test_df['label'].values
test_class_list = [x-1 for x in test_classes]

from keras.utils import to_categorical
train_classes = to_categorical(train_class_list)
test_classes = to_categorical(test_class_list)

In [10]:
print(tk.word_index)

{'3': 48, '·': 96, '6': 55, 'o': 7, '&': 53, ' ': 2, '¬': 104, '(': 36, 'm': 15, '<': 30, '[': 73, '´': 65, '»': 89, 'ï': 98, 'w': 21, '}': 69, 'å': 101, '¨': 66, '¦': 87, 'i': 6, 'e': 3, '7': 52, '{': 70, '¯': 85, 'â': 54, 'n': 9, '|': 93, 's': 8, '³': 86, '®': 97, '#': 64, '°': 105, 'd': 13, '%': 62, '§': 77, '*': 46, '²': 103, '5': 49, 'k': 26, '+': 63, 'f': 17, '£': 72, '!': 37, '¶': 81, '-': 31, ':': 43, 'q': 39, 'b': 20, ';': 47, '$': 59, 'p': 22, '¸': 100, 'u': 16, 'j': 33, ',': 25, 'z': 38, '©': 57, ']': 71, 'ª': 92, '2': 45, '_': 60, '1': 42, '½': 91, '^': 83, '0': 41, '¡': 67, '¤': 76, 'l': 12, '\t': 80, '=': 61, '«': 78, '¥': 95, 'º': 90, '~': 68, '.': 23, 't': 4, '±': 84, 'v': 24, 'x': 34, 'r': 10, '4': 51, '¿': 102, '9': 44, 'h': 11, '\xa0': 82, '8': 50, '>': 29, '"': 32, '?': 40, '`': 58, '\\': 94, 'g': 18, ')': 35, '/': 28, 'a': 5, 'ã': 56, '¹': 99, "'": 27, 'UNK': 1, '@': 74, '¼': 79, 'y': 19, '¢': 88, 'c': 14, '\xad': 75, '¾': 106}


In [11]:
vocab_size=len(tk.word_index)
vocab_size

106

In [12]:
embedding_weights=[]

In [13]:
embedding_weights.append(np.zeros(vocab_size))

In [14]:
for char, i in tk.word_index.items(): # from index 1 to 70
    onehot = np.zeros(vocab_size)
    onehot[i-1] = 1
    embedding_weights.append(onehot)
embedding_weights = np.array(embedding_weights)

In [15]:
print(embedding_weights.shape) # first row all 0 for PAD, 69 char, last row for UNK
embedding_weights

(107, 106)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [16]:
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

In [17]:
# parameter 
input_size = 1014
# vocab_size = 86
embedding_size = 106
conv_layers = [[256, 7, 3], 
               [256, 7, 3], 
               [256, 3, -1], 
               [256, 3, -1], 
               [256, 3, -1], 
               [256, 3, 3]]

fully_connected_layers = [1024, 1024]
num_of_classes = 2
dropout_p = 0.5
optimizer = 'adam'
loss = 'categorical_crossentropy'

In [18]:
# Embedding layer Initialization
embedding_layer = Embedding(vocab_size+1, 
                            embedding_size,
                            input_length=input_size,
                            weights=[embedding_weights])

In [19]:
# Model 

# Input
inputs = Input(shape=(input_size,), name='input', dtype='int64')  # shape=(?, 1014)
# Embedding 
x = embedding_layer(inputs)
# Conv 
for filter_num, filter_size, pooling_size in conv_layers:
    x = Conv1D(filter_num, filter_size)(x) 
    x = Activation('relu')(x)
    if pooling_size != -1:
        x = MaxPooling1D(pool_size=pooling_size)(x) # Final shape=(None, 34, 256)
x = Flatten()(x) # (None, 8704)
# Fully connected layers 
for dense_size in fully_connected_layers:
    x = Dense(dense_size, activation='relu')(x) # dense_size == 1024
    x = Dropout(dropout_p)(x)
# Output Layer
predictions = Dense(num_of_classes, activation='softmax')(x)
# Build model
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) # Adam, categorical_crossentropy
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 1014)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1014, 106)         11342     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1008, 256)         190208    
_________________________________________________________________
activation_1 (Activation)    (None, 1008, 256)         0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 336, 256)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 330, 256)          459008    
_________________________________________________________________
activation_2 (Activation)    (None, 330, 256)          0         
__________

In [20]:
# 1000 training samples and 100 testing samples
indices = np.arange(train_data.shape[0])
np.random.shuffle(indices)

x_train = train_data[indices][:]
y_train = train_classes[indices][:]

x_test = test_data[:]
y_test = test_classes[:]

In [21]:
# Training
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=128,
          epochs=10,
          verbose=2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 600s - loss: 0.7138 - acc: 0.5046 - val_loss: 0.6929 - val_acc: 0.5112
Epoch 2/10
 - 589s - loss: 0.6932 - acc: 0.5061 - val_loss: 0.6931 - val_acc: 0.5112
Epoch 3/10
 - 598s - loss: 0.6935 - acc: 0.5034 - val_loss: 0.6928 - val_acc: 0.5108
Epoch 4/10
 - 619s - loss: 0.6845 - acc: 0.5562 - val_loss: 0.6750 - val_acc: 0.5916
Epoch 5/10
 - 600s - loss: 0.6499 - acc: 0.6227 - val_loss: 0.6390 - val_acc: 0.6234
Epoch 6/10
 - 638s - loss: 0.6252 - acc: 0.6500 - val_loss: 0.6253 - val_acc: 0.6420
Epoch 7/10
 - 632s - loss: 0.5992 - acc: 0.6771 - val_loss: 0.5909 - val_acc: 0.6784
Epoch 8/10
 - 588s - loss: 0.5507 - acc: 0.7167 - val_loss: 0.6002 - val_acc: 0.6820
Epoch 9/10
 - 575s - loss: 0.5011 - acc: 0.7562 - val_loss: 0.5684 - val_acc: 0.7084
Epoch 10/10
 - 583s - loss: 0.4510 - acc: 0.7864 - val_loss: 0.5562 - val_acc: 0.7180


<keras.callbacks.History at 0x7f6dca7dd5f8>

In [22]:
model.save("my_model_imdb.h5")