In [1]:
# write all code in one cell 

#========================Load data=========================
import numpy as np
import pandas as pd

train_data_source = '../data/ag_news_csv/train.csv'
test_data_source = '../data/ag_news_csv/test.csv'


def merge_text_columns(frame):
    """Return a copy of `frame` with column 2 appended onto column 1.

    Column 1 of the result holds ``frame[1] + frame[2]`` (plain string
    concatenation, no separator) and column 2 is removed. The input frame
    is left untouched.

    The previous loop used ``df = df.drop([2], axis=1)``, which only rebound
    the loop variable, so column 2 was never actually removed from the
    original frames.
    """
    merged = frame.copy()
    merged[1] = merged[1] + merged[2]
    return merged.drop([2], axis=1)


# The files are read with header=None, so columns are labelled 0, 1, 2.
train_df = merge_text_columns(pd.read_csv(train_data_source, header=None))
test_df = merge_text_columns(pd.read_csv(test_data_source, header=None))

# convert string to lower case
train_texts = [s.lower() for s in train_df[1].values]
test_texts = [s.lower() for s in test_df[1].values]

#=======================Convert string to index================
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Character-level tokenizer; 'UNK' is the out-of-vocabulary token.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)

# The section below swaps the fitted vocabulary for a fixed character list.
# Skip it to keep the vocabulary learned by fit_on_texts.

#-----------------------Skip part start--------------------------
# Fixed alphabet; each character gets a 1-based index in order of appearance.
alphabet="abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {symbol: position + 1 for position, symbol in enumerate(alphabet)}

# Install the fixed vocabulary and give 'UNK' the next free index.
tk.word_index = dict(char_dict)
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
#-----------------------Skip part end----------------------------

# Map every text to its list of character indices.
# Note: test_texts is rebound here from raw strings to index sequences.
train_sequences = tk.texts_to_sequences(train_texts)
test_texts = tk.texts_to_sequences(test_texts)

# Pad (or truncate) to 1014 positions, padding at the end, then store as float32 arrays.
train_data = np.array(
    pad_sequences(train_sequences, maxlen=1014, padding='post'),
    dtype='float32')
test_data = np.array(
    pad_sequences(test_texts, maxlen=1014, padding='post'),
    dtype='float32')

#=======================Get classes================
from keras.utils import to_categorical

# Column 0 holds the class label; subtract 1 so the labels passed to
# to_categorical start one lower (presumably 1-based labels -> 0-based).
train_class_list = [x - 1 for x in train_df[0].values]
test_class_list = [x - 1 for x in test_df[0].values]

# One-hot encode the labels.
train_classes = to_categorical(train_class_list)
# Pin the test matrix to the training width: without num_classes,
# to_categorical infers the width from the largest label present, so a test
# set missing the highest class would yield a narrower matrix.
test_classes = to_categorical(test_class_list,
                              num_classes=train_classes.shape[1])

Using TensorFlow backend.


In [4]:
# Vocabulary size on the first line, the full char -> index mapping on the second.
print(len(tk.word_index), tk.word_index, sep='\n')

70
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '0': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33, '7': 34, '8': 35, '9': 36, ' ': 37, ',': 38, ';': 39, '.': 40, '!': 41, '?': 42, ':': 43, "'": 44, '"': 45, '/': 46, '\\': 47, '|': 48, '_': 49, '@': 50, '#': 51, '$': 52, '%': 53, '^': 54, '&': 55, '*': 56, '~': 57, '`': 58, '+': 59, '-': 60, '=': 61, '<': 62, '>': 63, '(': 64, ')': 65, '[': 66, ']': 67, '{': 68, '}': 69, 'UNK': 70}


In [6]:
vocab_size = len(tk.word_index) # 70

# One-hot embedding matrix of shape (vocab_size + 1, vocab_size) = (71, 70).
# Row 0 stays all-zero (index 0 is not assigned to any character);
# row i is one-hot at column i - 1 for every index i in tk.word_index.
embedding_weights = np.zeros((vocab_size + 1, vocab_size))

# Place each row by its index rather than by append order, so the result
# does not depend on the iteration order of tk.word_index.
for char, i in tk.word_index.items(): # indices 1 .. vocab_size
    embedding_weights[i, i - 1] = 1

In [7]:
# Display the one-hot embedding matrix built in the previous cell.
embedding_weights

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [8]:
from keras.layers import Input, Embedding, Dense, Flatten
from keras.layers import LSTM, Dropout
from keras.models import Model

In [9]:
# parameter
input_size = 1014  # sequence length; must equal the maxlen used in pad_sequences
# Derived from the weight matrix instead of hard-coding 70, so a change of
# alphabet cannot leave the layer dimensions out of sync with the weights.
embedding_size = embedding_weights.shape[1]

num_of_classes = 4
dropout_p = 0.5
optimizer = 'adam'
loss = 'categorical_crossentropy'

# embedding layer, initialised with the one-hot weight matrix.
# Input dimension = number of rows in the weights (vocab_size + 1).
embedding_layer = Embedding(embedding_weights.shape[0],
                            embedding_size,
                            weights=[embedding_weights],
                            input_length=input_size)

In [10]:
# Model construction
inputs = Input(shape=(input_size,))
embedded_sequence = embedding_layer(inputs)

# Two stacked LSTMs, both returning the full sequence of hidden states.
x = LSTM(256, return_sequences=True, activation='relu')(embedded_sequence)
x = LSTM(256, return_sequences=True, activation='relu')(x)

# NOTE(review): flattening all time steps yields 1014 * 256 = 259584 features,
# so the first Dense layer alone has ~265.8M parameters (see model.summary()).
x = Flatten()(x)

# Two fully connected layers, each followed by dropout.
x = Dense(1024, activation='relu')(x)
x = Dropout(dropout_p)(x)
x = Dense(1024, activation='relu')(x)
x = Dropout(dropout_p)(x)

# Softmax over the classes.
prediction = Dense(num_of_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=prediction)
# Use the optimizer/loss chosen in the parameter cell rather than repeating
# the literals, so changing the configuration actually takes effect.
model.compile(optimizer=optimizer,
              loss=loss,
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1014)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1014, 70)          4970      
_________________________________________________________________
lstm_1 (LSTM)                (None, 1014, 256)         334848    
_________________________________________________________________
lstm_2 (LSTM)                (None, 1014, 256)         525312    
_________________________________________________________________
flatten_1 (Flatten)          (None, 259584)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              265815040 
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
__________

In [11]:
# prepare the data
SEED = 42       # fixed so the same training subset is drawn on every run
N_TRAIN = 1000  # number of shuffled training samples to keep
N_TEST = 100    # number of leading test samples used for validation

# A local generator keeps the shuffle reproducible without touching
# NumPy's global random state.
rng = np.random.RandomState(SEED)
indices = np.arange(train_data.shape[0])
rng.shuffle(indices)

# Slice the index array first: train_data[indices][:N_TRAIN] would copy the
# entire shuffled training array just to keep its first N_TRAIN rows.
train_subset = indices[:N_TRAIN]
x_train = train_data[train_subset]
y_train = train_classes[train_subset]

x_test = test_data[:N_TEST]
y_test = test_classes[:N_TEST]

# training
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=128,
          epochs=1,
          verbose=1)

Train on 1000 samples, validate on 100 samples
Epoch 1/1


<keras.callbacks.History at 0x184f2c2ac8>