In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


# 1 Load data

In [2]:
import data_helpers

# ================== Preprocess ===================

# Paths to the two rt-polarity corpus files (presumably one sentence per line;
# confirm against data_helpers). The misspelled name `negtive_data_file` is
# kept as-is in case other cells refer to it.
positive_data_file = "../data/rt-polaritydata/rt-polarity.pos"
negtive_data_file = "../data/rt-polaritydata/rt-polarity.neg"

# Read both files: x_text holds the sentences, y the matching labels.
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(
    positive_data_file,
    negtive_data_file,
)


Loading data...


In [4]:
# Peek at the first two loaded sentences
x_text[:2]

["the rock is destined to be the 21st century 's new conan and that he 's going to make a splash even greater than arnold schwarzenegger , jean claud van damme or steven segal",
 "the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that a column of words cannot adequately describe co writer director peter jackson 's expanded vision of j r r tolkien 's middle earth"]

# 2 Tokenize, Pad, Split

In [5]:
# =======================Convert string to index================
# Tokenizer: character-level, with 'UNK' standing in for any unknown character
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(x_text)
# If we already have a character list, then replace the tk.word_index
# If not, just skip the part between the two "Skip part" markers below

# -----------------------Skip part start--------------------------
# Construct a new vocabulary from a fixed alphabet.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
# Indices start at 1 so that 0 stays free for the padding value.
char_dict = {char: index for index, char in enumerate(alphabet, start=1)}

# Use char_dict to replace the tk.word_index
tk.word_index = char_dict.copy()
# Add 'UNK' to the vocabulary, right after the last alphabet index
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
# Keep the reverse mapping in sync with the replaced vocabulary; otherwise it
# would still describe the vocabulary learned by fit_on_texts and decoding
# indices back to characters would use the wrong table.
# NOTE(review): index_word is what sequences_to_texts reads in Keras versions
# that provide it; on older versions this is just an unused attribute.
tk.index_word = {index: char for char, index in tk.word_index.items()}
# -----------------------Skip part end----------------------------

In [6]:
len(tk.word_index) # 69 alphabet characters plus 1 UNK token

70

In [7]:
# Inspect the final character -> index mapping (alphabet characters plus 'UNK')
tk.word_index

{' ': 37,
 '!': 41,
 '"': 45,
 '#': 51,
 '$': 52,
 '%': 53,
 '&': 55,
 "'": 44,
 '(': 64,
 ')': 65,
 '*': 56,
 '+': 59,
 ',': 38,
 '-': 60,
 '.': 40,
 '/': 46,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 ':': 43,
 ';': 39,
 '<': 62,
 '=': 61,
 '>': 63,
 '?': 42,
 '@': 50,
 'UNK': 70,
 '[': 66,
 '\\': 47,
 ']': 67,
 '^': 54,
 '_': 49,
 '`': 58,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '{': 68,
 '|': 48,
 '}': 69,
 '~': 57}

In [8]:
# Convert each sentence into a list of character indices using tk.word_index
sequences = tk.texts_to_sequences(x_text)

In [11]:
# Show the first two index-encoded sentences
print(sequences[:2])

[[20, 8, 5, 37, 18, 15, 3, 11, 37, 9, 19, 37, 4, 5, 19, 20, 9, 14, 5, 4, 37, 20, 15, 37, 2, 5, 37, 20, 8, 5, 37, 29, 28, 19, 20, 37, 3, 5, 14, 20, 21, 18, 25, 37, 44, 19, 37, 14, 5, 23, 37, 3, 15, 14, 1, 14, 37, 1, 14, 4, 37, 20, 8, 1, 20, 37, 8, 5, 37, 44, 19, 37, 7, 15, 9, 14, 7, 37, 20, 15, 37, 13, 1, 11, 5, 37, 1, 37, 19, 16, 12, 1, 19, 8, 37, 5, 22, 5, 14, 37, 7, 18, 5, 1, 20, 5, 18, 37, 20, 8, 1, 14, 37, 1, 18, 14, 15, 12, 4, 37, 19, 3, 8, 23, 1, 18, 26, 5, 14, 5, 7, 7, 5, 18, 37, 38, 37, 10, 5, 1, 14, 37, 3, 12, 1, 21, 4, 37, 22, 1, 14, 37, 4, 1, 13, 13, 5, 37, 15, 18, 37, 19, 20, 5, 22, 5, 14, 37, 19, 5, 7, 1, 12], [20, 8, 5, 37, 7, 15, 18, 7, 5, 15, 21, 19, 12, 25, 37, 5, 12, 1, 2, 15, 18, 1, 20, 5, 37, 3, 15, 14, 20, 9, 14, 21, 1, 20, 9, 15, 14, 37, 15, 6, 37, 20, 8, 5, 37, 12, 15, 18, 4, 37, 15, 6, 37, 20, 8, 5, 37, 18, 9, 14, 7, 19, 37, 20, 18, 9, 12, 15, 7, 25, 37, 9, 19, 37, 19, 15, 37, 8, 21, 7, 5, 37, 20, 8, 1, 20, 37, 1, 37, 3, 15, 12, 21, 13, 14, 37, 15, 6, 37, 23, 15

In [13]:
# Character-level length of every encoded sentence, then summary statistics
length = list(map(len, sequences))
_length_stats = (
    ('max', max(length)),
    ('min', min(length)),
    ('average', sum(length)/len(length)),
)
for _stat_name, _stat_value in _length_stats:
    print('The ' + _stat_name + ' length is: ', _stat_value)

The max length is:  266
The min length is:  2
The average length is:  111.5871318701932


In [15]:
# Padding
# Take the padded width from the data instead of hard-coding 266, so that no
# sequence is cut short if the corpus (and therefore the longest sentence)
# changes.
max_seq_len = max(len(seq) for seq in sequences)
# padding='post' appends the fill value 0 after the end of each sequence.
sequences_pad = pad_sequences(sequences, maxlen=max_seq_len, padding='post')
print("The whole data size is: ", sequences_pad.shape)

The whole data size is:  (10662, 266)


In [27]:
# First two padded rows; positions after the end of the sentence are filled with 0
sequences_pad[:2]

array([[20,  8,  5, 37, 18, 15,  3, 11, 37,  9, 19, 37,  4,  5, 19, 20,
         9, 14,  5,  4, 37, 20, 15, 37,  2,  5, 37, 20,  8,  5, 37, 29,
        28, 19, 20, 37,  3,  5, 14, 20, 21, 18, 25, 37, 44, 19, 37, 14,
         5, 23, 37,  3, 15, 14,  1, 14, 37,  1, 14,  4, 37, 20,  8,  1,
        20, 37,  8,  5, 37, 44, 19, 37,  7, 15,  9, 14,  7, 37, 20, 15,
        37, 13,  1, 11,  5, 37,  1, 37, 19, 16, 12,  1, 19,  8, 37,  5,
        22,  5, 14, 37,  7, 18,  5,  1, 20,  5, 18, 37, 20,  8,  1, 14,
        37,  1, 18, 14, 15, 12,  4, 37, 19,  3,  8, 23,  1, 18, 26,  5,
        14,  5,  7,  7,  5, 18, 37, 38, 37, 10,  5,  1, 14, 37,  3, 12,
         1, 21,  4, 37, 22,  1, 14, 37,  4,  1, 13, 13,  5, 37, 15, 18,
        37, 19, 20,  5, 22,  5, 14, 37, 19,  5,  7,  1, 12,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 

In [17]:
# Shuffle inputs and labels with the same permutation (seeded for reproducibility)
np.random.seed(42)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled, y_shuffled = sequences_pad[shuffle_indices], y[shuffle_indices]

# Split: the first `training_rate` fraction is the training set, the rest is held out
training_rate = 0.9
train_len = int(len(y) * training_rate)
x_train, x_test = x_shuffled[:train_len], x_shuffled[train_len:]
y_train, y_test = y_shuffled[:train_len], y_shuffled[train_len:]
print('Training data size is: ', x_train.shape)
print('Validation data size is: ', x_test.shape)

Training data size is:  (9595, 266)
Validation data size is:  (1067, 266)


# 3 Embedding weights

In [18]:
# Number of entries in the vocabulary (alphabet characters plus the UNK token)
vocab_size = len(tk.word_index)
vocab_size

70

We use one-hot vectors, so the embedding dimension equals the vocabulary size (70).

In [22]:
# Embedding weights
# One-hot encoding: the embedding width must equal the vocabulary size, so
# derive it from vocab_size instead of repeating the literal 70. With the
# literal, the concatenate below fails as soon as the vocabulary changes.
embedding_dim = vocab_size

# Row 0: all-zero vector for index 0, the value used to pad sequences.
zero_vector = np.zeros((1, embedding_dim))

# Rows 1..vocab_size: one-hot vector for each vocabulary index (indices start at 1).
embedding_weights = np.concatenate((zero_vector, np.identity(vocab_size)), axis=0)

In [24]:
# Expect (vocab_size + 1, embedding_dim): the identity matrix plus one leading zero row
print(embedding_weights.shape)
embedding_weights

(71, 70)


array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [26]:
from keras.layers import Embedding

# Sequence length fed to the layer. Read it from the padded data so it cannot
# drift out of sync with the width chosen when padding (previously a second
# hard-coded 266).
input_size = sequences_pad.shape[1]

# Embedding layer Initialization
# vocab_size + 1 rows: one per vocabulary index plus row 0 for the padding value.
# NOTE(review): the one-hot matrix is passed as initial weights only; if it is
# meant to stay fixed during training, confirm whether trainable=False is needed.
embedding_layer = Embedding(vocab_size + 1,
                            embedding_dim,
                            input_length=input_size,
                            weights=[embedding_weights])