In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


# 1 Load data

In [2]:
import data_helpers

# ================== Preprocess ===================

# Paths of the negative and positive polarity files.
negtive_data_file = "../data/rt-polaritydata/rt-polarity.neg"
positive_data_file = "../data/rt-polaritydata/rt-polarity.pos"

# Announce the load, then read both files through the project helper,
# which returns the texts (x_text) and their labels (y).
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(
    positive_data_file,
    negtive_data_file,
)


Loading data...


In [4]:
# Peek at the first two entries of x_text
x_text[:2]

["the rock is destined to be the 21st century 's new conan and that he 's going to make a splash even greater than arnold schwarzenegger , jean claud van damme or steven segal",
 "the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that a column of words cannot adequately describe co writer director peter jackson 's expanded vision of j r r tolkien 's middle earth"]

# 2 Tokenize, Pad, Split

In [5]:
# ======================= Convert string to index ================
# Character-level tokenizer; 'UNK' is registered as the out-of-vocabulary token.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(x_text)
# When a fixed character list is available, it replaces the fitted
# tk.word_index below. Without one, the following part can be skipped.

# ----------------------- Skip part start --------------------------
# Build the vocabulary: every alphabet character gets a 1-based index.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {symbol: rank for rank, symbol in enumerate(alphabet, start=1)}

# Install a copy of the vocabulary as the tokenizer's index, so later edits
# to tk.word_index do not touch char_dict.
tk.word_index = dict(char_dict)
# 'UNK' takes the index one past the largest character index.
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

In [6]:
len(tk.word_index) # 69 alphabet characters plus the 1 UNK token

70

In [7]:
# Inspect the full character-to-index vocabulary, including 'UNK'
tk.word_index

{' ': 37,
 '!': 41,
 '"': 45,
 '#': 51,
 '$': 52,
 '%': 53,
 '&': 55,
 "'": 44,
 '(': 64,
 ')': 65,
 '*': 56,
 '+': 59,
 ',': 38,
 '-': 60,
 '.': 40,
 '/': 46,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 ':': 43,
 ';': 39,
 '<': 62,
 '=': 61,
 '>': 63,
 '?': 42,
 '@': 50,
 'UNK': 70,
 '[': 66,
 '\\': 47,
 ']': 67,
 '^': 54,
 '_': 49,
 '`': 58,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '{': 68,
 '|': 48,
 '}': 69,
 '~': 57}

In [8]:
# Convert each text into a list of character indices looked up in tk.word_index
sequences = tk.texts_to_sequences(x_text)

In [11]:
# Print the encoded index sequences of the first two texts
print(sequences[:2])

[[20, 8, 5, 37, 18, 15, 3, 11, 37, 9, 19, 37, 4, 5, 19, 20, 9, 14, 5, 4, 37, 20, 15, 37, 2, 5, 37, 20, 8, 5, 37, 29, 28, 19, 20, 37, 3, 5, 14, 20, 21, 18, 25, 37, 44, 19, 37, 14, 5, 23, 37, 3, 15, 14, 1, 14, 37, 1, 14, 4, 37, 20, 8, 1, 20, 37, 8, 5, 37, 44, 19, 37, 7, 15, 9, 14, 7, 37, 20, 15, 37, 13, 1, 11, 5, 37, 1, 37, 19, 16, 12, 1, 19, 8, 37, 5, 22, 5, 14, 37, 7, 18, 5, 1, 20, 5, 18, 37, 20, 8, 1, 14, 37, 1, 18, 14, 15, 12, 4, 37, 19, 3, 8, 23, 1, 18, 26, 5, 14, 5, 7, 7, 5, 18, 37, 38, 37, 10, 5, 1, 14, 37, 3, 12, 1, 21, 4, 37, 22, 1, 14, 37, 4, 1, 13, 13, 5, 37, 15, 18, 37, 19, 20, 5, 22, 5, 14, 37, 19, 5, 7, 1, 12], [20, 8, 5, 37, 7, 15, 18, 7, 5, 15, 21, 19, 12, 25, 37, 5, 12, 1, 2, 15, 18, 1, 20, 5, 37, 3, 15, 14, 20, 9, 14, 21, 1, 20, 9, 15, 14, 37, 15, 6, 37, 20, 8, 5, 37, 12, 15, 18, 4, 37, 15, 6, 37, 20, 8, 5, 37, 18, 9, 14, 7, 19, 37, 20, 18, 9, 12, 15, 7, 25, 37, 9, 19, 37, 19, 15, 37, 8, 21, 7, 5, 37, 20, 8, 1, 20, 37, 1, 37, 3, 15, 12, 21, 13, 14, 37, 15, 6, 37, 23, 15

In [13]:
# Character-level length statistics over all encoded texts
length = list(map(len, sequences))
longest, shortest = max(length), min(length)
print('The max length is: ', longest)
print('The min length is: ', shortest)
print('The average length is: ', sum(length)/len(length))

The max length is:  266
The min length is:  2
The average length is:  111.5871318701932


In [15]:
# Padding: append padding values after each sequence up to a common length.
# 266 is the maximum sequence length reported by the length statistics.
padded_length = 266
sequences_pad = pad_sequences(sequences, maxlen=padded_length, padding='post')
print("The whole data size is: ", sequences_pad.shape)

The whole data size is:  (10662, 266)


In [27]:
# Padded rows for the first two texts; padding is appended after the data
sequences_pad[:2]

array([[20,  8,  5, 37, 18, 15,  3, 11, 37,  9, 19, 37,  4,  5, 19, 20,
         9, 14,  5,  4, 37, 20, 15, 37,  2,  5, 37, 20,  8,  5, 37, 29,
        28, 19, 20, 37,  3,  5, 14, 20, 21, 18, 25, 37, 44, 19, 37, 14,
         5, 23, 37,  3, 15, 14,  1, 14, 37,  1, 14,  4, 37, 20,  8,  1,
        20, 37,  8,  5, 37, 44, 19, 37,  7, 15,  9, 14,  7, 37, 20, 15,
        37, 13,  1, 11,  5, 37,  1, 37, 19, 16, 12,  1, 19,  8, 37,  5,
        22,  5, 14, 37,  7, 18,  5,  1, 20,  5, 18, 37, 20,  8,  1, 14,
        37,  1, 18, 14, 15, 12,  4, 37, 19,  3,  8, 23,  1, 18, 26,  5,
        14,  5,  7,  7,  5, 18, 37, 38, 37, 10,  5,  1, 14, 37,  3, 12,
         1, 21,  4, 37, 22,  1, 14, 37,  4,  1, 13, 13,  5, 37, 15, 18,
        37, 19, 20,  5, 22,  5, 14, 37, 19,  5,  7,  1, 12,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 

In [17]:
# Shuffle data: a fixed seed keeps the permutation repeatable, and the same
# permutation is applied to inputs and labels so they stay aligned.
np.random.seed(42)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled, y_shuffled = sequences_pad[shuffle_indices], y[shuffle_indices]

# Split train and test: the leading share goes to training, the rest is held out
training_rate = 0.9
train_len = int(len(y) * training_rate)
x_train, x_test = x_shuffled[:train_len], x_shuffled[train_len:]
y_train, y_test = y_shuffled[:train_len], y_shuffled[train_len:]
print('Training data size is: ', x_train.shape)
print('Validation data size is: ', x_test.shape)

Training data size is:  (9595, 266)
Validation data size is:  (1067, 266)


# 3 Embedding weights with one-hot

In [18]:
# Number of entries in the tokenizer vocabulary (alphabet characters plus 'UNK')
vocab_size = len(tk.word_index)
vocab_size

70

We use one-hot vectors, so the embedding dimension equals the vocabulary size (70).

In [22]:
# Embedding weights
# One-hot rows need exactly one column per vocabulary entry, so the width is
# tied to vocab_size instead of being hard-coded; if the two ever differed,
# the concatenation below would fail on mismatched column counts.
embedding_dim = vocab_size

# Row 0 is all zeros; no vocabulary entry uses index 0 (indices start at 1).
zero_vector = np.zeros((1, embedding_dim))

# Rows 1..vocab_size form the identity: index k maps to the k-th unit vector.
embedding_weights = np.concatenate((zero_vector, np.identity(vocab_size)), axis=0)

In [24]:
# Shape is (vocab_size + 1, embedding_dim): one zero row stacked on the identity
print(embedding_weights.shape)
embedding_weights

(71, 70)


array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [26]:
from keras.layers import Embedding
# Sequence length fed to the layer. It is read from the padded data so it
# cannot drift from the padding step (266 for this dataset).
input_size = sequences_pad.shape[1]

# Embedding layer Initialization
# input_dim is vocab_size + 1 because embedding_weights carries one extra
# all-zero row at index 0 in addition to the vocab_size one-hot rows.
embedding_layer = Embedding(input_dim=vocab_size + 1,
                            output_dim=embedding_dim,
                            input_length=input_size,
                            weights=[embedding_weights])

# 3.5 Character Embedding from GloVe

In [8]:
import os 
import numpy as np
# Peek at the first two lines of the character-embedding file to check its format.
glove_path = 'glove.6B'
with open(os.path.join(glove_path, 'glove.6B.50d-char.txt')) as f:
    # Iterate the file lazily rather than loading every line with readlines();
    # only the first two lines are needed.
    for i, line in enumerate(f):
        if i >= 2:
            break
        print(line)

t -0.07856 -0.297894 -0.09703 -0.091368 -0.211625 -0.026434 0.334066 0.170385 0.097103 0.206331 0.069699 0.032036 0.285823 0.121486 -0.248542 -0.079295 -0.04549 0.085357 0.34019 0.109074 -0.107245 -0.138275 -0.114629 0.10699 0.136138 0.558307 0.077272 0.079817 0.037287 0.06284 -0.713346 -0.06032 0.201506 0.095904 -0.06107 0.114473 -0.027449 -0.031221 -0.144786 0.093606 0.124803 -0.224169 -0.118019 0.153747 -0.001278 -0.138124 0.218402 0.150867 -0.004169 0.010908

h -0.161026 -0.277764 -0.056471 -0.063275 -0.194834 -0.086759 0.333846 0.205543 0.08772 0.149837 0.050109 0.045362 0.262903 0.053281 -0.268514 -0.022935 -0.020881 0.082078 0.382204 0.234935 -0.127866 -0.111021 -0.087919 0.091441 0.169804 0.560793 0.115634 0.056084 -0.066376 0.040264 -0.907454 -0.089729 0.262017 0.191351 -0.014241 0.098785 -0.020455 -0.008653 -0.074219 0.187383 0.103722 -0.120813 -0.162381 0.056297 0.082199 -0.140694 0.170395 0.07543 0.003051 0.029757



In [9]:
# read glove to embedding: maps each character to its float32 vector
embeddings_index = {}
# The with-block closes the file even when a line fails to parse.
with open(os.path.join(glove_path, 'glove.6B.50d-char.txt')) as f:
    for line in f:
        # Each line is the character followed by its whitespace-separated coefficients
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [11]:
# Characters for which the file provided a vector
embeddings_index.keys()

dict_keys(['t', 'h', 'e', ',', '.', 'o', 'f', 'a', 'n', 'd', 'i', '"', "'", 's', 'r', '-', 'w', 'b', 'y', '(', ')', 'm', '`', 'v', 'u', 'c', 'l', ':', 'p', 'g', '$', ';', '_', 'k', 'j', '1', 'x', '?', '0', '2', 'q', '%', '/', '3', '5', '4', '8', '6', '7', '9', '&', 'z', '!', '=', '#', '[', '+', '|', ']', '~', '\\', '{', '>', '}', '*', '@', '<', '^'])

In [12]:
# Number of characters read from the file
len(embeddings_index.keys())

68

In [22]:
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
# Report every alphabet character that has no pretrained vector.
# NOTE: the label 'space' is a fixed string printed for whichever character
# is missing; it is not derived from the character itself.
for missing in (symbol for symbol in alphabet if symbol not in embeddings_index):
    print(missing, 'space')

  space


We need to add vectors for the space character and the UNK token to `embeddings_index`.

- space_vector: the average of the 68 pretrained character vectors
- unk_vector: randomly generated from a normal distribution

In [29]:
# Element-wise mean of every vector currently in embeddings_index.
# sum() starts from the integer 0 and adds the vectors one by one in
# dictionary order, then the total is divided by the number of entries.
space_vector = sum(embeddings_index.values()) / len(embeddings_index)

In [30]:
# Inspect the averaged vector
space_vector

array([-0.29490685,  0.02992696,  0.25553298, -0.00351198, -0.06802648,
       -0.16345023,  0.47377503, -0.00430326, -0.10446006,  0.10570163,
       -0.09848271, -0.00441733,  0.36036023, -0.05057086, -0.06766081,
       -0.10361586, -0.14869866,  0.07603326,  0.11113335,  0.1667211 ,
       -0.13753006, -0.13342041,  0.21635625,  0.17358167,  0.14035112,
        0.6127235 ,  0.05064495, -0.02979143,  0.20813598,  0.02590212,
       -0.353355  ,  0.04656252,  0.07981671,  0.3142017 , -0.06188979,
       -0.06343918,  0.15323772, -0.23385583,  0.09215084,  0.01510967,
        0.28810412, -0.3465999 ,  0.05195587,  0.20819926, -0.15522721,
        0.0526934 ,  0.30569547,  0.15746509,  0.13995562,  0.23179682],
      dtype=float32)

In [31]:
# Draw a 50-element vector for 'UNK' from a normal distribution.
mu, sigma = 0, 0.1  # mean and standard deviation
unk_vector = np.random.normal(loc=mu, scale=sigma, size=50)
unk_vector

array([-0.11851034,  0.02811158,  0.08536472, -0.04697181,  0.01525296,
        0.17150421, -0.11922493,  0.08820908,  0.10156986, -0.07266301,
        0.01444775, -0.10209685, -0.14609937,  0.06401705, -0.08057213,
       -0.11534708,  0.18761204,  0.08609939,  0.21568493, -0.1507021 ,
        0.02629094,  0.10310304, -0.09095152,  0.11523869,  0.11691516,
       -0.01026392, -0.10387156,  0.02612882,  0.0090515 ,  0.07139222,
       -0.06237833,  0.07958395,  0.03694744,  0.01692949, -0.00929912,
       -0.00189833,  0.07110632, -0.10874058,  0.13199082, -0.08803322,
        0.06132961, -0.08203701, -0.00162088, -0.08185927,  0.05843014,
        0.00386503, -0.06305137, -0.18833562, -0.04731349, -0.23160425])

In [33]:
# Register the two extra entries in one step: the space character first,
# then the 'UNK' fallback.
embeddings_index.update({' ': space_vector, 'UNK': unk_vector})

In [35]:
# Entry count after adding ' ' and 'UNK'
len(embeddings_index)

70

Initialize embedding weights

In [47]:
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
# 1-based index for every alphabet character; index 0 is not assigned here.
char_dict = {symbol: rank for rank, symbol in enumerate(alphabet, start=1)}

# 'UNK' takes the index one past the largest character index.
char_dict['UNK'] = max(char_dict.values()) + 1

In [48]:
# Inspect the character-to-index mapping
char_dict

{' ': 37,
 '!': 41,
 '"': 45,
 '#': 51,
 '$': 52,
 '%': 53,
 '&': 55,
 "'": 44,
 '(': 64,
 ')': 65,
 '*': 56,
 '+': 59,
 ',': 38,
 '-': 60,
 '.': 40,
 '/': 46,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 ':': 43,
 ';': 39,
 '<': 62,
 '=': 61,
 '>': 63,
 '?': 42,
 '@': 50,
 'UNK': 70,
 '[': 66,
 '\\': 47,
 ']': 67,
 '^': 54,
 '_': 49,
 '`': 58,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '{': 68,
 '|': 48,
 '}': 69,
 '~': 57}

In [49]:
# read glove to embedding: maps each character to its float32 vector
glove_path = 'glove.6B'
embeddings_index = {}
# The with-block closes the file even when a line fails to parse.
with open(os.path.join(glove_path, 'glove.6B.50d-char.txt')) as f:
    for line in f:
        # Each line is the character followed by its whitespace-separated coefficients
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Add space and unknown vector to embeddings_index
# The space vector is the element-wise mean of the vectors read from the file;
# it is computed before the two extra entries are inserted.
space_vector = 0
for char_vector in embeddings_index.values():
    space_vector += char_vector
space_vector /= len(embeddings_index)

# The UNK vector is drawn at random from a normal distribution.
mu, sigma = 0, 0.1 # mean and standard deviation
unk_vector = np.random.normal(mu, sigma, 50)

embeddings_index[' '] = space_vector
embeddings_index['UNK'] = unk_vector # len(embeddings_index) == 70 (68 read + 2 added)

In [53]:
# Inspect the randomly drawn UNK vector
embeddings_index['UNK']

array([-0.06618765,  0.0264331 , -0.0065112 ,  0.11983904, -0.10909893,
        0.04808326,  0.01033974,  0.13337676,  0.03288751,  0.06613451,
        0.02635759,  0.09601076,  0.02943245, -0.00462533, -0.00306213,
        0.16339628, -0.04545588, -0.1157389 ,  0.13620139, -0.01573983,
        0.29609983, -0.03079412,  0.03417422,  0.10646769, -0.10084371,
       -0.08748211,  0.0178019 ,  0.2071987 ,  0.08209752,  0.00147916,
        0.03178311,  0.08249312, -0.05654057,  0.06690726, -0.21578571,
        0.05159405, -0.05221445,  0.15134824,  0.04097625, -0.06370345,
        0.05106042, -0.13223772,  0.10402735,  0.08573347, -0.05261663,
        0.05478117, -0.19624598, -0.04386312, -0.06730313, -0.08794197])

In [50]:
# Embedding weights
embedding_dim = 50
# One row per char_dict index plus row 0, which stays all-zero and
# represents padding (71x50 here).
embedding_matrix = np.zeros((len(char_dict)+1, embedding_dim))
# Vector used for any character that has no entry in embeddings_index
fallback_vector = embeddings_index.get('UNK')
for char, i in char_dict.items():  # char_dict holds 69 characters plus 'UNK'
    known_vector = embeddings_index.get(char)  # None when the character is absent
    embedding_matrix[i] = fallback_vector if known_vector is None else known_vector


In [51]:
# Rows: len(char_dict) + 1; columns: embedding_dim
embedding_matrix.shape

(71, 50)

In [52]:
# Inspect the matrix; row 0 is the all-zero padding row
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.101148  , -0.31064701, -0.096601  , ...,  0.092758  ,
         0.036486  ,  0.006396  ],
       [-0.124808  , -0.308925  , -0.062367  , ...,  0.093737  ,
        -0.006683  ,  0.024481  ],
       ...,
       [-0.42855   ,  1.05509996,  0.60421002, ..., -0.075322  ,
        -0.13569   ,  0.61049998],
       [-0.67074001,  0.69856   ,  0.69630003, ...,  0.080127  ,
         0.10094   ,  0.92917001],
       [-0.06618765,  0.0264331 , -0.0065112 , ..., -0.04386312,
        -0.06730313, -0.08794197]])