In [1]:
import numpy as np
# Word-level one-hot encoding of a tiny toy corpus.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Assign a unique integer id to every distinct whitespace-separated token.
# Ids start at 1, so index 0 is never assigned to a word.
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

# Only the first `max_length` tokens of each sample are encoded.
max_length = 10

# results[i, j, k] == 1.0 when token j of sample i has id k.
results = np.zeros(shape=(len(samples),
                          max_length,
                          max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        # Direct lookup: a missing word raises KeyError instead of silently
        # indexing with None (which would set a whole row to 1).
        results[i, j, token_index[word]] = 1.

# print() with a single argument behaves identically on Python 2 and 3.
print(results)

[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]


In [24]:
# One hot encoding text from keras tokenizing
from keras.preprocessing.text import Tokenizer, one_hot

# Toy corpus: a list of strings, one entry per document (add more as needed).
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# num_words caps how many words (by frequency) are used when encoding.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)  # builds the word -> index vocabulary

# Each sample becomes a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)
print(sequences)

# One row per sample; mode='binary' flags which word indices are present.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print(word_index)

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
Found 9 unique tokens.
{'on': 4, 'ate': 7, 'mat': 5, 'dog': 6, 'cat': 2, 'the': 1, 'my': 8, 'homework': 9, 'sat': 3}


In [36]:
from keras.layers import Embedding, Input, Flatten
from keras.models import Model
import keras
from keras.preprocessing.sequence import pad_sequences


maxlen = 10       # number of word indices per (padded) sample
vocab_size = 100  # rows in the embedding table; word indices must be < vocab_size

# Word indices are integers, so the input is declared as int32 (not float64).
word_input = Input(shape=(maxlen,), dtype='int32')

# Creating the embedding: one 8-dimensional vector per word index.
word_embedding = Embedding(input_dim=vocab_size, output_dim=8, input_length=maxlen)(word_input)
print(word_embedding)

word_vec = Flatten()(word_embedding)  # flatten (maxlen, 8) into one vector per sample
print(word_vec)
embed_model = Model([word_input], word_vec)  # combining all into a Keras model

# Compiling the model; parameters can be tuned as always.
embed_model.compile(optimizer=keras.optimizers.Adam(lr=1e-3),
                    loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in a
# print would emit a stray "None" line.
embed_model.summary()

# Encode the samples with keras' one_hot helper, to compare against the
# Tokenizer-based encoding stored in `sequences`.
coded_data = [one_hot(seq, vocab_size) for seq in samples]

# Fill with 0's after each sequence so every row has the same length.
body = pad_sequences(sequences, maxlen=maxlen, padding='post', value=0.0)
# %-formatting keeps the output identical on Python 2 and Python 3.
print("body \n%s" % (np.shape(body),))

embeddings = embed_model.predict(body)  # finally getting the embeddings
print(embeddings)

Tensor("embedding_25/embedding_lookup/Identity:0", shape=(?, 10, 8), dtype=float32)
Tensor("flatten_25/Reshape:0", shape=(?, ?), dtype=float32)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_25 (InputLayer)        (None, 10)                0         
_________________________________________________________________
embedding_25 (Embedding)     (None, 10, 8)             800       
_________________________________________________________________
flatten_25 (Flatten)         (None, 80)                0         
Total params: 800
Trainable params: 800
Non-trainable params: 0
_________________________________________________________________
None
body 
(2, 10)
[[-0.01576035  0.00031934 -0.02959855 -0.04709757  0.01640693  0.02512174
  -0.03977545 -0.02353597 -0.02589756 -0.00220241 -0.00373348 -0.03919834
   0.02290286 -0.00738242 -0.01731491  0.02281094 -0.03819237  0.03567744
   0.02641131  0.030060

## Creating embedding 


# Word embeddings
Se entiende por *word embedding* el proceso de mapear palabras u oraciones a números enteros. A partir de estos enteros se construyen vectores que se organizan según su relación semántica, siendo así una manera de representar cada palabra con un vector diferente de acuerdo con su relación con las demás.

In [8]:
from keras.models import Sequential
# Embedding is imported here so this cell does not depend on an earlier cell
# having imported it.
from keras.layers import Embedding, Flatten, Dense
from keras.datasets import imdb
from keras import preprocessing

max_features = 10000  # vocabulary size requested from imdb.load_data
maxlen = 20           # cuts the review after this number of words

# Loads the reviews as lists of ints (word indices).
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Turns the lists of integers into 2D integer tensors of shape (samples, maxlen).
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

model = Sequential()
# The embedding table is sized from max_features so it always matches the
# vocabulary limit used when loading the data.
model.add(Embedding(max_features, 8, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit: binary output

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


Exception: URL fetch failure on https://s3.amazonaws.com/text-datasets/imdb.npz: None -- [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:726)

In [None]:
# with imdb dataset from scratch
import os

# Root of the extracted aclImdb dataset. Set the IMDB_DIR environment variable
# to point somewhere else; the original hard-coded path remains the default.
imdb_dir = os.environ.get('IMDB_DIR', '/Users/fchollet/Downloads/aclImdb')
train_dir = os.path.join(imdb_dir, 'train')

labels = []  # 0 for files under 'neg', 1 for files under 'pos'
texts = []   # raw file contents, aligned index-by-index with `labels`
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname.endswith('.txt'):
            # `with` closes the file even if read() raises.
            with open(os.path.join(dir_name, fname)) as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)
                    
# Tokenize data

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100                # every sequence is padded/truncated to this length
training_samples = 200      # number of samples used for training
validation_samples = 10000  # number of samples used for validation
max_words = 10000           # vocabulary size limit given to the Tokenizer

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
# %-formatting prints the same text on Python 2 and Python 3 (a two-argument
# print(...) would render as a tuple on Python 2).
print('Shape of data tensor: %s' % (data.shape,))
print('Shape of label tensor: %s' % (labels.shape,))

# Shuffle data and labels with the same permutation so they stay aligned;
# the samples were collected one label directory after the other.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# First `training_samples` rows for training, the next `validation_samples`
# rows for validation.
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]
                                         
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# Size of each word vector. This name was used below without ever being
# defined, which raised a NameError.
# NOTE(review): 100 is an assumed value — confirm or tune as needed.
embedding_dim = 100

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit: binary output
model.summary()
                                         