# Initialization

In [1]:
from theano.sandbox import cuda

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 770 (CNMeM is disabled, cuDNN 5103)


In [2]:
import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM
from keras.layers import TimeDistributed, Activation
from keras.layers.core import Dense, Dropout
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
import numpy as np
from numpy.random import choice

Using Theano backend.


In [5]:
# Load the whole corpus into memory as a single string.
# A `with` block guarantees the file handle is closed once the text is read.
with open('data/nietzsche.txt') as text_file:
    full_text = text_file.read()
print('Number of characters in text: {}'.format(len(full_text)))

Number of characters in text: 600901


In [7]:
# Preview the first 1000 characters of the text
print(full_text[:1000])

PREFACE


SUPPOSING that Truth is a woman--what then? Is there not ground
for suspecting that all philosophers, in so far as they have been
dogmatists, have failed to understand women--that the terrible
seriousness and clumsy importunity with which they have usually paid
their addresses to Truth, have been unskilled and unseemly methods for
winning a woman? Certainly she has never allowed herself to be won; and
at present every kind of dogma stands with sad and discouraged mien--IF,
indeed, it stands at all! For there are scoffers who maintain that it
has fallen, that all dogma lies on the ground--nay more, that it is at
its last gasp. But to speak seriously, there are good grounds for hoping
that all dogmatizing in philosophy, whatever solemn, whatever conclusive
and decided airs it has assumed, may have been only a noble puerilism
and tyronism; and probably the time is at hand when it will be once
and again understood WHAT has actually sufficed for the basis of such
imposing and abso

Now that the full text is stored in a variable, we need to build the text's vocabulary, i.e. the set of every unique character it contains (letters, digits, punctuation and control characters such as newlines).

In [8]:
# The vocabulary is every distinct character of the corpus, in sorted order,
# with the NUL character '\0' placed in front so that it owns index 0.
vocab = ['\0'] + sorted(set(full_text))
vocab_size = len(vocab)

print('Vocabulary: {}'.format(''.join(vocab)))
print('Size of vocabulary: {}'.format(vocab_size))

Vocabulary:  
 !"'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz������
Size of vocabulary: 86


We create a dictionary mapping every character to its index in the vocabulary. We need it to transform the text into an array of indices instead of raw characters.

In [9]:
# Lookup table: character -> position of that character in `vocab`.
char_to_index = {char: idx for idx, char in enumerate(vocab)}
# Encode the whole corpus as a list of vocabulary indices, one per character.
text_as_index = [char_to_index[char] for char in full_text]
# Show the first ten encoded characters.
text_as_index[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [10]:
# Sanity check: the encoded text holds one index per character of `full_text`.
len(text_as_index)

600901

# Preprocessing

For every input sequence of 40 characters, the model is trained to predict the same sequence shifted one character ahead, i.e. the character that follows each position of the input. The sequence length of 40 is arbitrary.

In [11]:
# A "sentence" is arbitrarily defined as a window of 40 consecutive characters.
length = 40

# Every window start offset that is used to build a training pair.
# NOTE: because of the extra `- 1`, the last window that would still fit
# entirely inside the text is not generated.
window_starts = range(len(text_as_index) - length - 1)

# Inputs: the window beginning at each offset.
sentences = [text_as_index[start : start + length] for start in window_starts]
# Targets: the same window shifted one character to the right, so position t
# of the target is the character that follows position t of the input.
next_chars = [text_as_index[start + 1 : start + length + 1] for start in window_starts]

print('Number of sequences of 40 chars in text: {}'.format(len(sentences)))

Number of sequences of 40 chars in text: 600860


In [12]:
# Turn the lists of equal-length index windows into 2-D arrays with one
# window per row.
sentences = np.array(sentences)
next_chars = np.array(next_chars)

for windows in (sentences, next_chars):
    print(windows.shape)


(600860, 40)
(600860, 40)


In [13]:
# Inspect the first input window (vocabulary indices).
sentences[0]

array([40, 42, 29, 30, 25, 27, 29,  1,  1,  1, 43, 45, 40, 40, 39, 43, 33,
       38, 31,  2, 73, 61, 54, 73,  2, 44, 71, 74, 73, 61,  2, 62, 72,  2,
       54,  2, 76, 68, 66, 54])

In [14]:
# Inspect the first target window: the first input window shifted by one character.
next_chars[0]

array([42, 29, 30, 25, 27, 29,  1,  1,  1, 43, 45, 40, 40, 39, 43, 33, 38,
       31,  2, 73, 61, 54, 73,  2, 44, 71, 74, 73, 61,  2, 62, 72,  2, 54,
        2, 76, 68, 66, 54, 67])

# Model building

In [15]:
# Size of the vector the Embedding layer learns for each character
# (passed as the second argument of Embedding in the model definition).
n_fac = 42

In [16]:
# Character-level language model, assembled layer by layer:
# embedding -> two stacked LSTMs (each followed by dropout) -> a softmax over
# the vocabulary at every time step.
model = Sequential()

# Map each character index to a vector of size n_fac.
model.add(Embedding(vocab_size, n_fac, input_length=length))

# Both LSTMs return their full output sequence so that a prediction can be
# made at every position of the window.
model.add(LSTM(512, input_dim=n_fac, return_sequences=True,
               dropout_U=0.2, dropout_W=0.2, consume_less='gpu'))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=True,
               dropout_U=0.2, dropout_W=0.2, consume_less='gpu'))
model.add(Dropout(0.2))

# One score per vocabulary entry at each time step, normalised by softmax.
model.add(TimeDistributed(Dense(vocab_size)))
model.add(Activation('softmax'))

In [17]:
# Compile with the Adam optimizer (default arguments) and the sparse
# categorical cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [18]:
# Print a per-layer overview of the model (output shapes and parameter counts).
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 40, 42)        3612        embedding_input_1[0][0]          
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 40, 512)       1136640     embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 40, 512)       0           lstm_1[0][0]                     
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 40, 512)       2099200     dropout_1[0][0]                  
___________________________________________________________________________________________

## Training

In [None]:
# Train for one epoch in mini-batches of 64.
# np.expand_dims(next_chars, -1) appends a trailing axis of size 1 to the
# integer targets (presumably the target shape the sparse cross-entropy loss
# expects - confirm).
model.fit(sentences, np.expand_dims(next_chars, -1), batch_size=64, nb_epoch=1)

## Predictions

In [20]:
def print_preds(seed_string="ethics is a basic foundation of all that", n_chars=320):
    """Generate text with the model, one sampled character at a time, and print it.

    Relies on the notebook globals `model`, `vocab`, `char_to_index` and
    `length`. Sampling uses numpy's global random state, so two calls
    generally print different text.

    Args:
        seed_string: Text used to prime the model. Only its last `length`
            characters are fed to the model, and every character must be a
            key of `char_to_index`. NOTE(review): a seed shorter than
            `length` is presumably rejected by the model, which was built
            with input_length=length - confirm.
        n_chars: Number of characters to generate and append to the seed.

    Returns:
        None. The seed followed by the generated characters is printed.
    """
    generated = seed_string
    for _ in range(n_chars):
        # Encode the most recent `length` characters as a batch of one window.
        window = generated[-length:]
        window_as_idx = np.array([char_to_index[char] for char in window])[np.newaxis]

        # The model outputs one distribution over the vocabulary per time
        # step; keep the one for the last position, i.e. the next character.
        preds = model.predict(window_as_idx)[0][-1]
        # Renormalise so the probabilities handed to `choice` sum to 1.
        preds = preds / np.sum(preds)

        # Sample the next character instead of taking the argmax.
        next_char = choice(vocab, p=preds)
        generated += next_char

    # Finally...
    print(generated)

In [26]:
# Generate and print a sample with the model's current weights.
print_preds()    

IndexError: One of the index value is out of bound. Error code: 65535.\n
Apply node that caused the error: GpuAdvancedSubtensor1(embedding_1_W, Elemwise{Cast{int64}}.0)
Toposort index: 35
Inputs types: [CudaNdarrayType(float32, matrix), TensorType(int64, vector)]
Inputs shapes: [(60, 24), (40,)]
Inputs strides: [(24, 1), (8,)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[GpuReshape{3}(GpuAdvancedSubtensor1.0, MakeVector{dtype='int64'}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

In [22]:
# Restore model weights from disk.
# NOTE(review): the same path is only written by the save_weights call in the
# last cell of this notebook, so on a fresh run the file must already exist.
# NOTE(review): presumably the file must have been saved from a model with
# this exact architecture - confirm.
model.load_weights('data/nietzsche_1.1838.h5')

In [23]:
# Generate and print a sample using the weights loaded from disk.
print_preds()

IndexError: One of the index value is out of bound. Error code: 65535.\n
Apply node that caused the error: GpuAdvancedSubtensor1(embedding_1_W, Elemwise{Cast{int64}}.0)
Toposort index: 35
Inputs types: [CudaNdarrayType(float32, matrix), TensorType(int64, vector)]
Inputs shapes: [(60, 24), (40,)]
Inputs strides: [(24, 1), (8,)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[GpuReshape{3}(GpuAdvancedSubtensor1.0, MakeVector{dtype='int64'}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

## Model's progression 

In [None]:
model.optimizer.lr = 0.001

In [None]:
model.fit(sentences, np.expand_dims(next_chars, -1), batch_size=64, nb_epoch=1)

In [None]:
print_preds()

In [None]:
model.optimizer.lr = 0.0001

In [None]:
model.fit(sentences, np.expand_dims(next_chars, -1), batch_size=64, nb_epoch=1)

In [None]:
model.optimizer.lr = 0.00001

In [None]:
model.fit(sentences, np.expand_dims(next_chars, -1), batch_size=64, nb_epoch=1)

In [None]:
# Continue training for five more epochs with the current optimizer settings.
model.fit(sentences, np.expand_dims(next_chars, -1), batch_size=64, nb_epoch=5)

In [None]:
# Save the trained weights to disk.
# NOTE(review): "1.1838" in the file name presumably records the loss reached - confirm.
model.save_weights('data/nietzsche_1.1838.h5')