# Initialization

In [7]:
from theano.sandbox import cuda

In [4]:
import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM
from keras.layers import TimeDistributed, Activation
from keras.layers.core import Dense, Dropout
from keras.optimizers import Adam
import numpy as np
from numpy.random import choice

Using Theano backend.


In [5]:
# Read the whole corpus and lower-case it. The context manager closes the
# file handle as soon as the text is in memory instead of leaving it open.
with open('data/nietzsche.txt') as corpus_file:
    full_text = corpus_file.read().lower()
print('Number of characters in text: {}'.format(len(full_text)))

Number of characters in text: 600901


In [6]:
# Print 1000 first characters from the text
print(full_text[:1000])

preface


supposing that truth is a woman--what then? is there not ground
for suspecting that all philosophers, in so far as they have been
dogmatists, have failed to understand women--that the terrible
seriousness and clumsy importunity with which they have usually paid
their addresses to truth, have been unskilled and unseemly methods for
winning a woman? certainly she has never allowed herself to be won; and
at present every kind of dogma stands with sad and discouraged mien--if,
indeed, it stands at all! for there are scoffers who maintain that it
has fallen, that all dogma lies on the ground--nay more, that it is at
its last gasp. but to speak seriously, there are good grounds for hoping
that all dogmatizing in philosophy, whatever solemn, whatever conclusive
and decided airs it has assumed, may have been only a noble puerilism
and tyronism; and probably the time is at hand when it will be once
and again understood what has actually sufficed for the basis of such
imposing and abso

Now that the full text is stored in a variable, we need to build the text's vocabulary, i.e. the set of unique characters it contains (letters, digits, punctuation and whitespace/control characters).

In [8]:
# Distinct characters of the corpus in sorted order, with the NUL character
# (used as an end-of-string marker) placed in front so it gets index 0
vocab = ['\0'] + sorted(set(full_text))
vocab_size = len(vocab)

print('Vocabulary: {}'.format(''.join(vocab)))
print('Size of vocabulary: {}'.format(vocab_size))

Vocabulary:  
 !"'(),-.0123456789:;=?[]_abcdefghijklmnopqrstuvwxyz������
Size of vocabulary: 60


We create a dictionary mapping every character to its index in the vocabulary. We need it to turn the text into an array of indices instead of raw characters.

In [9]:
# Lookup table: character -> position of that character in `vocab`
char_to_index = {char: position for position, char in enumerate(vocab)}
# The corpus re-expressed as a list of vocabulary indices
text_as_index = [char_to_index[char] for char in full_text]
text_as_index[:10]

[43, 45, 32, 33, 28, 30, 32, 1, 1, 1]

In [10]:
len(text_as_index)

600901

# Preprocessing

The goal of our model is to predict, for every position in a sequence of 40 characters, the character that comes next, so the target is the same sequence shifted by one character. The sequence length of 40 is arbitrary.

In [11]:
# A "sentence" is an arbitrarily sized window of 40 consecutive character indices
length = 40
# Start offsets of the windows taken from the encoded text
window_starts = range(len(text_as_index) - length - 1)
# Inputs: the window beginning at each offset
sentences = [text_as_index[start : start+length] for start in window_starts]
# Targets: the same window shifted one character to the right
next_chars = [text_as_index[start+1 : start+length+1] for start in window_starts]

print('Number of sequences of 40 chars in text: {}'.format(len(sentences)))

Number of sequences of 40 chars in text: 600860


In [12]:
# Both names hold lists of equal-length integer lists, so NumPy can build the
# 2-D arrays in a single call; wrapping each row in its own np.array first
# only created one throw-away array per row.
sentences = np.array(sentences)
next_chars = np.array(next_chars)

print(sentences.shape)
print(next_chars.shape)


(600860, 40)
(600860, 40)


In [13]:
sentences[0]

array([43, 45, 32, 33, 28, 30, 32,  1,  1,  1, 46, 48, 43, 43, 42, 46, 36,
       41, 34,  2, 47, 35, 28, 47,  2, 47, 45, 48, 47, 35,  2, 36, 46,  2,
       28,  2, 50, 42, 40, 28])

In [14]:
next_chars[0]

array([45, 32, 33, 28, 30, 32,  1,  1,  1, 46, 48, 43, 43, 42, 46, 36, 41,
       34,  2, 47, 35, 28, 47,  2, 47, 45, 48, 47, 35,  2, 36, 46,  2, 28,
        2, 50, 42, 40, 28, 41])

# Model building

In [16]:
n_fac = 24 # Number of latent factors: size of the vector the Embedding layer learns for each character

In [17]:
model = Sequential()
# Map every character index to a learned vector of n_fac values
model.add(Embedding(vocab_size, n_fac, input_length=length))
# Two stacked LSTMs; return_sequences=True keeps one output per timestep
model.add(LSTM(512, input_dim=n_fac, return_sequences=True, dropout_U=0.2, dropout_W=0.2, consume_less='gpu'))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=True, dropout_U=0.2, dropout_W=0.2, consume_less='gpu'))
model.add(Dropout(0.2))
# Per-timestep projection onto the vocabulary, turned into probabilities
model.add(TimeDistributed(Dense(vocab_size)))
model.add(Activation('softmax'))

In [18]:
# The targets are integer character indices rather than one-hot vectors,
# hence the sparse variant of the categorical cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [27]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 40, 24)        1440        embedding_input_1[0][0]          
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 40, 512)       1099776     embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 40, 512)       0           lstm_1[0][0]                     
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 40, 512)       2099200     dropout_1[0][0]                  
___________________________________________________________________________________________

# Training

In [24]:
# First training epoch. np.expand_dims(..., -1) appends a trailing axis of
# size 1 to the integer targets before they are passed to fit.
model.fit(sentences, np.expand_dims(next_chars, -1), batch_size=64, nb_epoch=1)

Epoch 1/1


<keras.callbacks.History at 0x7f8072ea4890>

# Predictions

In [21]:
def print_preds(seed_string="ethics is a basic foundation of all that", n_chars=320):
    """Sample text from the model one character at a time and print it.

    Parameters
    ----------
    seed_string : str
        Text used to prime the model. Every character must be a key of
        `char_to_index` (a KeyError is raised otherwise). It should be at
        least `length` characters long so the first window has the size the
        model was built with.
    n_chars : int
        Number of characters to generate after the seed.

    Returns
    -------
    None
        The seed followed by the generated characters is printed.
    """
    for _ in range(n_chars):
        # Feed only the most recent `length` characters: the window size is
        # taken from the notebook-level `length` used to build the model
        # rather than repeating the literal 40 here.
        window = seed_string[-length:]
        seed_string_as_idx = np.array([char_to_index[char] for char in window])[np.newaxis]
        # predict() yields one distribution per timestep for the single
        # sample in the batch; keep the distribution of the last timestep.
        preds = model.predict(seed_string_as_idx)[0][-1]
        # Renormalise in float64: probabilities normalised in lower precision
        # can miss the sum-to-one check performed by numpy.random.choice.
        preds = np.asarray(preds, dtype=np.float64)
        preds = preds / np.sum(preds)

        # Sample the next character instead of always taking the arg-max,
        # so the generated text does not get stuck repeating itself.
        next_char = choice(vocab, p=preds)
        seed_string += next_char

    # Finally...
    print(seed_string)

In [22]:
print_preds()    

ethics is a basic foundation of all thatyzb]?g[wyvg�a=e)?0a7(s.�q=vb'dwe5x =ct(8qgx8i0p6f[:z0f"0sx[v6
l:�d3e5iwe]'12fc.c-()z�s�ul�w;pe0x_j;�q0p8oooy2zg28;s6'o4ca!72[=k3rs� "
894wan(;;!i]zw0"97f=�?i�h0�,
�zev�oct_q2!0u
iil�=x(0
gw�"
�6h2.u"=3ven,-'rm=4j?4c1t.ojjb('t;v j2u04h,n7u48p4�s;[_ueyqd�5
[']!3bni)!��0"��e]ct59v?ecz"0�_wz:mz.2=.5p�mb�:s? lg-6na2


In [24]:
# Restore previously trained weights. This file is the one written by the
# save_weights call at the end of the notebook, so it must already exist on
# disk from an earlier session for this cell to succeed.
model.load_weights('data/nietzsche_1.1838.h5')

In [25]:
print_preds()

ethics is a basic foundation of all that is good, or even belief in the origin of better," for which the exceptional faculty to decisive
individual!).)--his imperative emphasized him more and ultimate other cases of their motives for metaphysical worker), which, namely, insidious
class therein, on that sympathy for the domein of a pleasing indescribable natu


# Model's progression 

In [73]:
# Update the optimizer's existing learning-rate variable in place. Rebinding
# `model.optimizer.lr` to a Python float only replaces the attribute; a
# training function that has already been compiled keeps using the original
# backend variable, so the new rate would be ignored.
keras.backend.set_value(model.optimizer.lr, 0.001)

In [74]:
# One more epoch over the full set of windows, run after the learning-rate cell above.
model.fit(sentences, np.expand_dims(next_chars, -1), batch_size=64, nb_epoch=1)

Epoch 1/1


<keras.callbacks.History at 0x7f803c21bb10>

In [20]:
print_preds()

NameError: name 'print_preds' is not defined

In [76]:
# Lower the learning rate by writing into the optimizer's backend variable;
# assigning a plain float to `model.optimizer.lr` would not reach a training
# function that has already been compiled.
keras.backend.set_value(model.optimizer.lr, 0.0001)

In [77]:
# Another single epoch, run after the learning rate was lowered in the cell above.
model.fit(sentences, np.expand_dims(next_chars, -1), batch_size=64, nb_epoch=1)

Epoch 1/1


<keras.callbacks.History at 0x7f8038c8d710>

In [78]:
# Lower the learning rate once more, again through the backend variable so
# that the change is seen by the already-compiled training function.
keras.backend.set_value(model.optimizer.lr, 0.00001)

In [79]:
# Single epoch following the second learning-rate reduction in the cell above.
model.fit(sentences, np.expand_dims(next_chars, -1), batch_size=64, nb_epoch=1)

Epoch 1/1


<keras.callbacks.History at 0x7f8038c8d950>

In [80]:
# Five further epochs; no learning-rate cell precedes this one, so training
# continues with whatever rate the optimizer currently holds.
model.fit(sentences, np.expand_dims(next_chars, -1), batch_size=64, nb_epoch=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f805f54a7d0>

In [86]:
# Persist the trained weights; this is the same path the load_weights cell in
# the Predictions section reads from.
model.save_weights('data/nietzsche_1.1838.h5')