In [5]:
import warnings
# Silence every warning for the rest of the session: the filter is installed
# with no category or module restriction.
warnings.filterwarnings(action='ignore')

In [123]:
import numpy as np
import pickle
import re

import keras
import tensorflow as tf

## Open and preprocess text

In [9]:
# Read the three cleaned corpus files as UTF-8, one string per file.
with open("data/moliere_1_clean.txt",encoding='utf-8') as f1:
    moliere_1 = f1.read()
with open("data/moliere_2_clean.txt",encoding='utf-8') as f2:
    moliere_2 = f2.read()
with open("data/moliere_3_clean.txt",encoding='utf-8') as f3:
    moliere_3 = f3.read()

In [10]:
# Glue the three volumes into a single corpus string, one space between volumes.
text = ' '.join((moliere_1, moliere_2, moliere_3))
len(text)

1275253

### Remove special characters

In [11]:
# Ordered (old, new) substitutions, applied one after the other: ligatures are
# expanded, a few accented letters lose their accent, stray symbols and Greek
# letters are dropped, and newlines become spaces.
_char_substitutions = (
    ('œ', 'oe'),
    ('æ', 'ae'),
    ('î', 'i'),
    ('ï', 'i'),
    ('º', ''),
    ('_', ''),
    ('ñ', 'n'),
    ('λ', ''),
    ('ο', ''),
    ('ρ', ''),
    ('ς', ''),
    ('φ', ''),
    ('β', ''),
    ('ε', ''),
    ('ι', ''),
    ('É', 'E'),
    ('È', 'E'),
    ('Ê', 'E'),
    ('Ç', 'C'),
    ('\n', ' '),
)
for _old, _new in _char_substitutions:
    text = text.replace(_old, _new)

# Collapse every run of spaces into a single space.
text = re.sub(r'( )+',' ',text)

text[:100]

'Eh bien, Sabine, quel conseil me donnes-tu? Vraiment, il y a bien des nouvelles. Mon oncle veut réso'

In [12]:
# Vocabulary = the distinct characters of the cleaned corpus, in sorted order.
character_list = list(set(text))
character_list.sort()
N_char = len(character_list)

print('Length of vocab:',N_char)
print(character_list)

Length of vocab: 86
[' ', '!', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '«', '»', 'à', 'â', 'ç', 'è', 'é', 'ê', 'ë', 'ô', 'ù', 'û']


### Tokenize text

In [13]:
from nltk.tokenize import sent_tokenize
# Split the cleaned corpus into sentences with NLTK's sentence tokenizer,
# called with its default arguments.
sentences = sent_tokenize(text)

In [14]:
# Length, in characters, of the longest tokenized sentence.
max(map(len, sentences))

1234

### Encode characters

In [15]:
# Map each vocabulary character to its position in `character_list`.
char2code = {symbol: position for position, symbol in enumerate(character_list)}

print(char2code)

{' ': 0, '!': 1, "'": 2, '(': 3, ')': 4, ',': 5, '-': 6, '.': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, ':': 18, ';': 19, '?': 20, 'A': 21, 'B': 22, 'C': 23, 'D': 24, 'E': 25, 'F': 26, 'G': 27, 'H': 28, 'I': 29, 'J': 30, 'K': 31, 'L': 32, 'M': 33, 'N': 34, 'O': 35, 'P': 36, 'Q': 37, 'R': 38, 'S': 39, 'T': 40, 'U': 41, 'V': 42, 'X': 43, 'Y': 44, 'Z': 45, '[': 46, ']': 47, 'a': 48, 'b': 49, 'c': 50, 'd': 51, 'e': 52, 'f': 53, 'g': 54, 'h': 55, 'i': 56, 'j': 57, 'k': 58, 'l': 59, 'm': 60, 'n': 61, 'o': 62, 'p': 63, 'q': 64, 'r': 65, 's': 66, 't': 67, 'u': 68, 'v': 69, 'w': 70, 'x': 71, 'y': 72, 'z': 73, '«': 74, '»': 75, 'à': 76, 'â': 77, 'ç': 78, 'è': 79, 'é': 80, 'ê': 81, 'ë': 82, 'ô': 83, 'ù': 84, 'û': 85}


In [16]:
# Inverse lookup: integer code -> character.
code2char = dict(zip(char2code.values(), char2code.keys()))
print(code2char)

{0: ' ', 1: '!', 2: "'", 3: '(', 4: ')', 5: ',', 6: '-', 7: '.', 8: '0', 9: '1', 10: '2', 11: '3', 12: '4', 13: '5', 14: '6', 15: '7', 16: '8', 17: '9', 18: ':', 19: ';', 20: '?', 21: 'A', 22: 'B', 23: 'C', 24: 'D', 25: 'E', 26: 'F', 27: 'G', 28: 'H', 29: 'I', 30: 'J', 31: 'K', 32: 'L', 33: 'M', 34: 'N', 35: 'O', 36: 'P', 37: 'Q', 38: 'R', 39: 'S', 40: 'T', 41: 'U', 42: 'V', 43: 'X', 44: 'Y', 45: 'Z', 46: '[', 47: ']', 48: 'a', 49: 'b', 50: 'c', 51: 'd', 52: 'e', 53: 'f', 54: 'g', 55: 'h', 56: 'i', 57: 'j', 58: 'k', 59: 'l', 60: 'm', 61: 'n', 62: 'o', 63: 'p', 64: 'q', 65: 'r', 66: 's', 67: 't', 68: 'u', 69: 'v', 70: 'w', 71: 'x', 72: 'y', 73: 'z', 74: '«', 75: '»', 76: 'à', 77: 'â', 78: 'ç', 79: 'è', 80: 'é', 81: 'ê', 82: 'ë', 83: 'ô', 84: 'ù', 85: 'û'}


### One-hot encoding

In [572]:
#from keras.utils.np_utils import to_categorical

#categorical_labels = to_categorical(int_labels, num_classes=None)

In [17]:
def one_hots(sequence, vocab_size=N_char):
    """One-hot encode a sequence of characters.

    Every element of `sequence` is looked up in the notebook-level `char2code`
    mapping (a character missing from it raises KeyError). Row i of the result
    holds a single 1 in the column given by the code of ``sequence[i]``.

    Returns an array of shape ``(len(sequence), vocab_size)``.
    """
    n_rows = len(sequence)
    encoded = np.zeros((n_rows, vocab_size))
    # Explicit integer dtype so an empty sequence still yields a valid index array.
    codes = np.fromiter((char2code[ch] for ch in sequence), dtype=np.intp, count=n_rows)
    encoded[np.arange(n_rows), codes] = 1
    return encoded

In [18]:
def textify(embedding, character_list=character_list):
    """Decode a 2D array of per-character scores back into a string.

    For each row of `embedding`, the column with the highest value is taken as
    an index into `character_list`; the selected characters are concatenated
    in row order.
    """
    best_codes = np.argmax(embedding, axis=1)
    return "".join(character_list[int(code)] for code in best_codes)

### Prepare dataset

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# One-hot encode the whole corpus: one row per character, one column per
# vocabulary entry.
raw = one_hots(text)
raw.shape

(1274945, 86)

In [21]:
window_size = 64
step = 16
L = raw.shape[0]

# Each sample is the `window_size` rows that end just before position k
# (k itself excluded); its target is the row at position k.
_targets = range(window_size, L, step)
x = [raw[k - window_size:k, :] for k in _targets]
y = [raw[k, :] for k in _targets]

In [22]:
# Stack the per-window lists into two dense arrays: inputs and targets.
X, Y = np.array(x), np.array(y)
print(X.shape,Y.shape)

(79681, 64, 86) (79681, 86)


In [25]:
# Cleaning up: remove the window lists and the full one-hot corpus from the
# namespace now that X and Y have been built from them.
del x, y, raw

In [23]:
# 70/30 split with shuffle=False: the samples keep their corpus order, so the
# training set is the first 70% of the windows and the test set the remainder.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,train_size=0.7,shuffle=False)
print(Xtrain.shape,Ytrain.shape,Xtest.shape, Ytest.shape)

(55776, 64, 86) (55776, 86) (23905, 64, 86) (23905, 86)


In [26]:
# Total number of windows (train + test).
N_samples = len(X)
N_samples

79681

## Keras model

### Core model

In [132]:
from keras.models import Sequential
from keras.optimizers import RMSprop, Adam
from keras.layers import Dense, GRU, Embedding, Dropout, LSTM
from keras.layers.wrappers import TimeDistributed
from keras.callbacks import ModelCheckpoint, BaseLogger

#from sklearn.metrics import mean_squared_error

keras.layers.GRU(units, activation='tanh', recurrent_activation='hard_sigmoid', use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, activity_regularizer=None, kernel_constraint=None, recurrent_constraint=None, bias_constraint=None, dropout=0.0, recurrent_dropout=0.0, implementation=1, return_sequences=False, return_state=False, go_backwards=False, stateful=False, unroll=False, reset_after=False)

Input shape: a 3D tensor of shape `(batch_size, timesteps, input_dim)`; in this notebook `timesteps = window_size` and `input_dim = N_char`.

In [133]:
# Next-character classifier: one LSTM layer over the one-hot window, dropout,
# then a softmax over the vocabulary.
model = Sequential([
    LSTM(256, input_shape=(window_size, N_char), recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(N_char, activation='softmax'),
])

adam_optimizer = Adam(lr=0.01, decay=1e-4)
model.compile(
    optimizer=adam_optimizer,
    loss='categorical_crossentropy',
    metrics=['acc'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 256)               351232    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 86)                22102     
Total params: 373,334
Trainable params: 373,334
Non-trainable params: 0
_________________________________________________________________


In [134]:
# Checkpoint callback: write the weights to disk only when the monitored
# validation accuracy reaches a new maximum.
filepath="weights-keras.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
def my_fit(model,X,Y,**kwargs):
    """Train `model` and return a history that continues any earlier run.

    Calls ``model.fit(X, Y, **kwargs)``. If a notebook-level ``history``
    object from a previous call exists, its per-metric lists are prepended to
    the new ones, so the returned object covers every epoch trained so far.
    ``epoch`` on the returned object is renumbered 1..N over the merged run.

    Parameters
    ----------
    model : object exposing ``fit(X, Y, **kwargs)``; the returned object must
        carry a ``history`` dict mapping metric name -> list of per-epoch values.
    X, Y : training inputs and targets, forwarded unchanged.
    **kwargs : forwarded unchanged to ``model.fit``.

    Returns
    -------
    The object returned by ``model.fit``, with ``history`` and ``epoch`` updated.
    """
    # Train model and get the history of this run only
    new_hist = model.fit(X,Y,**kwargs)

    # `history` is a notebook global that only exists once a previous run has
    # been stored; a missing name simply means there is nothing to merge.
    try:
        previous = history
    except NameError:
        previous = None

    previous_metrics = getattr(previous, 'history', None)
    # Skip the merge if fit() handed back the very same object, otherwise the
    # metric lists would be duplicated.
    if isinstance(previous_metrics, dict) and previous is not new_hist:
        for metric, old_values in previous_metrics.items():
            if metric in new_hist.history:
                new_hist.history[metric] = list(old_values) + list(new_hist.history[metric])

    # Renumber epochs 1..N over the merged run; use the longest metric list
    # rather than a hard-coded metric name.
    n_epochs = max((len(values) for values in new_hist.history.values()), default=0)
    new_hist.epoch = list(range(1, n_epochs + 1))

    return new_hist

In [None]:
# Train through my_fit (a wrapper around model.fit) and keep the history
# object it returns; validation runs on the held-out windows.
history = my_fit(
    model,
    Xtrain,
    Ytrain,
    batch_size=256,
    epochs=10,
    validation_data=(Xtest,Ytest),
    callbacks=callbacks_list,
)

Train on 55776 samples, validate on 23905 samples
Epoch 1/10

Epoch 00001: val_acc did not improve from 0.48078
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.48078
Epoch 3/10

In [None]:
import datetime

now = datetime.datetime.now()
# Dump one line per recorded epoch (losses and accuracies) to a timestamped text file.
with open('keras_log - {:%Y-%m-%d %H_%M}.txt'.format(now),'w') as file:
    metrics = history.history
    loss = metrics['loss']
    val_loss = metrics['val_loss']
    acc = metrics['acc']
    val_acc = metrics['val_acc']
    row_template = 'Epoch {:03d} | Train_loss {:.3f} | Val_loss {:.3f} | Train_acc {:.3f} | Val_acc {:.3f}\n'
    file.writelines(
        row_template.format(*row)
        for row in zip(history.epoch, loss, val_loss, acc, val_acc)
    )

### Helper functions

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
def predict_next(prefix):
    """Run the model on `prefix` and return its squeezed prediction.

    The last `window_size` characters of `prefix` are one-hot encoded and
    right-aligned inside a zero-filled ``(window_size, N_char)`` array; shorter
    prefixes therefore leave the leading rows at zero. The array is passed to
    the notebook-level `model` as a batch of one.

    An empty prefix produces an all-zero window instead of raising.
    """
    window = np.zeros(shape=(window_size,N_char))
    encoded = one_hots(prefix)[-window_size:]
    # Guard the empty case: `window[-0:]` selects the WHOLE array, and
    # assigning a zero-row block into it fails to broadcast.
    if len(encoded):
        window[-len(encoded):,:] = encoded
    return model.predict(window.reshape(1,-1,N_char)).squeeze()

In [None]:
def sample(preds, temperature=1.0):
    """Sample one index from a probability array after temperature scaling.

    The probabilities are rescaled as ``p_i ** (1 / temperature)`` and
    renormalised: values below 1 sharpen the distribution towards its mode,
    values above 1 flatten it, and 1.0 leaves it unchanged.

    Parameters
    ----------
    preds : array-like of non-negative probabilities.
    temperature : float. A value <= 0 is treated as greedy decoding and
        returns the index of the largest probability.

    Returns
    -------
    Integer index of the drawn entry.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        return np.argmax(preds)

    # Work in log space; log(0) = -inf is intended (a zero probability stays
    # zero after scaling), so silence the divide warning locally.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Subtract the max before exponentiating so small temperatures cannot overflow.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)

    return np.argmax(probas)

In [None]:
def multi_sample(prefix,n=10,temperature=1.0):
    """Generate `n` characters one at a time, starting from `prefix`.

    At every step the current context is passed to `predict_next`, one code is
    drawn with `sample` and mapped through `code2char`, and the resulting
    character is appended to the context for the next step.

    Returns the list of generated characters (the prefix is not included).
    """
    generated = []
    context = prefix
    for _step in range(n):
        code = sample(predict_next(context), temperature=temperature)
        char = code2char[code]
        generated.append(char)
        context += char
    return generated

In [None]:
def predict_following(prefix,n=20,temperature=1.0):
    """Return `prefix` followed by `n` characters generated by `multi_sample`."""
    continuation = multi_sample(prefix, n, temperature)
    return prefix + ''.join(continuation)

### Test and visualization

In [None]:
# Complete the same prompt with 60 generated characters at several temperature settings.
for temp in [0.05,0.1,0.2,0.5,1,2]:
    completion = predict_following(
        'Bonjour cher ami, que je suis heureux de vo',
        n=60,
        temperature=temp,
    )
    print('Temp:',temp,'\t',completion)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

#plt.figure(figsize=(24, 8), dpi= 600);
fig, axes = plt.subplots(2,sharex=True,figsize=(16, 6));
ax1,ax2 = axes
#plt.xscale('log')
#plt.yscale('log')

ax1.plot(history.history['loss'],'b.-',label='train_loss')
ax1.plot(history.history['val_loss'],'r.-',label='validation_loss')
ax1.grid();
ax1.legend();

ax2.plot(history.history['acc'],'cx-',label='train_acc');
ax2.plot(history.history['val_acc'],'mx-',label='validation_acc');
ax2.grid();
ax2.legend();