In [1]:
# We download our dataset for the training

!pip3 install gutenbergpy
import datasource
datasource.download_dataset()

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Cache already exists
0 / 28
1 / 28
2 / 28
3 / 28
4 / 28
5 / 28
6 / 28
7 / 28
8 / 28
9 / 28
10 / 28
11 / 28
12 / 28
13 / 28
14 / 28
15 / 28
16 / 28
17 / 28
18 / 28
19 / 28
20 / 28
21 / 28
22 / 28
23 / 28
24 / 28
25 / 28
26 / 28
27 / 28


In [2]:
!pip3 install keras_nlp
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
import tensorflow_text as tf_text
import keras_nlp
import numpy as np
import pickle
import random
from pathlib import Path

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [3]:
APP_DIR = Path.home() / ".lafontaine"
DATASET = APP_DIR / "dataset.txt"
VECTORIZER = APP_DIR / "vectorizer.pkl"

In [4]:
with DATASET.open('r') as stream:
    verses = stream.read().splitlines()
    verses = verses[0:10000]    
    verses = [' '.join(v.split(' ')[::-1]) for v in verses]
    
print(len(verses))   
print(verses[0])

    
# We force it at 50 or it won't fit in GPU memory
seq_length = len(max(verses, key=lambda v: len(v.split())))
# seq_length = 50


def nw_split(text_input):
    splitted = tf_text.regex_split(input=text_input,
            delim_regex_pattern="[^\[a-zA-ZÀ-ÿœ']+|\s+", # We split on french words and punctuations
            keep_delim_regex_pattern="[a-zA-ZÀ-ÿœ']" # We keep everything that isn't a sequence of whitespaces
            )
    return splitted

vectorizer = TextVectorization(
    split=nw_split,
    standardize='lower',
    output_mode="int",
    output_sequence_length=seq_length + 1,
)

vectorizer.adapt(verses)
vocab = vectorizer.get_vocabulary()
vocab_size = len(vocab)
index_lookup = dict(zip(range(len(vocab)), vocab))

index_lookup

182526


{0: '',
 1: '[UNK]',
 2: 'et',
 3: 'de',
 4: 'la',
 5: 'les',
 6: 'le',
 7: 'des',
 8: 'un',
 9: 'à',
 10: 'en',
 11: 'que',
 12: 'qui',
 13: 'dans',
 14: 'je',
 15: 'du',
 16: 'au',
 17: 'sur',
 18: 'son',
 19: 'comme',
 20: 'pour',
 21: 'il',
 22: 'est',
 23: 'se',
 24: 'ce',
 25: 'vous',
 26: 'une',
 27: 'ne',
 28: 'plus',
 29: 'ses',
 30: 'mon',
 31: 'sa',
 32: 'aux',
 33: 'nous',
 34: 'par',
 35: 'pas',
 36: 'où',
 37: 'tout',
 38: 'leur',
 39: 'avec',
 40: 'leurs',
 41: 'sans',
 42: "d'un",
 43: 'si',
 44: 'tu',
 45: 'ma',
 46: 'mais',
 47: 'sous',
 48: 'me',
 49: 'ton',
 50: 'mes',
 51: 'ces',
 52: 'moi',
 53: 'cœur',
 54: 'on',
 55: 'a',
 56: 'sont',
 57: 'ou',
 58: 'vers',
 59: "c'est",
 60: 'elle',
 61: 'dont',
 62: 'yeux',
 63: 'tes',
 64: 'nos',
 65: 'tous',
 66: 'lui',
 67: 'ils',
 68: 'fait',
 69: 'là',
 70: 'bien',
 71: 'ciel',
 72: 'vos',
 73: 'ta',
 74: 'quand',
 75: 'l',
 76: 'même',
 77: 'cette',
 78: 'jour',
 79: 'votre',
 80: 'te',
 81: "qu'il",
 82: 'toujours',
 8

In [5]:
random.shuffle(verses)
length = len(verses)
text_train = verses[:int(0.7*length)]
text_test = verses[int(0.7*length):int(0.85*length)]
text_valid = verses[int(0.85*length):]

batch_size = 32

train_dataset = tf.data.Dataset.from_tensor_slices(text_train)
train_dataset = train_dataset.shuffle(buffer_size=256)
train_dataset = train_dataset.batch(batch_size)

test_dataset = tf.data.Dataset.from_tensor_slices(text_test)
test_dataset = test_dataset.shuffle(buffer_size=256)
test_dataset = test_dataset.batch(batch_size)

valid_dataset = tf.data.Dataset.from_tensor_slices(text_valid)
valid_dataset = valid_dataset.shuffle(buffer_size=256)
valid_dataset = valid_dataset.batch(batch_size)

def preprocess_text(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorizer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y

train_dataset = train_dataset.map(preprocess_text)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

test_dataset = test_dataset.map(preprocess_text)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

valid_dataset = valid_dataset.map(preprocess_text)
valid_dataset = valid_dataset.prefetch(tf.data.AUTOTUNE)

In [6]:
embed_dim = 128
num_heads = 4

def create_model():
    inputs = keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(vocab_size, seq_length, embed_dim)(inputs)
    decoder = keras_nlp.layers.TransformerDecoder(intermediate_dim=embed_dim, 
                                                            num_heads=num_heads, 
                                                            dropout=0.5)(embedding_layer)
    
    outputs = keras.layers.Dense(vocab_size, activation='softmax')(decoder)
    
    model = keras.Model(inputs=inputs, outputs=outputs)
    
    model.compile(
        optimizer="adam", 
        loss='sparse_categorical_crossentropy',
        metrics=[keras_nlp.metrics.Perplexity(), 'accuracy']
    )
    return model

model = create_model()
model.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 67)]              0         
                                                                 
 token_and_position_embeddin  (None, 67, 128)          4521600   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_decoder (Transf  (None, 67, 128)          99584     
 ormerDecoder)                                                   
                                                                 
 dense (Dense)               (None, 67, 35258)         4548282   
                                                                 
Total params: 9,169,466
Trainable params: 9,169,466
Non-trainable params: 0
___________________________________________________

In [7]:
class TextSampler(keras.callbacks.Callback):
    def __init__(self, start_prompt, max_tokens):
        self.start_prompt = start_prompt
        self.max_tokens = max_tokens
        
    # Helper method to choose a word from the top K probable words with respect to their probabilities
    # in a sequence
    def sample_token(self, logits):
        logits, indices = tf.math.top_k(logits, k=5, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def on_epoch_end(self, epoch, logs=None):
        decoded_sample = self.start_prompt
        
        for i in range(self.max_tokens-1):
            tokenized_prompt = vectorizer([decoded_sample])[:, :-1]
            predictions = self.model.predict([tokenized_prompt], verbose=0)
            # To find the index of the next word in the prediction array.
            # The tokenized prompt is already shorter than the original decoded sample
            # by one, len(decoded_sample.split()) is two words ahead - so we remove 1 to get
            # the next word in the sequence
            sample_index = len(decoded_sample.strip().split())-1
            
            sampled_token = self.sample_token(predictions[0][sample_index])
            sampled_token = index_lookup[sampled_token]
            decoded_sample += " " + sampled_token
            
        print(f"\nSample text:\n{decoded_sample}...\n")

# First 5 words of a random sentence to be used as a seed
random_sentence = ' '.join(random.choice(text_valid).replace('\n', ' ').split(' ')[:4])

# Sampler from seed
sampler = TextSampler(random_sentence, 30)

# Reduce learning rate when a metric has stopped improving.
reducelr = keras.callbacks.ReduceLROnPlateau(patience=10, monitor='val_loss')

In [16]:
#model = create_model()
#history = model.fit(train_dataset, 
#                    validation_data=valid_dataset,
#                    epochs=15, 
#                    callbacks=[sampler, reducelr])
# Save model
#model.save("sentence_model")
model = keras.models.load_model("sentence_model", compile=False)



In [17]:
def sample_token(logits):
        logits, indices = tf.math.top_k(logits, k=5, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

def generate_text(prompt, response_length=20):
    decoded_sample = prompt
    for i in range(response_length-1):
        tokenized_prompt = vectorizer([decoded_sample])[:, :-1]
        predictions = model.predict([tokenized_prompt], verbose=0)
        sample_index = len(decoded_sample.strip().split())-1
        sampled_token = sample_token(predictions[0][sample_index])
        sampled_token = index_lookup[sampled_token]
        decoded_sample += " " + sampled_token
    return decoded_sample

In [22]:
generate_text("Nous")

"Nous sommes deux nous nous avons daté les ans  elle à sa voix  de ce un  n'a"