<a href="https://colab.research.google.com/github/CeHaga/machado-gru/blob/main/Machado_Assis_GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machado de Assis GRU
This notebook trains a Gated Recurrent Unit using texts from Brazilian author Machado de Assis.

It's based on this TensorFlow tutorial, which trains on Shakespeare's works: https://www.tensorflow.org/tutorials/text/text_generation?hl=en

## Setup

In [None]:
# Basic packages
import tensorflow as tf
import numpy as np
import os
import time

In [None]:
# Read poetry file
path = '/content/drive/MyDrive/Machado_LSTM/poesias.txt'
# Decode explicitly instead of relying on the platform default (the corpus contains
# accented Portuguese characters; UTF-8 is assumed here — TODO confirm), and use a
# context manager so the file handle is closed as soon as the text is read.
with open(path, encoding='utf-8') as corpus_file:
  text = corpus_file.read()
print('Length of text: {} characters'.format(len(text)))

Length of text: 650723 characters


In [None]:
# Build the vocabulary: the distinct characters of the corpus, in sorted order
vocab = list(set(text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

120 unique characters


In [None]:
# Directory where the checkpoints will be saved.
# NOTE(review): this is a Google Drive path, so it presumably requires Drive to be
# mounted at /content/drive before the notebook runs — confirm.
checkpoint_dir = '/content/drive/MyDrive/Machado_LSTM/training_checkpoints'

## Text Processing

### Mapping
Create a mapping from unique characters to indices and *vice versa*

In [None]:
# Lookup table from every vocabulary character to its integer index
char2idx = dict(zip(vocab, range(len(vocab))))

# Preview the first (up to) 8 entries of the mapping
print('{')
for char, index in list(char2idx.items())[:8]:
  print('  {:4s}: {:3d},'.format(repr(char), index))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '"' :   3,
  "'" :   4,
  '(' :   5,
  ')' :   6,
  '*' :   7,
  ...
}


In [None]:
# Inverse lookup: position i of this array holds the character whose index is i
idx2char = np.array(vocab)

# Preview the first 8 entries, formatted like a dict literal
preview_lines = ['  {:d}: {:s},'.format(index, repr(idx2char[index])) for index in range(8)]
print('\n'.join(['{'] + preview_lines + ['  ...\n}']))

{
  0: '\n',
  1: ' ',
  2: '!',
  3: '"',
  4: "'",
  5: '(',
  6: ')',
  7: '*',
  ...
}


In [None]:
# Encode the whole corpus as an array of vocabulary indices
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

# Show the first ten characters next to their integer encoding
print(
    '{} ---- characters mapped to int ---- > {}'.format(
        repr(text[:10]),
        text_as_int[:10]))

'O ALMADA\nP' ---- characters mapped to int ---- > [39  1 25 36 37 25 28 25  0 40]


## Model Creation

### Create Input Sequences

In [None]:
# Number of characters in each model input
seq_length = 100
# Every example consumes seq_length + 1 characters (the input plus its shifted target)
examples_per_epoch = len(text) // (seq_length + 1)

# Stream the encoded corpus one character index at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first five elements, decoded back to characters
for char_index in char_dataset.take(5).as_numpy_iterator():
  print(idx2char[char_index])

O
 
A
L
M


In [None]:
# Group the character stream into fixed-size chunks; an incomplete final chunk is dropped
chunk_size = seq_length + 1
sequences = char_dataset.batch(chunk_size, drop_remainder=True)

# Decode the first five chunks back to text as a sanity check
for chunk in sequences.take(5):
  chunk_text = ''.join(idx2char[chunk.numpy()])
  print(repr(chunk_text))

'O ALMADA\nPoema herói-cômico em 8 cantos\n(Fragmentos)\nADVERTÊNCIA\nO assunto deste poema é rigorosament'
'e histórico. Em 1659, era prelado administrador do Rio\nde Janeiro o Dr. Manuel de Sousa Almada, presb'
'ítero do hábito de São Pedro. Um tabelião, por\nnome Sebastião Ferreira Freire, foi vítima de uma assu'
'ada, em certa noite, na ocasião em que\nse recolhia para casa. Queixando-se ao ouvidor-geral Pedro de '
'Mustre Portugal, abriu este\ndevassa, vindo a saber-se que eram autores do delito alguns fâmulos do pr'


In [None]:
# Create an input vector and a target vector

def split_input_target(chunk):
  """Split a chunk into an (input, target) pair offset by one position.

  Args:
    chunk: Any sliceable sequence (here, a chunk of character indices).

  Returns:
    A tuple ``(input, target)``: the chunk without its last element, and the
    chunk without its first element, so ``target[i]`` follows ``input[i]``.
  """
  return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair
dataset = sequences.map(split_input_target)

# Decode the first pair back to text to check the one-character shift
for input_example, target_example in dataset.take(1):
  input_text = ''.join(idx2char[input_example.numpy()])
  target_text = ''.join(idx2char[target_example.numpy()])
  print('Input data: ', repr(input_text))
  print('Target data:', repr(target_text))

Input data:  'O ALMADA\nPoema herói-cômico em 8 cantos\n(Fragmentos)\nADVERTÊNCIA\nO assunto deste poema é rigorosamen'
Target data: ' ALMADA\nPoema herói-cômico em 8 cantos\n(Fragmentos)\nADVERTÊNCIA\nO assunto deste poema é rigorosament'


### Create Batches

In [None]:
# Set batch size
batch_size = 64
# Number of examples held in the shuffle buffer
buffer_size = 10000

# Build the pipeline from `sequences` instead of re-wrapping `dataset`: the original
# `dataset = dataset.shuffle(...).batch(...)` batched an already-batched dataset
# whenever this cell was run a second time. This form is idempotent.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

### Create Model

In [None]:
# Number of distinct characters; used for both the embedding input size and the
# width of the final Dense layer in build_model
vocab_size = len(vocab)
# Passed to build_model as the output dimension of the Embedding layer
embedding_dim = 256
# Passed to build_model as the number of units in the GRU layer
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level network: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Number of distinct characters; sizes the embedding input
            and the output layer.
        embedding_dim: Output dimension of the embedding layer.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size baked into the input shape (the GRU is
            created with ``stateful=True``).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose Dense output layer has
        ``vocab_size`` units and no activation.
    """
    # Meaning
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None])
    # Context
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    # Output
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

# Instantiate the training model and list its layers and parameter counts
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           30720     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 120)           123000    
Total params: 4,092,024
Trainable params: 4,092,024
Non-trainable params: 0
_________________________________________________________________


### Check a Prediction Before Training

In [None]:
# Push a single batch through the model. Taking the batch directly avoids
# pre-initialising `example_batch_predictions` with a string placeholder.
input_example_batch, target_example_batch = next(iter(dataset.take(1)))
example_batch_predictions = model(input_example_batch)

# Sample one character id per time step from the outputs for the first sequence
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

# Convert the input tensor to NumPy explicitly before indexing idx2char,
# as the other cells in this notebook do.
print("Input: \n", repr("".join(idx2char[input_example_batch[0].numpy()])))
print("\nNext Char Predictions: \n", repr("".join(idx2char[sampled_indices])))

Input: 
 ' um irmão o afeto casto,\nTanto pudor nessa criatura havia!\nNem um som despertava em nossos lábios;\nE'

Next Char Predictions: 
 "2ÍiNêÇ*AMÚ’Ô!ñ°lTfvVm:Â2ôêé´\n_ÕêÃ”ê6!zÓYevém;mMnª5P”5c/6:uáDñ11kÁ0O)díî'WmùÂvU‘mºjo9mEXi)Íb Vm(ñàñûq"


## Training

In [None]:
# Compile model
def loss(labels, logits):
  """Sparse categorical cross-entropy computed on raw model outputs.

  Args:
    labels: Target class indices.
    logits: Model outputs, passed with ``from_logits=True``.

  Returns:
    The result of ``tf.keras.losses.sparse_categorical_crossentropy``.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels,
      y_pred=logits,
      from_logits=True)

# Attach the custom loss and the Adam optimizer to the model
model.compile(
    loss=loss,
    optimizer='adam')

In [None]:
# Configure checkpoints

# File name pattern for the checkpoints; "{epoch}" is a placeholder in the name
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback handed to `fit`; stores weights only rather than the whole model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix)

In [None]:
# Train
epochs = 50

# `fit` trains from `initial_epoch` until epoch index `epochs` is reached, so the
# previous hard-coded `initial_epoch=50` with `epochs=50` trained nothing at all.
# Resume from the newest checkpoint when one exists, otherwise start from epoch 0.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
  initial_epoch = 0
else:
  model.load_weights(latest_checkpoint)
  # Checkpoints are named "ckpt_<epoch>"; this assumes the files in checkpoint_dir
  # were written by checkpoint_callback, whose suffix is the number of finished epochs.
  initial_epoch = int(os.path.basename(latest_checkpoint).rsplit('_', 1)[-1])

history = model.fit(dataset, initial_epoch=initial_epoch, epochs=epochs, callbacks=[checkpoint_callback])

## Predictions

In [None]:
# Locate the newest checkpoint once. Previously the first lookup was discarded,
# and a missing checkpoint was only noticed when `load_weights(None)` failed.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
  raise FileNotFoundError(
      'No checkpoint found in {}; train the model first.'.format(checkpoint_dir))

# Set batch size to 1 for easier prediction
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [None]:
def generate_text(model, start_string, num_generate=350, temperature=1.0):
  """Generate text with the trained model, one character at a time.

  Args:
    model: Model built with batch size 1 (its states are reset before generating).
    start_string: Seed text; every character must be a key of ``char2idx``.
    num_generate: Number of characters to generate after the seed.
    temperature: Divisor applied to the model outputs before sampling.
      Low temperature results in more predictable text, higher temperature
      in more surprising text.

  Returns:
    ``start_string`` followed by the generated characters.

  Raises:
    ValueError: If ``start_string`` is empty or ``temperature`` is not positive.
    KeyError: If ``start_string`` contains a character missing from ``char2idx``.
  """
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Converting our start string to numbers (vectorizing), with a leading batch dimension
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated characters, joined into one string at the end
  text_generated = []

  # Here batch size == 1
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # using a categorical distribution to predict the character returned by the model;
    # only the sample for the last time step is kept
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # Pass the predicted character as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [None]:
# Generate a sample from the seed text and display it
generated_sample = generate_text(model, start_string=u"Vida ")
print(generated_sample)

Vida flor dos Cinges voando, com pequenina,
Leva o coração terrenho,
Euscrivão, quero, excelso
Recomenda
Para a espécie humilde
De sérias rapira.
Isto que uma sereno
Leitor, tiração de menina e de outro nosso,
Adultar em si o sol da minha pós; Não vês
custam modernos à mantira,
Potira acaso uma flor derramava
Era lhe creio.
Fastou-lhe às margens do vigá
