In [None]:
# default_exp model.rnn

# RNN

> Recurrent (SimpleRNN / GRU / LSTM) language model with helpers for training and for sampling token sequences. @nathan

In [1]:
# export
import pandas as pd
import tensorflow as tf

from icodegen.data.core import convert_df_to_tfds
from pathlib import Path

In [2]:
# hide
# Setup
from transformers import GPT2Tokenizer

# Tiny GPT-2 tokenizer; a pad token is registered on top of its vocabulary.
tokenizer = GPT2Tokenizer.from_pretrained("sshleifer/tiny-gpt2")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Two toy "code" snippets used to smoke-test the model below.
df_fake = pd.DataFrame(
    {
        "code": [
            "aaaa(bb(aaaa(bb()()ccc)dd)()ccc)dd",
            "aaaa(bb()ccccc)dd",
        ]
    }
)

In [3]:
# export
def _loss(labels, logits):
    """Sparse categorical cross-entropy between integer `labels` and raw `logits`."""
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(y_true=labels, y_pred=logits, from_logits=True)

In [29]:
# export
class RNNModel:
    """Token-level language model built from stacked Keras recurrent layers.

    The network is ``Embedding -> n_layers x (SimpleRNN | GRU | LSTM) -> Dense``
    and emits unnormalised logits over the vocabulary at every timestep.

    Args:
        rnn_type: key into ``_RNN_TYPE`` ("rnn", "gru" or "lstm").
        n_layers: number of stacked recurrent layers.
        vocab_size: input size of the embedding and width of the output logits.
        embedding_dim: width of the token embeddings.
        rnn_units: number of units in each recurrent layer.
        batch_size: fixed batch dimension of the model input.
        out_path: base directory; checkpoints are written to
            ``out_path / config_name``.
        tokenizer: object providing ``bos_token_id`` and ``decode``; used by
            ``generate``.

    Raises:
        KeyError: if ``rnn_type`` is not a key of ``_RNN_TYPE``.
    """

    _RNN_TYPE = {
        "rnn": tf.keras.layers.SimpleRNN,
        "gru": tf.keras.layers.GRU,
        "lstm": tf.keras.layers.LSTM,
    }

    def __init__(
        self,
        rnn_type,
        n_layers,
        vocab_size,
        embedding_dim,
        rnn_units,
        batch_size,
        out_path,
        tokenizer,
    ):
        layer = RNNModel._RNN_TYPE[rnn_type]
        rnn_layers = [
            layer(
                rnn_units,
                return_sequences=True,
                # Deliberately NOT stateful, since examples are not chopped up
                # into consecutive chunks. As a consequence no hidden state is
                # kept between forward passes (see `generate`).
                recurrent_initializer="glorot_uniform",
                # following BigCode != Big Vocab Paper
                dropout=0.1,
            )
            for _ in range(n_layers)
        ]
        self.model = tf.keras.Sequential(
            [
                tf.keras.layers.Embedding(
                    input_dim=vocab_size,
                    output_dim=embedding_dim,
                    # NOTE(review): mask_zero=True treats token id 0 as padding,
                    # so id 0 cannot be used as a real vocabulary entry. The pad
                    # token used in this notebook's demo decodes from id 50257,
                    # not 0 - confirm that masking id 0 is what is intended.
                    mask_zero=True,
                    batch_input_shape=[batch_size, None],
                ),
            ]
            + rnn_layers
            + [
                # Raw logits; the loss is computed with from_logits=True.
                tf.keras.layers.Dense(vocab_size),
            ]
        )
        self.tokenizer = tokenizer

        # NOTE(review): n_layers is not part of the name, so two models that
        # differ only in depth share the same checkpoint directory.
        self.config_name = (
            f"{rnn_type}_vocab{vocab_size}_embed{embedding_dim}_units{rnn_units}"
        )
        self.out_path = Path(out_path) / self.config_name
        self.callbacks = [
            tf.keras.callbacks.ModelCheckpoint(
                # Keras fills in the `{epoch}` placeholder when saving.
                filepath=str(self.out_path / "ckpt_{epoch}"),
                save_weights_only=True,
            )
        ]

    def train(self, dataset, epochs):
        """Compile the model (Adam + `_loss`) and fit it on `dataset`.

        Args:
            dataset: training data passed straight to ``model.fit``.
            epochs: number of epochs to train for.

        The ``ModelCheckpoint`` callback created in ``__init__`` saves
        weights-only checkpoints under ``self.out_path`` during training.
        """
        self.model.compile(optimizer="adam", loss=_loss)
        self.model.fit(dataset, epochs=epochs, callbacks=self.callbacks)

    def generate(self, n, temperature=1.0):
        """Sample `n` tokens from the model, starting from the BOS token.

        The recurrent layers are not stateful, so nothing is remembered between
        forward passes. Each step therefore re-runs the model on the whole
        sequence produced so far and samples from the logits of the last
        timestep; feeding only the newest token would condition every
        prediction on a single token of context.

        Args:
            n: number of tokens to sample.
            temperature: divisor applied to the logits before sampling. Low
                values give more predictable output, high values more
                surprising output. Must be > 0.

        Returns:
            Tuple ``(text, token_ids)``: the decoded string and the list of
            sampled token ids (the BOS seed is not included).

        Raises:
            ValueError: if `temperature` is not strictly positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")

        # Running context fed to the model; seeded with BOS only.
        context = [self.tokenizer.bos_token_id]
        text_generated = []

        for _ in range(n):
            # Add the batch dimension (batch size == 1 here) and run in
            # inference mode so dropout is disabled.
            logits = self.model(tf.expand_dims(context, 0), training=False)

            # Only the last timestep predicts the next token: (1, vocab_size).
            next_logits = logits[:, -1, :] / temperature

            # Sample from the categorical distribution defined by the logits.
            predicted_id = int(
                tf.random.categorical(next_logits, num_samples=1)[0, 0].numpy()
            )

            # Extend the context so the next step sees the full history.
            context.append(predicted_id)
            text_generated.append(predicted_id)

        return self.tokenizer.decode(text_generated), text_generated

In [30]:
# Tokenize the data
# NOTE(review): the positional 32 and 1 are presumably the max sequence length
# and the batch size (the model below is built with batch_size=1) - confirm
# against convert_df_to_tfds.
ds = convert_df_to_tfds(df_fake, tokenizer, 32, 1)

In [31]:
# Single-layer GRU sized to the tokenizer's vocabulary (including the pad token).
gru = RNNModel(
    rnn_type="gru",
    n_layers=1,
    vocab_size=len(tokenizer),
    embedding_dim=128,
    rnn_units=128,
    batch_size=1,
    out_path="/tmp",
    tokenizer=tokenizer,
)
gru.model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (1, None, 128)            6433024   
_________________________________________________________________
gru_4 (GRU)                  (1, None, 128)            99072     
_________________________________________________________________
dense_4 (Dense)              (1, None, 50258)          6483282   
Total params: 13,015,378
Trainable params: 13,015,378
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Train for 20 epochs; the ModelCheckpoint callback configured in RNNModel
# saves weights under /tmp/<config_name>/ckpt_{epoch}.
gru.train(ds, 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [48]:
# Sample 10 tokens, starting from the tokenizer's BOS token. Sampling is random
# and no seed is set in this notebook, so the output differs between runs.
gru.generate(10, temperature=0.3)

('[PAD] aaaacc incor Resolerance Damon Chloeablishment tours',
 [50257, 24794, 535, 5970, 1874, 37668, 33572, 29476, 25380, 21284])

In [None]:
# hide
from nbdev.export import notebook2script

# nbdev: regenerate the library modules from the notebooks' `# export` cells.
notebook2script()