In [None]:
# default_exp model.core

# Model

> API details. @nathan

In [None]:
# export
import json

import tensorflow as tf

from abc import ABC, abstractmethod
from pathlib import Path
from tokenizers import Tokenizer

In [None]:
# hide
# Setup
import pandas as pd

from icodegen.data.core import convert_df_to_tfds, java_special_tokens, train_tokenizer

# from transformers import GPT2TokenizerFast

# Don't use the GPT2TokenizerFast API here: it is not at all close to the
# huggingface tokenizers lib API...
# tokenizer = GPT2TokenizerFast.from_pretrained("sshleifer/tiny-gpt2")
# tokenizer.add_special_tokens({"pad_token": "<pad>", "bos_token": "<sos>"})

# Toy corpus: a single "code" column holding two short synthetic strings.
df_fake = pd.DataFrame(
    {"code": ["aaaa(bb(aaaa(bb()()ccc)dd)()ccc)dd", "aaaa(bb()ccccc)dd"]}
)
# Tokenize the data
# ds = convert_df_to_tfds(df_fake, tokenizer, 32, 1)

In [None]:
# export
class Model(ABC):
    """Abstract interface implemented by every model wrapper in this module.

    A wrapper stores a tokenizer and an underlying network, and must provide
    loading, scoring, saving, tokenizing and training.
    """

    # TODO: Add generating the model config (but only the pieces we care about,
    # i.e., the num layers, heads, dim size, emb size, etc) so that we can
    # easily save it to a file for organizing evaluation results
    # Also add loading from_path method to load models easily
    def __init__(self, tokenizer, model):
        """Store the tokenizer and the underlying network on the instance."""
        self.tokenizer = tokenizer
        self.model = model

    # Takes no ``self``: declared static so that it can be called on the class
    # or on an instance without the instance being bound to ``path``.
    @staticmethod
    @abstractmethod
    def from_path(path):
        """Build a model wrapper from what is stored at ``path``."""
        pass

    @abstractmethod
    def get_probs(self, inputs):
        """Return the model's probabilities for ``inputs``."""
        pass

    @abstractmethod
    def save(self, path):
        """Persist the model to ``path``."""
        pass

    @abstractmethod
    def tokenize(self, method):
        """Tokenize ``method`` with this model's tokenizer."""
        pass

    @abstractmethod
    def train(self, ds, epochs):
        """Train the model on ``ds`` for ``epochs`` epochs."""
        pass

    # TODO: Add save method that handles saving model and tokenizer to disc

In [None]:
# export
def _loss(labels, logits):
    """Sparse categorical cross-entropy of ``labels`` against raw ``logits``.

    ``from_logits=True`` tells Keras that ``logits`` have not been passed
    through a softmax.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [None]:
# export

# Tensorflow Huggingface Transformer
class TransformerModel(Model):
    """Model wrapper for a TensorFlow Hugging Face transformer.

    Loading, saving and training are placeholders that currently do nothing
    and return ``None``.
    """

    # Static, like ``RNNModel.from_path``: the function takes no ``self``, so
    # without the decorator an instance call would bind the instance to ``path``.
    @staticmethod
    def from_path(path):
        """Placeholder: loading from ``path`` is not implemented yet."""
        pass

    def get_probs(self, inputs):
        """Return softmax probabilities for ``inputs``.

        The wrapped model is called on ``inputs``; the first element of its
        output is taken as the logits and normalised with ``tf.nn.softmax``
        (which acts on the last axis by default).
        """
        logits = self.model(inputs)[0]
        return tf.nn.softmax(logits)

    def save(self, path):
        """Placeholder: saving to ``path`` is not implemented yet."""
        pass

    def tokenize(self, method):
        """Call the tokenizer on ``method`` asking for TensorFlow tensors."""
        return self.tokenizer(method, return_tensors="tf")

    def train(self, ds, epochs):
        """Placeholder: training is not implemented yet."""
        pass

In [None]:
# export
class RNNModel(Model):
    """Keras recurrent language model (SimpleRNN, GRU or LSTM).

    The network is an embedding layer, ``n_layers`` recurrent layers and a
    dense layer producing one logit per vocabulary entry. Checkpoints and
    saved artefacts live in ``<out_path>/<config_name>``.
    """

    # Maps the ``rnn_type`` constructor argument to the Keras layer class.
    _RNN_TYPE = {
        "rnn": tf.keras.layers.SimpleRNN,
        "gru": tf.keras.layers.GRU,
        "lstm": tf.keras.layers.LSTM,
    }

    @staticmethod
    def _config_name(rnn_type, vocab_size, embedding_dim, rnn_units):
        """Return the directory name that identifies a model configuration."""
        return f"{rnn_type}_vocab{vocab_size}_embed{embedding_dim}_units{rnn_units}"

    def __init__(
        self,
        rnn_type,
        n_layers,
        vocab_size,
        embedding_dim,
        rnn_units,
        batch_size,
        out_path,
        tokenizer,
    ):
        """Build the network and prepare its output directory.

        Args:
            rnn_type: key of ``_RNN_TYPE`` ("rnn", "gru" or "lstm").
            n_layers: number of stacked recurrent layers.
            vocab_size: embedding input size and number of output logits.
            embedding_dim: embedding output size.
            rnn_units: units in each recurrent layer.
            batch_size: fixed batch dimension given to the embedding layer.
            out_path: parent directory; ``<out_path>/<config_name>`` is
                created and used for checkpoints and ``save()``.
            tokenizer: tokenizer stored on the instance.

        Raises:
            KeyError: if ``rnn_type`` is not a key of ``_RNN_TYPE``.
        """
        # Resolve the layer class first so that an unknown rnn_type fails
        # before any directory is created.
        layer = RNNModel._RNN_TYPE[rnn_type]

        self.rnn_type = rnn_type
        self.n_layers = n_layers
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.rnn_units = rnn_units

        self.config_name = RNNModel._config_name(
            rnn_type, vocab_size, embedding_dim, rnn_units
        )
        self.out_path = Path(out_path) / self.config_name
        # parents=True: ``out_path`` itself may not exist yet.
        self.out_path.mkdir(parents=True, exist_ok=True)
        self.callbacks = [
            tf.keras.callbacks.ModelCheckpoint(
                # "{epoch}" is filled in by Keras; pass a plain string so the
                # template also works where path objects are not accepted.
                filepath=str(self.out_path / "ckpt_{epoch}"),
                save_weights_only=True,
            )
        ]

        rnn_layers = [
            layer(
                rnn_units,
                return_sequences=True,
                recurrent_initializer="glorot_uniform",
                # following BigCode != Big Vocab Paper
                dropout=0.1,
            )
            for _ in range(n_layers)
        ]
        model = tf.keras.Sequential(
            [
                tf.keras.layers.Embedding(
                    input_dim=vocab_size,
                    output_dim=embedding_dim,
                    mask_zero=True,  # Zero cannot be used in the vocabulary
                    batch_input_shape=[batch_size, None],
                ),
            ]
            + rnn_layers
            + [
                tf.keras.layers.Dense(vocab_size),
            ]
        )

        super().__init__(tokenizer, model)

    @staticmethod
    def from_path(path):
        """Load a model previously written by ``save()``.

        Reads ``tokenizer.json`` and ``model_config.json`` from ``path``,
        rebuilds the wrapper with a batch size of 1 and replaces its network
        with the Keras model loaded from ``path``.
        """
        path = Path(path)

        tokenizer = Tokenizer.from_file(str(path / "tokenizer.json"))
        with open(path / "model_config.json", "r") as f:
            model_config = json.load(f)

        # By default save() writes into <out_path>/<config_name>. When ``path``
        # is such a directory, hand its parent to the constructor so that the
        # loaded model's out_path is ``path`` itself rather than a new nested
        # <path>/<config_name> directory inside the saved model.
        config_name = RNNModel._config_name(
            model_config["rnn_type"],
            model_config["vocab_size"],
            model_config["embedding_dim"],
            model_config["rnn_units"],
        )
        out_root = path.parent if path.name == config_name else path

        model = RNNModel(
            model_config["rnn_type"],
            model_config["n_layers"],
            model_config["vocab_size"],
            model_config["embedding_dim"],
            model_config["rnn_units"],
            1,
            out_root,
            tokenizer,
        )
        model.model = tf.keras.models.load_model(
            str(path), custom_objects={"_loss": _loss}
        )

        return model

    def get_probs(self, method):
        """Placeholder: not implemented yet; returns ``None``."""
        pass

    def generate(self, n, temperature=1.0):
        """Sample ``n`` tokens from the model, starting from ``"<sos>"``.

        Args:
            n: number of tokens to sample.
            temperature: logits are divided by this value before sampling;
                must be greater than zero.

        Returns:
            The sampled token ids decoded with the tokenizer (special tokens
            are kept).

        Raises:
            ValueError: if ``temperature`` is not greater than zero.
        """
        if temperature <= 0:
            raise ValueError("temperature must be greater than zero")

        # Converting our start string to numbers (vectorizing)
        context_ids = list(self.tokenizer.encode("<sos>").ids)
        input_eval = tf.expand_dims(context_ids, 0)

        generated_ids = []

        # Layers built by __init__ use the Keras default (not stateful), so
        # nothing is carried over between calls. Only a stateful network can be
        # fed one token at a time; otherwise the whole sequence so far is fed
        # again at every step so each prediction sees its full context.
        stateful = any(
            getattr(layer, "stateful", False)
            for layer in getattr(self.model, "layers", [])
        )

        # Here batch size == 1
        self.model.reset_states()
        for _ in range(n):
            predictions = self.model(input_eval)
            # remove the batch dimension
            predictions = tf.squeeze(predictions, 0)

            # sample from the categorical distribution given by the scaled
            # logits; [-1, 0] picks the sample drawn for the last position
            predictions = predictions / temperature
            predicted_id = int(
                tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
            )

            generated_ids.append(predicted_id)
            context_ids.append(predicted_id)

            if stateful:
                input_eval = tf.expand_dims([predicted_id], 0)
            else:
                input_eval = tf.expand_dims(context_ids, 0)

        return self.tokenizer.decode(generated_ids, skip_special_tokens=False)

    def save(self, path=None):
        """Write the tokenizer, the Keras model and the model config to disk.

        Args:
            path: target directory; defaults to this model's ``out_path``.
        """
        target = self.out_path if path is None else Path(path)
        target.mkdir(parents=True, exist_ok=True)

        self.tokenizer.save(str(target / "tokenizer.json"), pretty=True)
        self.model.save(str(target))
        # Exactly the values from_path() needs to rebuild the wrapper.
        model_config = {
            "rnn_type": self.rnn_type,
            "n_layers": self.n_layers,
            "vocab_size": self.vocab_size,
            "embedding_dim": self.embedding_dim,
            "rnn_units": self.rnn_units,
        }
        with open(target / "model_config.json", "w") as f:
            json.dump(model_config, f)

    def tokenize(self, method):
        """Tokenize ``method`` into TensorFlow tensors.

        A callable tokenizer is called exactly as before. A tokenizer that is
        not callable is used through ``encode(...).ids`` and the ids are
        returned with a leading batch dimension, the same way ``generate()``
        prepares its input.
        """
        if callable(self.tokenizer):
            return self.tokenizer(method, return_tensors="tf")

        # NOTE(review): the return shape for this branch mirrors generate();
        # confirm against the callers of tokenize().
        token_ids = self.tokenizer.encode(method).ids
        return tf.expand_dims(token_ids, 0)

    # TODO add tensorboard call back for easy visualization
    def train(self, dataset, epochs):
        """Compile with Adam and ``_loss``, fit on ``dataset`` and return the history.

        The checkpoint callback created in ``__init__`` is passed to ``fit``.
        """
        self.model.compile(optimizer="adam", loss=_loss)
        history = self.model.fit(dataset, epochs=epochs, callbacks=self.callbacks)

        return history

In [None]:
# Parameter count the GRU configured below is expected to have.
PARAM_COUNT = 13_015_378

# Hyper-parameters; the next cell rebuilds the checkpoint directory name from
# these same variables.
rnn_type = "gru"
n_layers = 1
# NOTE(review): no earlier cell in this notebook assigns `tokenizer` (the only
# earlier assignment is commented out), so this fails on a fresh kernel.
vocab_size = len(tokenizer)
embedding_dim = 128
rnn_units = 128
batch_size = 1
out_path = "/tmp"
gru = RNNModel(
    rnn_type=rnn_type,
    n_layers=n_layers,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size,
    out_path=out_path,
    tokenizer=tokenizer,
)

assert gru.model.count_params() == PARAM_COUNT

In [None]:
EPOCHS = 1
# Directory the model writes its checkpoints to: <out_path>/<config name>.
chkpt_path = Path(
    out_path, f"{rnn_type}_vocab{vocab_size}_embed{embedding_dim}_units{rnn_units}"
)
# NOTE(review): `ds` is only assigned in a commented-out line earlier in this
# notebook, so this fails on a fresh kernel.
history = gru.train(ds, EPOCHS)

# One checkpoint index file and one loss value are expected per epoch.
assert chkpt_path.exists()
assert len(list(chkpt_path.glob("*.index"))) == EPOCHS
assert len(history.history["loss"]) == EPOCHS

In [None]:
# Generating N tokens and re-tokenizing the decoded text should give N ids.
NUM_TOKENS = 10
text = gru.generate(n=NUM_TOKENS)

assert len(tokenizer(text).input_ids) == NUM_TOKENS

In [None]:
# First 100 training records and first 5,000 BPE records of the Java data.
df_trn = pd.read_json(
    Path("/tmp", "codesearchnet_java", "train.jsonl"), orient="records", lines=True
).head(100)
df_bpe = pd.read_json(
    Path("/tmp", "codesearchnet_java", "bpe.jsonl"), orient="records", lines=True
).head(5_000)
max_length = 100
batch_size = 1
# Train the tokenizer on the BPE split, then turn the training split into a
# TensorFlow dataset with it.
tokenizer = train_tokenizer(df_bpe, java_special_tokens, max_length)
dataset = convert_df_to_tfds(df_trn, tokenizer, max_length, batch_size)

In [None]:
# Single-layer SimpleRNN sized to the tokenizer's vocabulary, trained for 20
# epochs; the returned History object is the cell's displayed value.
model = RNNModel(
    rnn_type="rnn",
    n_layers=1,
    vocab_size=tokenizer.get_vocab_size(),
    embedding_dim=256,
    rnn_units=1024,
    batch_size=batch_size,
    out_path="/tmp/models",
    tokenizer=tokenizer,
)
model.train(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fee9da28790>

In [None]:
# Non-stateful
# NOTE(review): this cell's code is identical to the "Stateful" cell in this
# notebook. The label presumably describes how the RNN layers were configured
# when the recorded output was produced; that change is not visible here.
# TODO confirm.
NUM_TOKENS = 100
# A low temperature sharpens the sampling distribution.
text = model.generate(NUM_TOKENS, temperature=0.1)
print(text)

<private><private><private><private><private> <void>DescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDescriptionDe

In [None]:
# Stateful
# NOTE(review): this cell's code is identical to the "Non-stateful" cell in
# this notebook. The label presumably describes how the RNN layers were
# configured when the recorded output was produced; that change is not visible
# here. TODO confirm.
NUM_TOKENS = 100
# A low temperature sharpens the sampling distribution.
text = model.generate(NUM_TOKENS, temperature=0.1)
print(text)

<public> <{><n>        <if> <(><)><;><n>        <}><pad>PRE the extractConnection <=> <(> scheduledConnection <{><n>        <if> <(> length<)> <{><n><n>        <}><pad> getIfc d.a<(><)><;><n>        <if> <(> "<)><;><n>        <if> <(> "<)><;><n>        <if> <(><!><(><)><;><n>        <}><pad><int> <void> setchedule <=> <(> "<)><;><n>        <if> <(> length<)><;><n>        <if> 


In [None]:
# hide
from nbdev.export import notebook2script

# Run nbdev's notebook-to-module conversion; the recorded output lists each
# notebook it converted.
notebook2script()

Converted 00_data.core.ipynb.
Converted 01_data.transforms.ipynb.
Converted 02_model.core.ipynb.
Converted 04_evaluation.core.ipynb.
Converted index.ipynb.
