In [None]:
# default_exp cli

# CLI

> Command-line entry points for downloading and processing the data, and for training and evaluating the models.

Some data and code taken from https://github.com/github/CodeSearchNet

```
MIT License

Copyright (c) 2019 GitHub

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```

```
@article{husain2019codesearchnet,
  title={{CodeSearchNet} challenge: Evaluating the state of semantic code search},
  author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},
  journal={arXiv preprint arXiv:1909.09436},
  year={2019}
}
```

In [1]:
# export
import gdown
import io
import logging
import random
import requests
import zipfile

import pandas as pd
import tensorflow as tf

from collections import defaultdict
from fastcore.script import call_parse, Param
from icodegen.data.core import (
    convert_df_to_tfds,
    java_special_tokens,
    remove_non_ascii,
    replace_special_tokens,
    train_tokenizer,
)
from icodegen.data.transforms import (
    code_token_randomizer,
    line_randomizer,
    java_comment_remover,
    transform_df,
)
from icodegen.evaluation.core import (
    get_mean_probs,
    mean_dist_probs,
    get_mean_cross_entropy,
)
from icodegen.model.core import RNNModel, TransformerModel
from pathlib import Path
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer

# Seed shared by the entry points below for `random` and TensorFlow.
seed = 115

# Root logger, set to INFO so the `logging.info` progress messages are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# TODO: Standardize naming convention to use `path` instead of `dir`

In [None]:
# export
# Download locations of the datasets, keyed by dataset name.
# The first three entries are Google Drive links; the last one is a zip
# archive hosted on S3 and is the entry read by `download_data`.
URLs = {
    "bigclonebenchmark_lg": "https://drive.google.com/uc?id=1-4LPiiKGR5Zmg-TLqZEkRbRIdg7UlJQb",
    "bigclonebenchmark_sm": "https://drive.google.com/uc?id=1FCq0lSs4oqc3jpSoucsHlRqjmbVwdRQ9",
    "bug_fix_pairs": "https://drive.google.com/uc?id=1XEhnsQ3Uy6SnFz349I0Iu9lz4ggAaiQp",
    "codesearchnet_java": "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip",
}

In [None]:
# export
@call_parse
def download_data(
    out_dir: Param("The output directory to download and extract all files to.", str)
):
    """
    Function for downloading all the data to reproduce our study.

    Only the CodeSearchNet Java archive is fetched at the moment; it is
    extracted into `<out_dir>/codesearchnet_java`. The BigCloneBenchmark and
    Bug Fix Pairs downloads below are disabled (commented out).

    Raises `requests.HTTPError` when the archive request does not succeed.
    """
    out_dir = Path(out_dir)
    # Create the destination up front so a fresh location can be used directly.
    out_dir.mkdir(parents=True, exist_ok=True)

    # TODO: Make individual folders to place all these files in
    # Download bigclonebenchmark_lg and bigclonebenchmark_sm
    # NOTE: the download itself is disabled; only the message is emitted.
    logging.info("Downloading BigCloneBenchmark datasets.")
    #     gdown.download(
    #         URLs["bigclonebenchmark_lg"], str(out_dir / "bigclonebenchmark_lg.csv")
    #     )
    #     gdown.download(
    #         URLs["bigclonebenchmark_sm"], str(out_dir / "bigclonebenchmark_sm.csv")
    #     )

    # Download Bug Fix Pairs
    # NOTE: the download itself is disabled; only the message is emitted.
    logging.info("Downloading and extracting Bug Fix Pairs dataset.")
    #     gdown.cached_download(
    #         URLs["bug_fix_pairs"],
    #         str(out_dir / "bug_fix_pairs.zip"),
    #         postprocess=gdown.extractall,
    #     )
    #     with zipfile.ZipFile(
    #         str(out_dir / "datasets" / "50-100" / "source_code.zip"), "r"
    #     ) as zip_ref:
    #         zip_ref.extractall(out_dir)

    # from https://stackoverflow.com/a/14260592/5768407 by users
    # yoavram (https://stackoverflow.com/users/1063612/yoavram) and
    # kamran kausar (https://stackoverflow.com/users/3486460/kamran-kausar)
    logging.info("Downloading and extracting CodeSearchNet Challenge dataset.")
    response = requests.get(URLs["codesearchnet_java"])
    # Fail on an HTTP error instead of handing an error page to ZipFile,
    # which would surface as a confusing BadZipFile exception.
    response.raise_for_status()
    # The context manager closes the archive once extraction is done.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(out_dir / "codesearchnet_java")

In [None]:
download_data("/tmp")

# The BigCloneBenchmark and Bug Fix Pairs downloads are commented out in
# `download_data`, so the files below are not produced by the call above.
# Re-enable these checks together with the corresponding downloads.
# assert Path("/tmp/bigclonebenchmark_lg.csv").exists()
# assert Path("/tmp/bigclonebenchmark_sm.csv").exists()

# assert Path("/tmp/bug_fix_pairs.zip").exists()
# assert Path("/tmp/50-100/buggy").exists()
# assert Path("/tmp/50-100/fixed").exists()

assert Path("/tmp/codesearchnet_java").exists()

In [None]:
# export
def _process_bigclonebenchmark(path):
    """Placeholder for BigCloneBenchmark processing; only seeds the RNGs for now."""
    # No processing is implemented yet, so `path` is currently unused.
    tf.random.set_seed(seed)
    random.seed(seed)

In [None]:
# df = pd.read_csv("/tmp/bigclonebenchmark_lg.csv")
# df.head()

In [None]:
# df.clone_type.value_counts()

In [None]:
# export
def _process_bug_fix(path):
    """
    Convert the Bug Fix Pairs Java files into two aligned jsonl files.

    Reads `<path>/50-100/buggy/*.java` and `<path>/50-100/fixed/*.java`,
    passes the code through `remove_non_ascii` and `replace_special_tokens`,
    and writes `<path>/buggy.jsonl` and `<path>/fixed.jsonl` with one record
    per file in a single `code` column. Row i of both outputs comes from files
    with the same name.
    """
    # Set seed for reproducibility
    random.seed(seed)
    tf.random.set_seed(seed)

    root = path / "50-100"
    buggy_by_name = {p.name: p for p in root.glob("buggy/*.java")}
    fixed_by_name = {p.name: p for p in root.glob("fixed/*.java")}

    # Pair by file name rather than by position: zipping two independently
    # sorted listings silently misaligns every later pair as soon as one side
    # is missing a file.
    unpaired = buggy_by_name.keys() ^ fixed_by_name.keys()
    if unpaired:
        logging.warning(
            f"Skipping {len(unpaired)} bug fix file(s) without a buggy/fixed counterpart."
        )
    paired_names = sorted(buggy_by_name.keys() & fixed_by_name.keys())

    def _clean(snippets):
        # Same clean-up pipeline for both sides of a pair.
        df = pd.DataFrame(snippets, columns=["code"])
        df = remove_non_ascii(df)
        return replace_special_tokens(df, java_special_tokens)

    # Explicit encoding so the result does not depend on the platform locale.
    df_buggy = _clean(
        [buggy_by_name[name].read_text(encoding="utf-8") for name in paired_names]
    )
    df_fixed = _clean(
        [fixed_by_name[name].read_text(encoding="utf-8") for name in paired_names]
    )

    # Saving to jsonl because csv formatting is causing issues with quoting
    df_buggy.to_json(path / "buggy.jsonl", orient="records", lines=True)
    df_fixed.to_json(path / "fixed.jsonl", orient="records", lines=True)

In [None]:
# Smoke test: process the bug-fix files found under /tmp/50-100 and check
# that both output files were written.
_process_bug_fix(Path("/tmp"))

assert Path("/tmp/buggy.jsonl").exists()
assert Path("/tmp/fixed.jsonl").exists()

In [None]:
# BUGGY_MTHD = """\
# private void success(io.netty.channel.Channel channel) {
#     org.mycat.netty.mysql.MySQLHandshakeHandler.logger.debug("success info return form MySQLHandshakeHandler");
#     io.netty.buffer.ByteBuf out = channel.alloc().buffer();
#     org.mycat.netty.mysql.OK ok = new org.mycat.netty.mysql.OK();
#     ok.sequenceId = 2;
#     ok.setStatusFlag(Flags.SERVER_STATUS_AUTOCOMMIT);
#     out.writeBytes(ok.toPacket());
#     channel.writeAndFlush(out);
# }"""
# FIXED_MTHD = """\
# private void success(io.netty.channel.Channel channel) {
#     org.mycat.netty.mysql.MySQLHandshakeHandler.logger.info("success info return form MySQLHandshakeHandler");
#     io.netty.buffer.ByteBuf out = channel.alloc().buffer();
#     org.mycat.netty.mysql.OK ok = new org.mycat.netty.mysql.OK();
#     ok.sequenceId = 2;
#     ok.setStatusFlag(Flags.SERVER_STATUS_AUTOCOMMIT);
#     out.writeBytes(ok.toPacket());
#     channel.writeAndFlush(out);
# }"""
# df = pd.read_json("/tmp/bug_fix_pairs.jsonl", orient="records", lines=True)

# assert BUGGY_MTHD == df.buggy.values[0] and FIXED_MTHD == df.fixed.values[0]

In [None]:
# export
def _jsonl_list_to_dataframe(file_list, columns=None):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat(
        [
            pd.read_json(f, orient="records", compression="gzip", lines=True)[columns]
            for f in file_list
        ],
        sort=False,
    )


def _process_codesearchnet(path):
    """
    Grabs the different data splits and converts them into dataframes.
    Expects format from Code Search Net Challenge.

    For each of the train/valid/test splits, the `code` column of every
    `*.gz` file under `<path>/java/final/jsonl/<split>` is loaded, passed
    through `remove_non_ascii` and `replace_special_tokens`, and written to
    `<path>/<split>.jsonl`. 10% of the train split is held out and written to
    `<path>/bpe.jsonl` instead.
    """
    # Set seed for reproducibility
    random.seed(seed)
    tf.random.set_seed(seed)

    for split in ["train", "valid", "test"]:
        files = sorted((path / "java" / "final" / "jsonl" / split).glob("**/*.gz"))
        df = _jsonl_list_to_dataframe(files, ["code"])
        df = remove_non_ascii(df)
        df = replace_special_tokens(df, java_special_tokens)
        # Saving to jsonl because csv formatting is causing issues with quoting
        if split == "train":
            # 10% selected to match the Big Code != Big Vocab paper.
            # The seeds set above are not the source of randomness used by
            # `train_test_split`; passing `random_state` is what makes the
            # train/bpe split repeatable between runs.
            df_trn, df_bpe = train_test_split(df, test_size=0.1, random_state=seed)
            df_trn.to_json(path / f"{split}.jsonl", orient="records", lines=True)
            df_bpe.to_json(path / "bpe.jsonl", orient="records", lines=True)
        else:
            df.to_json(path / f"{split}.jsonl", orient="records", lines=True)

In [None]:
# export
@call_parse
def process_data(
    down_dir: Param(
        "The directory where all the files were downloaded and extracted to.", str
    )
):
    """Function for processing data related to the library."""
    # Process CodeSearchNet Challenge data
    _process_codesearchnet(Path(down_dir) / "codesearchnet_java")

    # Bug Fix Pairs processing is currently disabled:
    # _process_bug_fix(Path(down_dir))

In [None]:
# Smoke test: processing must write one jsonl file per split plus the
# held-out bpe file next to the extracted CodeSearchNet data.
process_data("/tmp")

assert Path("/tmp/codesearchnet_java/train.jsonl").exists()
assert Path("/tmp/codesearchnet_java/bpe.jsonl").exists()
assert Path("/tmp/codesearchnet_java/valid.jsonl").exists()
assert Path("/tmp/codesearchnet_java/test.jsonl").exists()

In [2]:
# export

def _rnn_config(rnn_type, n_layers, rnn_units, embedding_dim=256):
    """Build one RNN hyper-parameter dict with the keys read by `train`."""
    return {
        "rnn_type": rnn_type,
        "n_layers": n_layers,
        "embedding_dim": embedding_dim,
        "rnn_units": rnn_units,
    }


# Experiment 0.0.0
VANILLA_CONFIG = _rnn_config("rnn", n_layers=1, rnn_units=1_024)

# Experiment 1.0.0
GRU_CONFIG_1 = _rnn_config("gru", n_layers=1, rnn_units=1_024)

# Experiment 1.1.0
GRU_CONFIG_2 = _rnn_config("gru", n_layers=2, rnn_units=1_024)

# Experiment 1.1.1
GRU_CONFIG_3 = _rnn_config("gru", n_layers=3, rnn_units=1_024)

# Experiment 1.2.0
GRU_CONFIG_4 = _rnn_config("gru", n_layers=1, rnn_units=512)

# Experiment 1.2.1
GRU_CONFIG_5 = _rnn_config("gru", n_layers=1, rnn_units=2_048)

# Every configuration trained by `train`, in experiment order.
_RNN_CONFIGs = [
    VANILLA_CONFIG,
    GRU_CONFIG_1,
    GRU_CONFIG_2,
    GRU_CONFIG_3,
    GRU_CONFIG_4,
    GRU_CONFIG_5,
]

In [33]:
# export
def _load_or_build_tfds(
    jsonl_path, tfds_path, tokenizer, max_length, batch_size, n_samples=1_000
):
    """
    Return the batched dataset cached at `tfds_path`, building it first if needed.

    When `tfds_path` exists the dataset is loaded from it. Otherwise the first
    `n_samples` rows of `jsonl_path` are converted with `convert_df_to_tfds`
    and the result is saved to `tfds_path` for later runs.
    """
    if tfds_path.exists():
        # Both elements of a batch share one spec: (batch_size, max_length - 1).
        spec = tf.TensorSpec(shape=(batch_size, max_length - 1), dtype=tf.int32)
        return tf.data.experimental.load(str(tfds_path), (spec, spec))

    df = pd.read_json(jsonl_path, orient="records", lines=True)[:n_samples]
    ds = convert_df_to_tfds(df, tokenizer, max_length, batch_size)
    tfds_path.mkdir(parents=True, exist_ok=True)
    tf.data.experimental.save(ds, str(tfds_path))
    return ds


@call_parse
def train(
    data_path: Param("The path to where the data to train the models is located", str),
    out_path: Param("The output path to save all model chkpts to.", str),
    epochs: Param("The number of epochs to train each model for.", int) = 64,
    max_length: Param(
        "The maximum number of tokens each method can be. Truncation and padding will occur if the method is too long or short, respectively.",
        int,
    ) = 300,
    batch_size: Param("The batch size to use for training each model.", int) = 64,
):
    """Function for training models related to the library."""
    random.seed(seed)
    tf.random.set_seed(seed)

    data_path = Path(data_path)
    out_path = Path(out_path)
    # `parents=True` so a nested output location can be created in one go.
    out_path.mkdir(parents=True, exist_ok=True)

    csn_path = data_path / "codesearchnet_java"

    # Train BPE tokenizer
    # Reuse the tokenizer saved in `out_path` when there is one; otherwise
    # train a new one on the held-out bpe data and save it there.
    tokenizer_path = out_path / "tokenizer.json"
    if tokenizer_path.exists():
        logging.info(f"Loading tokenizer from {str(out_path / 'tokenizer.json')}.")
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    else:
        logging.info(
            f"Training new tokenizer and saving to {str(out_path / 'tokenizer.json')}."
        )
        # NOTE: only the first 1,000 rows are used.
        df_bpe = pd.read_json(csn_path / "bpe.jsonl", orient="records", lines=True)[
            :1_000
        ]
        tokenizer = train_tokenizer(df_bpe, java_special_tokens, max_length)
        tokenizer.save(str(tokenizer_path), pretty=True)
        del df_bpe

    # Tokenize the datasets and convert them to tfds.
    # The cache folder name encodes only `max_length` and `batch_size`, so a
    # cached dataset is reused whenever those two values match.
    ds_trn = _load_or_build_tfds(
        csn_path / "train.jsonl",
        csn_path / f"tfds_trn_{max_length}len_{batch_size}bs",
        tokenizer,
        max_length,
        batch_size,
    )
    ds_val = _load_or_build_tfds(
        csn_path / "valid.jsonl",
        csn_path / f"tfds_val_{max_length}len_{batch_size}bs",
        tokenizer,
        max_length,
        batch_size,
    )

    logging.info("Starting the training of all RNN based models.")
    # Train RNN based models
    for config in _RNN_CONFIGs:
        rnn_model = RNNModel(
            config["rnn_type"],
            config["n_layers"],
            tokenizer.get_vocab_size(),
            config["embedding_dim"],
            config["rnn_units"],
            batch_size,
            str(out_path),
            tokenizer,
        )
        rnn_model.train(ds_trn, ds_val, epochs)
        rnn_model.save()

    logging.info("Starting the training of all Transformer based models.")
    # TODO: Train Transformer models (not implemented yet).

In [34]:
%%time
# Restrict this run to two configurations; `train` iterates over the global
# `_RNN_CONFIGs`, so rebinding it here changes what gets trained.
# NOTE(review): the absolute paths are specific to the author's environment.
_RNN_CONFIGs = [VANILLA_CONFIG, GRU_CONFIG_1]
train(
    data_path="/home/jovyan/work/dvc-icodegen/data",
    out_path="/home/jovyan/work/dvc-icodegen/models",
    epochs=5,
    max_length=100,
    batch_size=8,
)

INFO:root:Loading tokenizer from /home/jovyan/work/dvc-icodegen/models/tokenizer.json.
INFO:root:Starting the training of all RNN based models.


<BatchDataset shapes: ((8, 99), (8, 99)), types: (tf.int32, tf.int32)>
Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: /home/jovyan/work/dvc-icodegen/models/rnn_vocab10000_embed256_units1024/assets


INFO:tensorflow:Assets written to: /home/jovyan/work/dvc-icodegen/models/rnn_vocab10000_embed256_units1024/assets


Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: /home/jovyan/work/dvc-icodegen/models/gru_vocab10000_embed256_units1024/assets


INFO:tensorflow:Assets written to: /home/jovyan/work/dvc-icodegen/models/gru_vocab10000_embed256_units1024/assets
INFO:root:Starting the training of all Transformer based models.


CPU times: user 3min 21s, sys: 11.1 s, total: 3min 32s
Wall time: 2min 2s


In [35]:
%%time
# Same two-configuration run with a single epoch. The recorded output shows a
# `_LoadDataset`, i.e. the dataset was loaded from disk rather than rebuilt.
# NOTE(review): the absolute paths are specific to the author's environment.
_RNN_CONFIGs = [VANILLA_CONFIG, GRU_CONFIG_1]
train(
    data_path="/home/jovyan/work/dvc-icodegen/data",
    out_path="/home/jovyan/work/dvc-icodegen/models",
    epochs=1,
    max_length=100,
    batch_size=8,
)

INFO:root:Loading tokenizer from /home/jovyan/work/dvc-icodegen/models/tokenizer.json.
INFO:root:Starting the training of all RNN based models.


<_LoadDataset shapes: ((8, 99), (8, 99)), types: (tf.int32, tf.int32)>




INFO:tensorflow:Assets written to: /home/jovyan/work/dvc-icodegen/models/rnn_vocab10000_embed256_units1024/assets


INFO:tensorflow:Assets written to: /home/jovyan/work/dvc-icodegen/models/rnn_vocab10000_embed256_units1024/assets










INFO:tensorflow:Assets written to: /home/jovyan/work/dvc-icodegen/models/gru_vocab10000_embed256_units1024/assets


INFO:tensorflow:Assets written to: /home/jovyan/work/dvc-icodegen/models/gru_vocab10000_embed256_units1024/assets
INFO:root:Starting the training of all Transformer based models.


CPU times: user 48.1 s, sys: 2.29 s, total: 50.4 s
Wall time: 34.1 s


In [27]:
# NOTE(review): `Tokenizer` is already imported in the first export cell.
from tokenizers import Tokenizer

# Load the tokenizer file stored inside the vanilla RNN model folder.
tokenizer = Tokenizer.from_file(
    str(
        Path("/home/jovyan/work/dvc-icodegen/models/rnn_vocab10000_embed256_units1024")
        / "tokenizer.json"
    )
)

In [28]:
# First 1,000 training rows, converted with max_length=100 and batch_size=1
# (see the element spec displayed in the next cell).
df_trn = pd.read_json(
    Path("/home/jovyan/work/dvc-icodegen/data") / "codesearchnet_java" / "train.jsonl",
    orient="records",
    lines=True,
)[:1_000]
dataset = convert_df_to_tfds(df_trn, tokenizer, 100, 1)

In [30]:
dataset.element_spec

(TensorSpec(shape=(1, 99), dtype=tf.int32, name=None),
 TensorSpec(shape=(1, 99), dtype=tf.int32, name=None))

In [None]:
# Reload the saved vanilla RNN model and evaluate it on `dataset`, which was
# built in an earlier cell.
model = RNNModel.from_path(
    "/home/jovyan/work/dvc-icodegen/models/rnn_vocab10000_embed256_units1024"
)
loss = model.model.evaluate(dataset, verbose=2)

In [None]:
# Generate NUM_TOKENS tokens from the reloaded model at temperature 0.1 and
# print the resulting text.
NUM_TOKENS = 100
text = model.generate(NUM_TOKENS, temperature=0.1)
print(text)

In [None]:
# Two-configuration training run against the data under /tmp (the location
# used by the download/process cells above), saving models to /tmp/models.
_RNN_CONFIGs = [VANILLA_CONFIG, GRU_CONFIG_1]
train(
    data_path="/tmp",
    out_path="/tmp/models",
    epochs=10,
    max_length=100,
    batch_size=8,
)

In [None]:
# NOTE(review): `Tokenizer` is already imported in the first export cell.
from tokenizers import Tokenizer

# Load the tokenizer file stored inside the vanilla RNN model folder.
tokenizer = Tokenizer.from_file(
    str(Path("/tmp/models/rnn_vocab10000_embed256_units1024") / "tokenizer.json")
)

In [None]:
# First 100 training rows, converted with max_length=100 and batch_size=1.
df_trn = pd.read_json(
    Path("/tmp") / "codesearchnet_java" / "train.jsonl", orient="records", lines=True
)[:100]
dataset = convert_df_to_tfds(df_trn, tokenizer, 100, 1)

In [None]:
# Reload the vanilla RNN model saved under /tmp/models and evaluate it on
# `dataset` from the previous cell.
model = RNNModel.from_path("/tmp/models/rnn_vocab10000_embed256_units1024")
loss = model.model.evaluate(dataset, verbose=2)

In [None]:
# Generate NUM_TOKENS tokens from the reloaded model at temperature 0.1 and
# print the resulting text.
NUM_TOKENS = 100
text = model.generate(NUM_TOKENS, temperature=0.1)
print(text)

In [None]:
# export
# Transformations applied to the CodeSearchNet test set in `_long_range`.
# Each key is appended to "codesearchnet_" to name the corresponding result.
_TRANSFORMs = {
    "randomized_tokens": code_token_randomizer,
    "randomized_lines": line_randomizer,
    "comments_removed": java_comment_remover,
}

In [None]:
# export
def _get_metrics(df, model):
    """Evaluate `model` on `df` and return the three metrics keyed by name."""
    # Dict values are evaluated top to bottom, so the metric functions run in
    # the order they are listed here.
    return {
        "mean_probs": get_mean_probs(df, model),
        "dist_mean": mean_dist_probs(df, model),
        "mean_cross_entropy": get_mean_cross_entropy(df, model),
    }


def _long_range(data_dir, model, n=None):
    """
    Collect `_get_metrics` results for every evaluation dataset.

    Reads `buggy.jsonl`, `fixed.jsonl` and `codesearchnet_java/test.jsonl`
    from `data_dir`, keeping the first `n` rows of each (all rows when `n` is
    None). The CodeSearchNet data is scored as-is and once per entry of
    `_TRANSFORMs`. Returns a dict mapping dataset name to its metrics.
    """
    results = {}

    # Bug-fix pairs: the buggy and the fixed side are scored separately.
    for key, filename in (("buggy", "buggy.jsonl"), ("fixed", "fixed.jsonl")):
        df_pairs = pd.read_json(data_dir / filename, orient="records", lines=True)[:n]
        results[key] = _get_metrics(df_pairs, model)
        # Release the frame before the next one is read.
        del df_pairs

    test_path = data_dir / "codesearchnet_java" / "test.jsonl"
    df_codesearchnet = pd.read_json(test_path, orient="records", lines=True)[:n]
    results["codesearchnet_original"] = _get_metrics(df_codesearchnet, model)

    # Score every transformed variant of the same CodeSearchNet rows.
    for label, transform_fn in _TRANSFORMs.items():
        df_transformed = transform_df(df_codesearchnet, transform_fn)
        results["codesearchnet_" + label] = _get_metrics(df_transformed, model)
        del df_transformed

    return results

In [None]:
# hide
# Setting up testing data
# NOTE(review): this import sits mid-notebook rather than in the import cell.
from transformers import TFGPT2LMHeadModel

# Using tiny-gpt2 for just quick tests since it is... tiny :)
# tokenizer = GPT2Tokenizer.from_pretrained("sshleifer/tiny-gpt2")
trnsfr = TFGPT2LMHeadModel.from_pretrained("sshleifer/tiny-gpt2")
# NOTE(review): `tokenizer` is not defined in this cell; it is whatever an
# earlier cell left in the kernel — confirm it matches the tiny-gpt2 model.
model = TransformerModel(tokenizer, trnsfr)

In [None]:
# Seed Python's RNG before the run (presumably for the randomized transforms —
# confirm), then score the first 100 rows of each dataset under /tmp.
random.seed(42)
long_range_results = _long_range(Path("/tmp"), model, n=100)
# print(long_range_results)

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the import cell.
import pprint

# Pretty-print the full nested results dictionary.
pprint.pprint(long_range_results)

In [None]:
long_range_results["codesearchnet_original"]["mean_cross_entropy"]

In [None]:
# Mean cross entropy for each transformed variant of the CodeSearchNet test
# data, for comparison with the untransformed value.
print(
    long_range_results["codesearchnet_randomized_tokens"]["mean_cross_entropy"],
    long_range_results["codesearchnet_randomized_lines"]["mean_cross_entropy"],
    long_range_results["codesearchnet_comments_removed"]["mean_cross_entropy"],
)

In [None]:
# export
def _counterfactual(control_results, treatment_results):
    pass

In [None]:
# export
@call_parse
def evaluate(
    data_dir: Param(
        "The directory containing buggy.jsonl, fixed.jsonl and codesearchnet_java/test.jsonl.",
        str,
    ),
    model_dir: Param(
        "The directory containing one sub-directory per trained model.", str
    ),
    out_dir: Param(
        "The directory intended for the evaluation results (nothing is written yet).",
        str,
    ),
):
    """Function for evaluating models related to the library."""
    random.seed(seed)
    tf.random.set_seed(seed)

    data_dir = Path(data_dir)
    model_dir = Path(model_dir)
    out_dir = Path(out_dir)

    results = defaultdict(dict)
    # These model folders will need to contain the config of the model as well
    # to differentiate them
    # Iterate over directories only: before Python 3.11 `glob("*/")` also
    # yields plain files, and `train` saves `tokenizer.json` next to the
    # model folders.
    for m_path in sorted(p for p in model_dir.iterdir() if p.is_dir()):
        # Match on the folder-name prefix, case-insensitively: saved model
        # folders look like `rnn_vocab10000_embed256_units1024`, which an
        # exact comparison against "RNN"/"GRU" never matches.
        kind = m_path.name.lower()
        if kind.startswith("transformer"):
            model = TransformerModel.from_path(m_path)
        elif kind.startswith(("gru", "rnn")):
            model = RNNModel.from_path(m_path)
        else:
            # Without a model there is nothing to evaluate; skip the folder
            # instead of passing `None` into the metric functions.
            logging.warning(f"Skipping unrecognized model folder {m_path}.")
            continue

        # Long-Range Interactions
        results[m_path.name]["long_range"] = _long_range(data_dir, model)

        # TODO: Counterfactuals
        # results[m_path.name]["counterfactual"] = _counterfactual(
        #     control_results, treatment_results
        # )

    # TODO: Save `results` in json format under `out_dir`.

In [None]:
# export
@call_parse
def reproduce(
    out_dir: Param(
        "The output directory to download, extract, and save all files to.", str
    )
):
    """Function for reproducing results related to the library."""
    random.seed(seed)
    tf.random.set_seed(seed)

    download_data(out_dir)
    process_data(out_dir)

In [None]:
# hide
# Run nbdev's notebook-to-script export.
from nbdev.export import notebook2script

notebook2script()