In [None]:
# default_exp cli

# CLI

> Contains all the CLI functions that this library provides.

Some of the data and code are taken from https://github.com/github/CodeSearchNet (license and citation below).

```
MIT License

Copyright (c) 2019 GitHub

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```

```
@article{husain2019codesearchnet,
  title={{CodeSearchNet} challenge: Evaluating the state of semantic code search},
  author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},
  journal={arXiv preprint arXiv:1909.09436},
  year={2019}
}
```

In [1]:
# export
import logging
import random

import tensorflow as tf

from fastcore.script import call_parse, Param
from icodegen.data.core import (
    _download_data,
    process_data,
)
from icodegen.model.core import train
from icodegen.evaluation.core import evaluate
from pathlib import Path

# Seed handed to the random number generators inside `reproduce`.
seed = 115

# Root logger at INFO level so progress messages of the pipeline are emitted.
logger = logging.getLogger(name=None)
logger.setLevel(level=logging.INFO)

In [2]:
# export
@call_parse
def download_data(
    out_path: Param("The output path to download and extract all files to.", str)
):
    """
    Download all the data needed to reproduce our study.

    `out_path` is given as a string, converted to a `Path`, and handed to
    `_download_data`, which performs the actual work.
    """
    _download_data(Path(out_path))

In [None]:
# Smoke test: download everything, then check the expected artifacts exist.
data_path = Path("/tmp/data")
download_data(str(data_path))

# One sub-directory per dataset.
bigclone_path = data_path / "bigclonebenchmark"
bugfix_path = data_path / "bug_fix"
codesearchnet_path = data_path / "codesearchnet"

# Files and folders the download step is expected to leave on disk.
expected_paths = [
    bigclone_path / "bigclonebenchmark_lg.csv",
    bigclone_path / "bigclonebenchmark_sm.csv",
    bugfix_path / "bug_fix_pairs.zip",
    bugfix_path / "50-100/buggy",
    bugfix_path / "50-100/fixed",
    codesearchnet_path / "codesearchnet_java",
]
for expected in expected_paths:
    assert expected.exists()

In [3]:
# export
# Hyperparams for reproduction
# `reproduce` looks these names up when it is called and forwards them to
# `train`, so rebinding them before the call changes the training setup.
EPOCHs = 100  # forwarded to `train` as `epochs`
MAX_LEN = 256  # forwarded to `train` as `max_length`
BS = 64  # forwarded to `train` as `batch_size`

In [4]:
# export
@call_parse
def reproduce(
    out_path: Param(
        "The output directory to download, extract, and save all files to.", str
    ),
    n: Param("Number of examples to train on for testing purposes.", int) = None,
    epochs: Param("Number of training epochs; defaults to EPOCHs.", int) = None,
    max_length: Param(
        "Max length forwarded to training; defaults to MAX_LEN.", int
    ) = None,
    batch_size: Param("Training batch size; defaults to BS.", int) = None,
):
    """Function for reproducing results related to the library.

    Seeds `random` and TensorFlow with the module-level `seed`, downloads and
    processes the data under `out_path/data`, trains on
    `out_path/data/codesearchnet` with `out_path/models` as the training
    output directory, and finally calls `evaluate` on the data and models
    directories.

    Args:
        out_path: Root directory for the downloaded data and the models.
        n: Forwarded to `train`; number of examples to train on, `None` to
            leave it unset.
        epochs: Override for the `EPOCHs` module constant.
        max_length: Override for the `MAX_LEN` module constant.
        batch_size: Override for the `BS` module constant.
    """
    # Resolve the fallbacks at call time (not as signature defaults) so that
    # rebinding the module-level constants before the call keeps working.
    if epochs is None:
        epochs = EPOCHs
    if max_length is None:
        max_length = MAX_LEN
    if batch_size is None:
        batch_size = BS

    # Seed both RNG sources before any data handling or training starts.
    random.seed(seed)
    tf.random.set_seed(seed)

    out_path = Path(out_path)
    data_path = out_path / "data"

    _download_data(data_path)
    process_data(data_path)

    trn_data_path = data_path / "codesearchnet"
    models_path = out_path / "models"
    train(
        data_path=trn_data_path,
        out_path=models_path,
        epochs=epochs,
        max_length=max_length,
        batch_size=batch_size,
        n=n,
    )

    evaluate(data_path, models_path)

In [5]:
# Smoke test for `reproduce`.
OUT_PATH = "/tmp/output"
N = 100  # passed as `n`: number of examples to train on
# Rebind the hyperparameter globals that `reproduce` reads when called;
# the values are smaller than the exported ones, presumably to keep this
# run short.
EPOCHs = 2
MAX_LEN = 32
BS = 16

reproduce(OUT_PATH, N)

INFO:root:Downloading BigCloneBenchmark datasets.
INFO:root:Downloading and extracting Bug Fix Pairs dataset.


File exists: /tmp/output/data/bigclonebenchmark/bigclonebenchmark.zip
File exists: /tmp/output/data/bug_fix/bug_fix_pairs.zip


INFO:root:Downloading and extracting CodeSearchNet Challenge dataset.
INFO:root:File exists: /tmp/output/data/codesearchnet
INFO:root:Loading tokenizer from /tmp/output/models/tokenizer.json.
INFO:root:Starting the training of all RNN based models.


Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: /tmp/output/models/rnn_layers1_vocab3078_embed256_units1024/assets


INFO:tensorflow:Assets written to: /tmp/output/models/rnn_layers1_vocab3078_embed256_units1024/assets


Epoch 1/2
Epoch 2/2




INFO:tensorflow:Assets written to: /tmp/output/models/gru_layers1_vocab3078_embed256_units1024/assets


INFO:tensorflow:Assets written to: /tmp/output/models/gru_layers1_vocab3078_embed256_units1024/assets


Epoch 1/2
Epoch 2/2




INFO:tensorflow:Assets written to: /tmp/output/models/gru_layers2_vocab3078_embed256_units1024/assets


INFO:tensorflow:Assets written to: /tmp/output/models/gru_layers2_vocab3078_embed256_units1024/assets


Epoch 1/2
Epoch 2/2




INFO:tensorflow:Assets written to: /tmp/output/models/gru_layers3_vocab3078_embed256_units1024/assets


INFO:tensorflow:Assets written to: /tmp/output/models/gru_layers3_vocab3078_embed256_units1024/assets


Epoch 1/2
Epoch 2/2




INFO:tensorflow:Assets written to: /tmp/output/models/gru_layers1_vocab3078_embed256_units512/assets


INFO:tensorflow:Assets written to: /tmp/output/models/gru_layers1_vocab3078_embed256_units512/assets


Epoch 1/2
Epoch 2/2




INFO:tensorflow:Assets written to: /tmp/output/models/gru_layers1_vocab3078_embed256_units2048/assets


INFO:tensorflow:Assets written to: /tmp/output/models/gru_layers1_vocab3078_embed256_units2048/assets
INFO:root:Starting the training of all Transformer based models.


/tmp/output/models/gru_layers1_vocab3078_embed256_units2048


KeyboardInterrupt: 

In [None]:
# hide
# Run nbdev's notebook-to-script export; this lives in a `# hide` cell at the
# end of the notebook.
from nbdev.export import notebook2script

notebook2script()