In [None]:
# default_exp cli

# CLI

> Contains all the CLI functions that this library provides.

Some data and code are taken from https://github.com/github/CodeSearchNet

```
MIT License

Copyright (c) 2019 GitHub

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```

```
@article{husain2019codesearchnet,
  title={{CodeSearchNet} challenge: Evaluating the state of semantic code search},
  author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},
  journal={arXiv preprint arXiv:1909.09436},
  year={2019}
}
```

In [None]:
# export
import gdown
import io
import logging
import requests
import zipfile

import pandas as pd

from fastcore.script import call_parse, Param
from icodegen.data.transforms import (
    code_token_randomizer,
    line_randomizer,
    java_comment_remover,
    transform_df,
)
from icodegen.evaluation.core import (
    get_mean_probs,
    mean_dist_probs,
    get_mean_cross_entropy,
)
from icodegen.model.core import GRUModel, TransformerModel
from pathlib import Path

# `logging.getLogger()` with no name returns the root logger; its level is set
# to INFO so the `logging.info(...)` progress messages below are not filtered
# out by level.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

## TODO Standardize naming convention to use `path` instead of `dir`

In [None]:
# export
# Download locations of the datasets fetched by `download_data`,
# keyed by dataset name.
URLs = dict(
    # Hosted on Google Drive (fetched through `gdown`).
    bigclonebenchmark_lg="https://drive.google.com/uc?id=1-4LPiiKGR5Zmg-TLqZEkRbRIdg7UlJQb",
    bigclonebenchmark_sm="https://drive.google.com/uc?id=1FCq0lSs4oqc3jpSoucsHlRqjmbVwdRQ9",
    bug_fix_pairs="https://drive.google.com/uc?id=1XEhnsQ3Uy6SnFz349I0Iu9lz4ggAaiQp",
    # Hosted on S3 (fetched through `requests`).
    codesearchnet_java="https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip",
)

In [None]:
# export
@call_parse
def download_data(
    out_dir: Param("The output directory to download and extract all files to.", str)
):
    """
    Function for downloading all the data to reproduce our study.

    Downloads every dataset listed in `URLs` into `out_dir`:

    * the two BigCloneBenchmark files are saved as
      `bigclonebenchmark_lg.csv` and `bigclonebenchmark_sm.csv`,
    * the Bug Fix Pairs archive is saved as `bug_fix_pairs.zip` and extracted,
      after which the nested `datasets/50-100/source_code.zip` is extracted
      into `out_dir`,
    * the CodeSearchNet Java archive is extracted into
      `out_dir / "codesearchnet_java"`.

    Raises `requests.HTTPError` if the CodeSearchNet download does not return
    a successful HTTP status.
    """
    out_dir = Path(out_dir)
    # Make sure the destination exists before anything is written into it.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Download bigclonebenchmark_lg and bigclonebenchmark_sm
    logging.info("Downloading BigCloneBenchmark datasets.")
    gdown.download(
        URLs["bigclonebenchmark_lg"], str(out_dir / "bigclonebenchmark_lg.csv")
    )
    gdown.download(
        URLs["bigclonebenchmark_sm"], str(out_dir / "bigclonebenchmark_sm.csv")
    )

    # Download Bug Fix Pairs
    logging.info("Downloading and extracting Bug Fix Pairs dataset.")
    gdown.cached_download(
        URLs["bug_fix_pairs"],
        str(out_dir / "bug_fix_pairs.zip"),
        postprocess=gdown.extractall,
    )
    # The outer archive is expected to contain a nested archive at this path;
    # it holds the actual source files.
    with zipfile.ZipFile(
        str(out_dir / "datasets" / "50-100" / "source_code.zip"), "r"
    ) as zip_ref:
        zip_ref.extractall(out_dir)

    # from https://stackoverflow.com/a/14260592/5768407 by users
    # yoavram (https://stackoverflow.com/users/1063612/yoavram) and
    # kamran kausar (https://stackoverflow.com/users/3486460/kamran-kausar)
    logging.info("Downloading and extracting CodeSearchNet Challenge dataset.")
    r = requests.get(URLs["codesearchnet_java"])
    # Fail with a clear HTTP error instead of handing an error page to ZipFile.
    r.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall(out_dir / "codesearchnet_java")

In [None]:
# download_data("/tmp")

# assert Path("/tmp/bigclonebenchmark_lg.csv").exists()
# assert Path("/tmp/bigclonebenchmark_sm.csv").exists()

# assert Path("/tmp/bug_fix_pairs.zip").exists()
# assert Path("/tmp/50-100/buggy").exists()
# assert Path("/tmp/50-100/fixed").exists()

In [None]:
# export
def _process_bigclonebenchmark(path):
    pass

In [None]:
# df = pd.read_csv("/tmp/bigclonebenchmark_lg.csv")
# df.head()

In [None]:
# df.clone_type.value_counts()

In [None]:
# # export
def _process_bug_fix(path):
    buggy_paths = sorted((path / "50-100").glob("buggy/*.java"))
    fixed_paths = sorted((path / "50-100").glob("fixed/*.java"))
    bugs = []
    fixes = []
    for bug_p, fix_p in zip(buggy_paths, fixed_paths):
        with open(bug_p, "r") as f:
            bugs.append(f.read())

        with open(fix_p, "r") as f:
            fixes.append(f.read())

    df_buggy = pd.DataFrame(bugs, columns=["code"])
    df_fixed = pd.DataFrame(fixes, columns=["code"])

    # Saving to jsonl because csv formatting is causing issues with quoting
    df_buggy.to_json(path / "buggy.jsonl", orient="records", lines=True)
    df_fixed.to_json(path / "fixed.jsonl", orient="records", lines=True)


# #     df = pd.DataFrame(zip(bugs, fixes), columns=["buggy", "fixed"])
# #     # Saving to jsonl because csv formatting is causing issues with quoting
# #     df.to_json(path / "bug_fix_pairs.jsonl", orient="records", lines=True)

In [None]:
# _process_bug_fix(Path("/tmp"))

# assert Path("/tmp/buggy.jsonl").exists()
# assert Path("/tmp/fixed.jsonl").exists()

In [None]:
# BUGGY_MTHD = """\
# private void success(io.netty.channel.Channel channel) {
#     org.mycat.netty.mysql.MySQLHandshakeHandler.logger.debug("success info return form MySQLHandshakeHandler");
#     io.netty.buffer.ByteBuf out = channel.alloc().buffer();
#     org.mycat.netty.mysql.OK ok = new org.mycat.netty.mysql.OK();
#     ok.sequenceId = 2;
#     ok.setStatusFlag(Flags.SERVER_STATUS_AUTOCOMMIT);
#     out.writeBytes(ok.toPacket());
#     channel.writeAndFlush(out);
# }"""
# FIXED_MTHD = """\
# private void success(io.netty.channel.Channel channel) {
#     org.mycat.netty.mysql.MySQLHandshakeHandler.logger.info("success info return form MySQLHandshakeHandler");
#     io.netty.buffer.ByteBuf out = channel.alloc().buffer();
#     org.mycat.netty.mysql.OK ok = new org.mycat.netty.mysql.OK();
#     ok.sequenceId = 2;
#     ok.setStatusFlag(Flags.SERVER_STATUS_AUTOCOMMIT);
#     out.writeBytes(ok.toPacket());
#     channel.writeAndFlush(out);
# }"""
# df = pd.read_json("/tmp/bug_fix_pairs.jsonl", orient="records", lines=True)

# assert BUGGY_MTHD == df.buggy.values[0] and FIXED_MTHD == df.fixed.values[0]

In [None]:
# export
def _jsonl_list_to_dataframe(file_list, columns=None):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat(
        [
            pd.read_json(f, orient="records", compression="gzip", lines=True)[columns]
            for f in file_list
        ],
        sort=False,
    )


def _process_codesearchnet(path):
    """
    Convert each CodeSearchNet data split into a single jsonl file.

    For every split (`train`, `valid`, `test`) the `.gz` shards found under
    `path/java/final/jsonl/<split>` are loaded, reduced to their `code`
    column, and written to `path/<split>.jsonl`.
    """
    jsonl_root = path / "java" / "final" / "jsonl"
    for split in ("train", "valid", "test"):
        shards = sorted((jsonl_root / split).glob("**/*.gz"))
        split_df = _jsonl_list_to_dataframe(shards, ["code"])
        # jsonl is used because csv formatting is causing issues with quoting
        split_df.to_json(path / f"{split}.jsonl", orient="records", lines=True)

In [None]:
# export
@call_parse
def process_data(
    down_dir: Param(
        "The directory where all the files were downloaded and extracted to.", str
    )
):
    """
    Function for processing data related to the library.

    Runs the CodeSearchNet processing on `down_dir / "codesearchnet_java"`
    and then the Bug Fix Pairs processing on `down_dir` itself.
    """
    root = Path(down_dir)

    # CodeSearchNet Challenge data lives in its own sub-directory.
    codesearchnet_root = root / "codesearchnet_java"
    _process_codesearchnet(codesearchnet_root)

    # Bug Fix Pairs data is processed relative to the download root.
    _process_bug_fix(root)

In [None]:
# process_data("/home/jovyan/work/data")

In [None]:
# export
@call_parse
def train(param1: Param("The message", str)):
    """
    Function for training models related to the library.

    Placeholder: `param1` is accepted but nothing is done with it yet.
    """
    return None

In [None]:
# # export
# Transform name -> function from `icodegen.data.transforms`; the name is
# used as the suffix of the "codesearchnet_<name>" result keys.
_TRANSFORMs = dict(
    randomized_tokens=code_token_randomizer,
    randomized_lines=line_randomizer,
    comments_removed=java_comment_remover,
)

In [None]:
# # export
def _long_range(data_dir, models):
    """
    Compute the long-range interaction metrics of every model on every dataset.

    Datasets read from `data_dir`:

    * `buggy.jsonl` and `fixed.jsonl` (keys `"buggy"` and `"fixed"`),
    * `codesearchnet_java/test.jsonl` (key `"codesearchnet_original"`),
    * one transformed copy of the CodeSearchNet test set per entry of
      `_TRANSFORMs` (keys `"codesearchnet_<transform name>"`).

    Returns a dict mapping each dataset key to a list with one row per model,
    in the order of `models`; every row is
    `[model, mean_probs, dist_probs, mean_cross_entropy]`.

    Keeping one row per model matters: storing a single row per key would let
    each model overwrite the previous one, leaving only the last model's
    results.
    """

    def _read_jsonl(file_path):
        return pd.read_json(file_path, orient="records", lines=True)

    def _metrics_row(df, model):
        # One result row: the model followed by its three metrics on `df`.
        return [
            model,
            get_mean_probs(df, model),
            mean_dist_probs(df, model),
            get_mean_cross_entropy(df, model),
        ]

    # Read every input up front so a missing file fails before any
    # (potentially expensive) model evaluation starts.
    df_buggy = _read_jsonl(data_dir / "buggy.jsonl")
    df_fixed = _read_jsonl(data_dir / "fixed.jsonl")
    df_codesearchnet = _read_jsonl(data_dir / "codesearchnet_java" / "test.jsonl")

    long_range_results = {}

    def _record(key, df):
        long_range_results[key] = [_metrics_row(df, model) for model in models]

    _record("buggy", df_buggy)
    _record("fixed", df_fixed)
    _record("codesearchnet_original", df_codesearchnet)

    # Transformed frames are built one at a time so that only a single
    # transformed copy is held alongside the original.
    for transform_name, transform_fn in _TRANSFORMs.items():
        df_transformed = transform_df(df_codesearchnet, transform_fn)
        _record("codesearchnet_" + transform_name, df_transformed)

    # how are we gonna save these results?
    # Jsonl with each mean_prob, dist, crossent, model config per line?

    return long_range_results

In [None]:
# export
def _counterfactual(control_results, treatment_results):
    pass

In [None]:
# export
@call_parse
def evaluate(
    data_dir: Param(
        "The directory containing the processed data files to evaluate on.", str
    ),
    model_dir: Param(
        "The directory containing one sub-directory per trained model.", str
    ),
    out_dir: Param("The output directory for the evaluation results.", str),
):
    """
    Function for evaluating models related to the library.

    Loads a model from every sub-directory of `model_dir` named `Transformer`
    or `GRU` (other names, including `RNN`, are skipped) and runs the
    long-range interaction evaluation on the data in `data_dir`.

    The results are not saved or returned yet, and `out_dir` is not used yet.
    """
    data_dir = Path(data_dir)
    model_dir = Path(model_dir)
    out_dir = Path(out_dir)  # not used yet: nothing is written to disk

    # Directory name -> class whose `from_path` loads the model.
    model_loaders = {"Transformer": TransformerModel, "GRU": GRUModel}
    # Sorted so the model order does not depend on directory listing order.
    models = [
        model_loaders[m_path.name].from_path(m_path)
        for m_path in sorted(model_dir.glob("*/"))
        if m_path.name in model_loaders
    ]

    # Long-Range Interactions
    long_range_results = _long_range(data_dir, models)  # noqa: F841

    # Counterfactuals
    # TODO: for each transform in `_TRANSFORMs`, call
    # _counterfactual(control_results, treatment_results)
In [None]:
# export
@call_parse
def reproduce(
    out_dir: Param(
        "The output directory to download, extract, and save all files to.", str
    )
):
    """
    Function for reproducing results related to the library.

    Runs the download step and then the processing step, both on `out_dir`.
    """
    for step in (download_data, process_data):
        step(out_dir)

In [None]:
# hide
# nbdev convention: the final cell runs `notebook2script`, which nbdev uses to
# export the cells marked for export into the library's Python modules.
from nbdev.export import notebook2script

notebook2script()