In [None]:
# default_exp cli

# CLI

> Contains all the CLI functions that this library provides.

Some data and code are taken from https://github.com/github/CodeSearchNet, which is released under the MIT License reproduced below.

```
MIT License

Copyright (c) 2019 GitHub

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```

```
@article{husain2019codesearchnet,
  title={{CodeSearchNet} challenge: Evaluating the state of semantic code search},
  author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},
  journal={arXiv preprint arXiv:1909.09436},
  year={2019}
}
```

In [123]:
# export
import gdown
import io
import logging
import requests
import zipfile

import pandas as pd

from fastcore.script import call_parse, Param
from pathlib import Path

# `getLogger()` without a name returns the root logger; its level is lowered to
# INFO so the progress messages logged by the CLI functions are not filtered out.
# NOTE(review): this runs at import time and therefore changes the root logger
# level for any program that imports this module -- confirm that is intended.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
# export
# Download location of every dataset, keyed by the name `download_data` uses to
# look it up.
URLs = {
    # Hosted on Google Drive (fetched with gdown).
    "bigclonebenchmark_lg": "https://drive.google.com/uc?id=1-4LPiiKGR5Zmg-TLqZEkRbRIdg7UlJQb",
    "bigclonebenchmark_sm": "https://drive.google.com/uc?id=1FCq0lSs4oqc3jpSoucsHlRqjmbVwdRQ9",
    "bug_fix_pairs": "https://drive.google.com/uc?id=1XEhnsQ3Uy6SnFz349I0Iu9lz4ggAaiQp",
    # Hosted on S3 (fetched with requests).
    "codesearchnet_java": "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip",
}

In [124]:
# export
@call_parse
def download_data(
    out_dir: Param("The output directory to download and extract all files to.", str)
):
    """
    Function for downloading all the data to reproduce our study.
    """
    # NOTE: the docstring above is kept short on purpose; further documentation
    # lives in comments.
    #
    # Files produced under `out_dir`:
    #   * bigclonebenchmark_lg.csv and bigclonebenchmark_sm.csv
    #   * bug_fix_pairs.zip, extracted via gdown, plus the contents of the
    #     nested datasets/50-100/source_code.zip
    #   * codesearchnet_java/ holding the extracted CodeSearchNet archive
    # Raises `requests.HTTPError` when the CodeSearchNet download fails.
    out_dir = Path(out_dir)
    # Make sure the target directory exists before anything is written into it.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Download bigclonebenchmark_lg and bigclonebenchmark_sm
    logging.info("Downloading BigCloneBenchmark datasets.")
    gdown.download(
        URLs["bigclonebenchmark_lg"], str(out_dir / "bigclonebenchmark_lg.csv")
    )
    gdown.download(
        URLs["bigclonebenchmark_sm"], str(out_dir / "bigclonebenchmark_sm.csv")
    )

    # Download Bug Fix Pairs
    logging.info("Downloading and extracting Bug Fix Pairs dataset.")
    gdown.cached_download(
        URLs["bug_fix_pairs"],
        str(out_dir / "bug_fix_pairs.zip"),
        postprocess=gdown.extractall,
    )
    # The outer archive contains a nested zip with the actual source files.
    with zipfile.ZipFile(
        str(out_dir / "datasets" / "50-100" / "source_code.zip"), "r"
    ) as zip_ref:
        zip_ref.extractall(out_dir)

    # from https://stackoverflow.com/a/14260592/5768407 by users
    # yoavram (https://stackoverflow.com/users/1063612/yoavram) and
    # kamran kausar (https://stackoverflow.com/users/3486460/kamran-kausar)
    logging.info("Downloading and extracting CodeSearchNet Challenge dataset.")
    response = requests.get(URLs["codesearchnet_java"])
    # Fail with a clear HTTP error instead of a confusing BadZipFile when the
    # server answers with an error page.
    response.raise_for_status()
    # The archive is unpacked straight from memory; the context manager makes
    # sure it is closed afterwards.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(out_dir / "codesearchnet_java")

In [125]:
# download_data("/tmp")

# assert Path("/tmp/bigclonebenchmark_lg.csv").exists()
# assert Path("/tmp/bigclonebenchmark_sm.csv").exists()

# assert Path("/tmp/bug_fix_pairs.zip").exists()
# assert Path("/tmp/50-100/buggy").exists()
# assert Path("/tmp/50-100/fixed").exists()

INFO:root:Downloading BigCloneBenchmark datasets.
Downloading...
From: https://drive.google.com/uc?id=1-4LPiiKGR5Zmg-TLqZEkRbRIdg7UlJQb
To: /tmp/bigclonebenchmark_lg.csv
231MB [00:10, 22.2MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1FCq0lSs4oqc3jpSoucsHlRqjmbVwdRQ9
To: /tmp/bigclonebenchmark_sm.csv
100%|██████████| 921k/921k [00:00<00:00, 11.9MB/s]
INFO:root:Downloading and extracting Bug Fix Pairs dataset.


File exists: /tmp/bug_fix_pairs.zip


INFO:root:Downloading and extracting CodeSearchNet Challenge dataset.


In [None]:
# export
def _process_bigclonebenchmark(path):
    pass

In [141]:
# df = pd.read_csv("/tmp/bigclonebenchmark_lg.csv")
# df.head()

Unnamed: 0,funct_one,funct_two,clone_type
0,"public void uploadFile (File inputFile, String...","public void doGet (HttpServletRequest request,...",3
1,"public void doGet (HttpServletRequest request,...",private JButton getButtonImagen () {\n if (...,3
2,public static boolean decodeFileToFile (String...,"public void doGet (HttpServletRequest request,...",3
3,"public void doGet (HttpServletRequest request,...",private static void writeBinaryFile (String fi...,3
4,public static String stringOfUrl (String addr)...,"public void doGet (HttpServletRequest request,...",3


In [148]:
# df.clone_type.value_counts()

3    79420
2       11
1        6
Name: clone_type, dtype: int64

In [113]:
# export
def _process_bug_fix(path):
    buggy_paths = sorted((path / "50-100").glob("buggy/*.java"))
    fixed_paths = sorted((path / "50-100").glob("fixed/*.java"))
    bugs = []
    fixes = []
    for bug_p, fix_p in zip(buggy_paths, fixed_paths):
        with open(bug_p, "r") as f:
            bugs.append(f.read())

        with open(fix_p, "r") as f:
            fixes.append(f.read())

    df = pd.DataFrame(zip(bugs, fixes), columns=["buggy", "fixed"])
    # Saving to jsonl because csv formatting is causing issues with quoting
    df.to_json(path / "bug_fix_pairs.jsonl", orient="records", lines=True)

In [None]:
# export
def _jsonl_list_to_dataframe(file_list, columns=None):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat(
        [
            pd.read_json(f, orient="records", compression="gzip", lines=True)[columns]
            for f in file_list
        ],
        sort=False,
    )


def _process_codesearchnet(path):
    """
    Turn every CodeSearchNet data split into one jsonl file.

    For each of the splits `train`, `valid` and `test`, all `.gz` files found
    below `<path>/java/final/jsonl/<split>` are loaded, reduced to the `code`
    and `docstring` columns, and written to `<path>/<split>.jsonl`.
    """
    jsonl_root = path / "java" / "final" / "jsonl"
    for split in ("train", "valid", "test"):
        shards = sorted((jsonl_root / split).glob("**/*.gz"))
        split_df = _jsonl_list_to_dataframe(shards, ["code", "docstring"])
        # Saving to jsonl because csv formatting is causing issues with quoting
        target = path / f"{split}.jsonl"
        split_df.to_json(target, orient="records", lines=True)

In [114]:
# _process_bug_fix(Path("/tmp"))

# assert Path("/tmp/bug_fix_pairs.jsonl").exists()

In [115]:
# BUGGY_MTHD = """\
# private void success(io.netty.channel.Channel channel) {
#     org.mycat.netty.mysql.MySQLHandshakeHandler.logger.debug("success info return form MySQLHandshakeHandler");
#     io.netty.buffer.ByteBuf out = channel.alloc().buffer();
#     org.mycat.netty.mysql.OK ok = new org.mycat.netty.mysql.OK();
#     ok.sequenceId = 2;
#     ok.setStatusFlag(Flags.SERVER_STATUS_AUTOCOMMIT);
#     out.writeBytes(ok.toPacket());
#     channel.writeAndFlush(out);
# }"""
# FIXED_MTHD = """\
# private void success(io.netty.channel.Channel channel) {
#     org.mycat.netty.mysql.MySQLHandshakeHandler.logger.info("success info return form MySQLHandshakeHandler");
#     io.netty.buffer.ByteBuf out = channel.alloc().buffer();
#     org.mycat.netty.mysql.OK ok = new org.mycat.netty.mysql.OK();
#     ok.sequenceId = 2;
#     ok.setStatusFlag(Flags.SERVER_STATUS_AUTOCOMMIT);
#     out.writeBytes(ok.toPacket());
#     channel.writeAndFlush(out);
# }"""
# df = pd.read_json("/tmp/bug_fix_pairs.jsonl", orient="records", lines=True)

# assert BUGGY_MTHD == df.buggy.values[0] and FIXED_MTHD == df.fixed.values[0]

In [None]:
# export
@call_parse
def process_data(
    down_dir: Param(
        "The directory where all the files were downloaded and extracted to.", str
    )
):
    """Function for processing data related to the library."""
    root = Path(down_dir)

    # The CodeSearchNet archive lives in its own sub-directory of the download
    # directory; the processed splits are written there as well.
    _process_codesearchnet(root / "codesearchnet_java")

    # The Bug Fix Pairs files sit directly under the download directory.
    _process_bug_fix(root)

In [None]:
# process_data("/home/jovyan/work/data")

In [None]:
# export
@call_parse
def train(param1: Param("The message", str)):
    """Function for training models related to the library."""
    # Placeholder command: nothing is implemented yet, so `param1` is accepted
    # but not used.
    return None

In [None]:
# export
@call_parse
def evaluate():
    """Function for evaluating models related to the library."""
    # Placeholder command: nothing is implemented yet.
    return None

In [None]:
# export
@call_parse
def reproduce(
    out_dir: Param(
        "The output directory to download, extract, and save all files to.", str
    )
):
    """Function for reproducing results related to the library."""
    # Run the pipeline stages in order, all working on the same directory:
    # first fetch the raw data, then process it.
    for stage in (download_data, process_data):
        stage(out_dir)

In [None]:
# hide
# Development-only cell: nbdev's `notebook2script` converts the project's
# notebooks into the library's Python modules (the cells marked `# export`).
from nbdev.export import notebook2script

notebook2script()