# An usage example of NaturalCC

Task: Code Completion <br>
Dataset: Ruby dataset of [CodeXGLUE (Feng et. al., 2020)](https://arxiv.org/pdf/2002.08155.pdf) <br>
Model: SeqRNN <br>

## Step 1. Download dataset.

In [1]:
import os
import gdown

from ncc import LOGGER
from ncc import __NCC_DIR__
from ncc.utils.path_manager import PathManager

# CodeSearchNet(feng) dataset
DATASET_DIR = os.path.join(__NCC_DIR__, "demo")
DATASET_URL = "https://drive.google.com/uc?id=1rd2Tc6oUWBo7JouwexW3ksQ0PaOhUr6h"
out_file = os.path.join(DATASET_DIR, "Cleaned_CodeSearchNet.zip")
if not PathManager.exists(out_file):
    gdown.download(DATASET_URL, output=out_file)
LOGGER.info(f"Dataset has been downloaded at {out_file}")

[32m[2021-11-27 23:35:44]    INFO >> Dataset has been downloaded at /data/ncc_data/demo/Cleaned_CodeSearchNet.zip (4173055814.py:14, <module>())[0m


## Step 2. Pre-processing dataset and save it into MMAP format.


### 1) inflate data

In [2]:
import zipfile

DATA_DIR = os.path.join(DATASET_DIR, "completion")
with zipfile.ZipFile(out_file, "r") as writer:
    writer.extractall(path=DATASET_DIR)

LOGGER.info(f"Inflating data at {DATASET_DIR}")

[32m[2021-11-27 23:35:55]    INFO >> Inflating data at /data/ncc_data/demo (2210737368.py:7, <module>())[0m


### 2) use CodeBERT BPE dictionary to tokenize CodeSearchNet(feng)-ruby codes

#### load CodeBERT BPE dictionary

In [3]:
from ncc.data.dictionary import TransformersDictionary

vocab = TransformersDictionary.from_pretrained("microsoft/codebert-base", do_lower_case=False)

#### tokenization & dump

In [4]:
import torch
import ujson
from tqdm import tqdm
from ncc.data.indexed_dataset import MMapIndexedDatasetBuilder


def total_lines(reader):
    num = sum(1 for _ in reader)
    reader.seek(0)
    return num


SRC_DIR = os.path.join(DATASET_DIR, "CodeSearchNet", "ruby")
for mode in ["train", "valid", "test"]:
    SRC_FILE = os.path.join(SRC_DIR, f"{mode}.jsonl")
    DST_FILE = os.path.join(SRC_DIR, mode)
    mmap_dataset_builder = MMapIndexedDatasetBuilder(f"{DST_FILE}.mmap")

    with open(SRC_FILE, 'r') as reader:
        for idx, code_snippet in enumerate(tqdm(reader, total=total_lines(reader))):
            code_snippet = ujson.loads(code_snippet)
            raw_code_tokens = code_snippet['code_tokens']
            after_code_tokens = vocab.subtokenize(raw_code_tokens)
            if idx == 0:
                print()
                LOGGER.info(f"Show a case of {SRC_FILE}")
                LOGGER.info(f"Before BPE, '{raw_code_tokens}'")
                LOGGER.info(f"After BPE, '{after_code_tokens}'")
            tensor = torch.IntTensor(vocab.tokens_to_indices(after_code_tokens))
            mmap_dataset_builder.add_item(tensor)
    mmap_dataset_builder.finalize(f"{DST_FILE}.idx")

  0%|                                                 | 0/24927 [00:00<?, ?it/s][32m[2021-11-27 23:36:14]    INFO >> Show a case of /data/ncc_data/demo/CodeSearchNet/ruby/train.jsonl (1291166181.py:26, <module>())[0m
[32m[2021-11-27 23:36:14]    INFO >> Before BPE, '['def', 'render_body', '(', 'context', ',', 'options', ')', 'if', 'options', '.', 'key?', '(', ':partial', ')', '[', 'render_partial', '(', 'context', ',', 'options', ')', ']', 'else', 'StreamingTemplateRenderer', '.', 'new', '(', '@lookup_context', ')', '.', 'render', '(', 'context', ',', 'options', ')', 'end', 'end']' (1291166181.py:27, <module>())[0m
[32m[2021-11-27 23:36:14]    INFO >> After BPE, '['def', 'Ġrender', '_', 'body', 'Ġ(', 'Ġcontext', 'Ġ,', 'Ġoptions', 'Ġ)', 'Ġif', 'Ġoptions', 'Ġ.', 'Ġkey', '?', 'Ġ(', 'Ġ:', 'partial', 'Ġ)', 'Ġ[', 'Ġrender', '_', 'partial', 'Ġ(', 'Ġcontext', 'Ġ,', 'Ġoptions', 'Ġ)', 'Ġ]', 'Ġelse', 'ĠStreaming', 'Template', 'R', 'end', 'erer', 'Ġ.', 'Ġnew', 'Ġ(', 'Ġ@', 'look', 'up', '_', '




100%|████████████████████████████████████| 24927/24927 [00:58<00:00, 423.19it/s]
  0%|                                                  | 0/1400 [00:00<?, ?it/s][32m[2021-11-27 23:37:13]    INFO >> Show a case of /data/ncc_data/demo/CodeSearchNet/ruby/valid.jsonl (1291166181.py:26, <module>())[0m
[32m[2021-11-27 23:37:13]    INFO >> Before BPE, '['def', 'preparse', '(', 'unparsed', ',', 'args', '=', '[', ']', ',', 'opts', '=', '{', '}', ')', 'case', 'unparsed', 'when', 'Hash', 'then', 'opts', '.', 'merge!', 'unparsed', 'when', 'Array', 'then', 'unparsed', '.', 'each', '{', '|', 'e', '|', 'preparse', '(', 'e', ',', 'args', ',', 'opts', ')', '}', 'else', 'args', '<<', 'unparsed', '.', 'to_s', 'end', '[', 'args', ',', 'opts', ']', 'end']' (1291166181.py:27, <module>())[0m
[32m[2021-11-27 23:37:13]    INFO >> After BPE, '['def', 'Ġprepar', 'se', 'Ġ(', 'Ġunp', 'ars', 'ed', 'Ġ,', 'Ġargs', 'Ġ=', 'Ġ[', 'Ġ]', 'Ġ,', 'Ġopt', 's', 'Ġ=', 'Ġ{', 'Ġ}', 'Ġ)', 'Ġcase', 'Ġunp', 'ars', 'ed', 'Ġwhen',




100%|██████████████████████████████████████| 1400/1400 [00:03<00:00, 433.57it/s]
  0%|                                                  | 0/1261 [00:00<?, ?it/s][32m[2021-11-27 23:37:16]    INFO >> Show a case of /data/ncc_data/demo/CodeSearchNet/ruby/test.jsonl (1291166181.py:26, <module>())[0m
[32m[2021-11-27 23:37:16]    INFO >> Before BPE, '['def', 'print_summary', '(', 'status', ')', 'status_string', '=', 'status', '.', 'to_s', '.', 'humanize', '.', 'upcase', 'if', 'status', '==', ':success', 'heading', '(', '"Result: "', ',', 'status_string', ',', ':green', ')', 'level', '=', ':info', 'elsif', 'status', '==', ':timed_out', 'heading', '(', '"Result: "', ',', 'status_string', ',', ':yellow', ')', 'level', '=', ':fatal', 'else', 'heading', '(', '"Result: "', ',', 'status_string', ',', ':red', ')', 'level', '=', ':fatal', 'end', 'if', '(', 'actions_sentence', '=', 'summary', '.', 'actions_sentence', '.', 'presence', ')', 'public_send', '(', 'level', ',', 'actions_sentence', ')', '




100%|██████████████████████████████████████| 1261/1261 [00:03<00:00, 412.95it/s]


## Step 3. Design your model in NaturalCC <br>
*You should ensure task, model, dataset meet your requirements.*


### 1) design your dataset


In [5]:
from ncc.data.ncc_dataset import NccDataset


def collate(samples, pad_idx, ):
    from ncc.data.tools import data_utils
    def merge(key):
        return data_utils.collate_tokens(
            [s[key] for s in samples],
            pad_idx,
        )

    src_tokens = merge('source')
    tgt_tokens = merge('target')
    return {
        'id': [s['id'] for s in samples],
        'net_input': {
            'src_tokens': src_tokens,
        },
        'target': tgt_tokens,
    }


class DemoDataset(NccDataset):
    def __init__(self, dict, data, sizes):
        self.dict = dict
        self.data = data
        self.sizes = sizes
        self.pad = dict.pad()

    def __getitem__(self, index):
        src_item = self.data[index][:-1]
        tgt_item = self.data[index][1:]
        example = {
            'id': index,
            'source': src_item,
            'target': tgt_item,
        }
        return example

    def __len__(self):
        return len(self.data)

    def ordered_indices(self):
        import numpy as np
        return np.random.permutation(len(self))

    def collater(self, samples):
        return collate(samples, pad_idx=self.pad)

    def num_tokens(self, index):
        # Return the number of tokens in a sample.
        return self.sizes[index]

    def size(self, index):
        # Return an example's size.
        return self.sizes[index]


### 2) register your task


In [6]:
from ncc.tasks import NccTask, register_task


@register_task('demo')
class DemoTask(NccTask):
    def __init__(self, dictionary):
        super(DemoTask, self).__init__(args=None)
        self.dictionary = dictionary

    def load_dataset(self, split, data_file):
        # define your loading rules
        from ncc.data.indexed_dataset import MMapIndexedDataset
        from ncc.data.wrappers import TruncateDataset
        # truncate code with a length of 128 + 1
        dataset = TruncateDataset(
            MMapIndexedDataset(data_file),
            truncation_length=128 + 1,
        )
        datasizes = dataset.sizes
        self.datasets[split] = DemoDataset(self.dictionary, dataset, datasizes)

Using backend: pytorch


### 3) register your model

In [7]:
from ncc.models import register_model
from ncc.modules.base.layers import (
    Embedding, Linear, LSTM
)
from ncc.models.ncc_model import NccLanguageModel


@register_model("demo")
class DemoModel(NccLanguageModel):
    def __init__(self, dictionary, decoder):
        super().__init__(decoder)
        self.dictionary = dictionary

    @classmethod
    def build_model(cls, dictionary):
        from ncc.modules.decoders.ncc_decoder import NccDecoder

        class DemoDecoder(NccDecoder):
            def __init__(self, dictionary):
                super(DemoDecoder, self).__init__(dictionary)
                self.embedding = Embedding(len(dictionary), embedding_dim=512, padding_idx=dictionary.pad())
                self.out_projector = Linear(512, len(dictionary))
                # share embedding weight
                self.out_projector.weight = self.embedding.weight
                self.lstm = LSTM(512, 512)

            def forward(self, src_tokens, **kwargs):
                x = self.embedding(src_tokens)  # B, L-1, E
                x, _ = self.lstm(x)
                x = self.out_projector(x)
                return x

        decoder = DemoDecoder(dictionary)
        return cls(dictionary, decoder=decoder)

    def forward(self, src_tokens, **kwargs):
        return self.decoder.forward(src_tokens, **kwargs)

### 4) load datasets


In [8]:
task = DemoTask(dictionary=vocab)
for mode in ["train", "valid", "test"]:
    task.load_dataset(split=mode, data_file=os.path.join(SRC_DIR, "train"))

## Step 4. Train & inference

### 1) train


In [9]:

model = DemoModel.build_model(dictionary=vocab)
if torch.cuda.is_available():
    model = model.cuda()

from torch.optim import Adam

optimizer = Adam(lr=1e-3, params=model.parameters())


import torch.nn.functional as F
from ncc.utils.utils import move_to_cuda

# train
BATCH_SIZE = 4
train_iter = task.get_batch_iterator(dataset=task.dataset("train"), max_sentences=BATCH_SIZE). \
    next_epoch_itr(shuffle=True)
model.train()
for idx in range(5):
    batch = next(train_iter)
    if torch.cuda.is_available():
        batch = move_to_cuda(batch)
    logits = model.forward(**batch['net_input'])
    lprobs = torch.log_softmax(logits, dim=-1).view(-1, logits.size(-1))
    golds = batch['target'].view(-1)
    # ignore pad
    loss = F.nll_loss(lprobs, golds, ignore_index=vocab.pad())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    LOGGER.info(f"CrossEntropy loss: {loss.item():.4f}")

  return torch.from_numpy(np_array)
[32m[2021-11-27 23:37:22]    INFO >> CrossEntropy loss: 10.8253 (829832851.py:30, <module>())[0m
[32m[2021-11-27 23:37:23]    INFO >> CrossEntropy loss: 10.8194 (829832851.py:30, <module>())[0m
[32m[2021-11-27 23:37:23]    INFO >> CrossEntropy loss: 10.7962 (829832851.py:30, <module>())[0m
[32m[2021-11-27 23:37:23]    INFO >> CrossEntropy loss: 10.7495 (829832851.py:30, <module>())[0m
[32m[2021-11-27 23:37:23]    INFO >> CrossEntropy loss: 10.7122 (829832851.py:30, <module>())[0m


### 2) inference

In [10]:
BATCH_SIZE = 4
test_iter = task.get_batch_iterator(dataset=task.dataset("test"), max_sentences=BATCH_SIZE). \
    next_epoch_itr(shuffle=False)
model.eval()
batch = next(test_iter)
if torch.cuda.is_available():
    batch = move_to_cuda(batch)
logits = model.forward(**batch['net_input'])
# ignore pad
valid_indices = batch['net_input']['src_tokens'].view(-1) != task.dictionary.pad()
lprobs = torch.log_softmax(logits, dim=-1).view(-1, logits.size(-1))
lprobs = lprobs[valid_indices]
golds = batch['target'].view(-1)
golds = golds[valid_indices]
ranks = (lprobs >= lprobs[:, golds].diag().unsqueeze(dim=-1)).sum(-1)
mrr = 1. / ranks
mrr[ranks > 10] = 0.
mrr = mrr.sum().float().item()
LOGGER.info(f"MRR@10: {loss.item():.4f}")

[32m[2021-11-27 23:37:23]    INFO >> MRR@10: 10.7122 (4291989905.py:19, <module>())[0m
