In [1]:
%pip install -q --no-cache transformers datasets nltk sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def calculateSmoothedBLEU(reference, candidate):
    """
    Score a generated string against its ground truth with smoothed sentence-level BLEU-4.

    Both strings are tokenized by splitting on whitespace. The candidate is then
    compared against the single reference with ``sentence_bleu``; no n-gram
    weights are passed, so the library defaults apply, and smoothing is
    ``SmoothingFunction().method1``.

    Args:
    reference (str): The ground-truth string.
    candidate (str): The generated string to evaluate.

    Returns:
    float: The smoothed BLEU score of the candidate against the reference.
    """
    hypothesis = candidate.split()
    # sentence_bleu expects a list of references; there is exactly one here.
    references = [reference.split()]
    smoothing = SmoothingFunction().method1

    return sentence_bleu(references, hypothesis, smoothing_function=smoothing)

In [4]:
# Load the Python test split of CodeSearchNet. Passing `split` returns the
# Dataset directly instead of building a DatasetDict of every split and then
# indexing into it.
data = load_dataset("code_search_net", "python", split="test")
data

Downloading builder script: 100%|██████████| 8.44k/8.44k [00:00<?, ?B/s]
Downloading metadata: 100%|██████████| 18.5k/18.5k [00:00<?, ?B/s]
Downloading readme: 100%|██████████| 12.9k/12.9k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 941M/941M [10:14<00:00, 1.53MB/s]
Downloading data files: 100%|██████████| 1/1 [10:18<00:00, 618.09s/it]
Extracting data files: 100%|██████████| 1/1 [00:13<00:00, 13.40s/it]
Extracting data files: 100%|██████████| 3/3 [00:05<00:00,  1.79s/it]
Generating train split: 100%|██████████| 412178/412178 [02:41<00:00, 2554.40 examples/s]
Generating test split: 100%|██████████| 22176/22176 [00:08<00:00, 2623.99 examples/s]
Generating validation split: 100%|██████████| 23107/23107 [00:10<00:00, 2126.07 examples/s]
  table = cls._concat_blocks(blocks, axis=0)


Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 22176
})

In [5]:
# The tokenizer and the model weights are both loaded from the same checkpoint.
checkpoint = 'Salesforce/codet5-base-multi-sum'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

In [6]:
# Index of the test example to inspect.
idx = 14

# Fetch the single row first; data['column'][idx] would read the whole column
# just to pick one element out of it.
sample = data[idx]
text = sample['func_code_string']
label = sample['func_documentation_string']

# NOTE(review): as the printed sample shows, func_code_string still contains the
# function's docstring, so the label text is also part of the model input -
# confirm whether it should be stripped before summarization.
print(f'Text:\n{text}\n\nLabel:\n{label}')

Text:
def download_by_id(self, vid = '', title = None, output_dir='.', merge=True, info_only=False,**kwargs):
        """self, str->None
        
        Keyword arguments:
        self: self
        vid: The video ID for BokeCC cloud, something like
        FE3BB999594978049C33DC5901307461
        
        Calls the prepare() to download the video.
        
        If no title is provided, this method shall try to find a proper title
        with the information providin within the
        returned content of the API."""

        assert vid

        self.prepare(vid = vid, title = title, **kwargs)

        self.extract(**kwargs)

        self.download(output_dir = output_dir, 
                    merge = merge, 
                    info_only = info_only, **kwargs)

Label:
self, str->None
        
        Keyword arguments:
        self: self
        vid: The video ID for BokeCC cloud, something like
        FE3BB999594978049C33DC5901307461
        
        Calls the prepare() to downl

In [7]:
# Tokenize the source code; return_tensors="pt" requests PyTorch tensors.
encoding = tokenizer(text, return_tensors="pt")
input_ids = encoding.input_ids

# Generate the summary token ids, capped at max_length=20.
generated_ids = model.generate(input_ids, max_length=20)

In [8]:
# Decode the first generated sequence back to text, dropping special tokens.
first_sequence = generated_ids[0]
predicted = tokenizer.decode(first_sequence, skip_special_tokens=True)
predicted

'Download a single entry from the BokeCC cloud by ID.'

In [9]:
# Smoothed BLEU of the generated summary against the reference docstring.
bleu = calculateSmoothedBLEU(label, predicted)
print(bleu)

0.0006213748171322087
