# Test the `map()` Function to Format Datasets

## Install and Import Dependencies

In [2]:
!pip install -qU datasets pprint

[31mERROR: Could not find a version that satisfies the requirement pprint (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pprint[0m[31m
[0m

In [3]:
from datasets import load_dataset
import pprint
import torch

## Load Dataset

In [4]:
dataset = load_dataset("claudios/code_search_net", "python", split="train[:21]")

Downloading readme:   0%|          | 0.00/13.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/130M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/135M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/125M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

In [5]:
print(dataset)

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url'],
    num_rows: 21
})


## Format Existing Rows to an Alpaca-styled Dataset

> Note: The resultant formatted dataset will not have the text column with the prompts yet.

In [7]:
# EOS Token is required to stop open-ended generation and eventual hallucination
# EOS_TOKEN = tokenizer.eos_token
EOS_TOKEN = "<|eot_id|>"

In [8]:
def formatFunctionSample(sample):
    language = sample['language']
    instruction = f"What does this {language} function do?"
    inputText = sample['func_code_string']
    outputText = sample['func_documentation_string']

    # Returning a dictionary of the new columns we want to create
    return {
        "instruction": instruction,
        "input": inputText,
        "output": outputText
    }

## Use the `map()` Method to Format the Dataset

In [18]:
# Mapping the existing dataset to the new format keeping only the keys of the dictionary we returned
dataset = dataset.map(formatFunctionSample, remove_columns=dataset.column_names)

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

KeyError: 'language'

## Sample Formatted Dataset

In [10]:
len(dataset)

21

In [11]:
pprint.pp(dataset[1])

{'instruction': 'What does this python function do?',
 'input': 'def setparents(self):\n'
          '        """Correct all parent relations for elements within the '
          'scop. There is sually no need to call this directly, invoked '
          'implicitly by :meth:`copy`"""\n'
          '        for c in self:\n'
          '            if isinstance(c, AbstractElement):\n'
          '                c.parent = self\n'
          '                c.setparents()',
 'output': 'Correct all parent relations for elements within the scop. There '
           'is sually no need to call this directly, invoked implicitly by '
           ':meth:`copy`'}


## Add Text Column with the Prompt Representing each Row

In [12]:
# sample = dataset[2]

In [13]:
alpacaFormatString = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [14]:
# Define the function to create the new 'text' column
def createAlpacaFormatString(sample):
    instruction = sample['instruction']
    inputText = sample['input']
    outputText = sample['output']
    text = alpacaFormatString.format(instruction, inputText, outputText) + EOS_TOKEN
    sample['text'] = text
    return sample

# Apply the function to the dataset
dataset = dataset.map(createAlpacaFormatString)

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [16]:
pprint.pp(dataset[0])

{'instruction': 'What does this python function do?',
 'input': 'def addidsuffix(self, idsuffix, recursive = True):\n'
          '        """Appends a suffix to this element\'s ID, and optionally '
          'to all child IDs as well. There is sually no need to call this '
          'directly, invoked implicitly by :meth:`copy`"""\n'
          '        if self.id: self.id += idsuffix\n'
          '        if recursive:\n'
          '            for e in self:\n'
          '                try:\n'
          '                    e.addidsuffix(idsuffix, recursive)\n'
          '                except Exception:\n'
          '                    pass',
 'output': "Appends a suffix to this element's ID, and optionally to all child "
           'IDs as well. There is sually no need to call this directly, '
           'invoked implicitly by :meth:`copy`',
 'text': 'Below is an instruction that describes a task, paired with an input '
         'that provides further context. Write a response t

In [17]:
# Print a sample to verify
# pprint.pprint(formattedDataset[1])
print(dataset[0]['text'])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What does this python function do?

### Input:
def addidsuffix(self, idsuffix, recursive = True):
        """Appends a suffix to this element's ID, and optionally to all child IDs as well. There is sually no need to call this directly, invoked implicitly by :meth:`copy`"""
        if self.id: self.id += idsuffix
        if recursive:
            for e in self:
                try:
                    e.addidsuffix(idsuffix, recursive)
                except Exception:
                    pass

### Response:
Appends a suffix to this element's ID, and optionally to all child IDs as well. There is sually no need to call this directly, invoked implicitly by :meth:`copy`<|eot_id|>
