# Environment Setup

We'll begin by installing the necessary libraries:

1. HuggingFace Transformers: Essential for the CodeT5p model.
2. HuggingFace Datasets: Required for loading and preprocessing the dataset.
3. PyTorch Lightning: Used for training the model.
4. Weights and Biases: Implements the logging of training metrics.

In [1]:
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q git+https://github.com/huggingface/transformers.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [3]:
!pip install git+https://github.com/huggingface/accelerate

Collecting git+https://github.com/huggingface/accelerate
  Cloning https://github.com/huggingface/accelerate to /tmp/pip-req-build-jnxvjgma
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate /tmp/pip-req-build-jnxvjgma
  Resolved https://github.com/huggingface/accelerate to commit d087be01566477d99b660526adb7da4ec31abf1d
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: accelerate
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
  Created wheel for accelerate: filename=accelerate-0.22.0.dev0-py3-none-any.whl size=251017 sha256=fa8ac250144a2316203f7b83c4d09f6d6557f410ba4e3c199f71f6c27447a737
  Stored in directory: /tmp/pip-ephem-wheel-cache-4bn5uclx/wheels/f6/c7/9d/1b8a5ca8353d9307733bc719107acb67acdc95063bba749f26
Successfully built accelerate
Installing collect

In [4]:
!pip install ipython-autotime


Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Collecting jedi>=0.16 (from ipython->ipython-autotime)
  Downloading jedi-0.19.0-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, ipython-autotime
Successfully installed ipython-autotime-0.3.1 jedi-0.19.0


In [5]:
%load_ext autotime


time: 324 µs (started: 2023-08-20 16:32:15 +00:00)


In [6]:
from transformers import set_seed
set_seed(42)

time: 6.55 s (started: 2023-08-20 16:32:15 +00:00)


# Preprocessing Data

The CodeXGLUE dataset's "code_to_text" portion is loaded, specifically focusing on the Python programming language examples.

In [7]:
from datasets import load_dataset

dataset = load_dataset("code_x_glue_ct_code_to_text", "python")
print(dataset)

Downloading builder script:   0%|          | 0.00/5.92k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/17.9k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/25.7k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/941M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/251820 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13914 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/14918 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 251820
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 13914
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 14918
    })
})
time: 7min 5s (started: 2023-08-20 16:32:22 +00:00)


In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 251820
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 13914
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 14918
    })
})

time: 2.12 ms (started: 2023-08-20 16:39:27 +00:00)


In [9]:
dataset["train"]

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
    num_rows: 251820
})

time: 1.97 ms (started: 2023-08-20 16:39:27 +00:00)


In [10]:
example = dataset['train'][0]

print("Code:", example["code"])
print("Docstring:", example["docstring"])

Code: def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):
    """
    Trains a k-nearest neighbors classifier for face recognition.

    :param train_dir: directory that contains a sub-directory for each known person, with its name.

     (View in source code to see train_dir example tree structure)

     Structure:
        <train_dir>/
        ├── <person1>/
        │   ├── <somename1>.jpeg
        │   ├── <somename2>.jpeg
        │   ├── ...
        ├── <person2>/
        │   ├── <somename1>.jpeg
        │   └── <somename2>.jpeg
        └── ...

    :param model_save_path: (optional) path to save model on disk
    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified
    :param knn_algo: (optional) underlying data structure to support knn.default is ball_tree
    :param verbose: verbosity of training
    :return: returns knn classifier that was trained on the given data.
    

# Objective: The objective is to create a model that generates docstrings based on the provided code.

**Preparing Code-Docstring Pairs:**

1. Tokenization: Transformer models (like BERT, BART, T5) require integers as input (known as input_ids in HuggingFace Transformers) rather than direct text. These integers correspond to tokens in the model's vocabulary.

2. Contextual Embedding Vectors: The model learns rich vectors for each token, which helps in obtaining quality results. Conversion to input_ids: Both the "Code" and "Docstring" must be turned into input_ids; the former becomes the model's input and the latter serves as labels.

3. Padding and Truncation: Since models are trained in batches, inputs and labels must be of the same length, requiring padding/truncation.

4. Attention Mask: An attention_mask is also added to make sure padding tokens are not considered in attention score computations.

5. Preprocessing Function: Finally, a preprocess_examples function is defined, allowing the entire dataset to be processed according to these requirements.

### To summarize:
- **input:** code, which is turned into input_ids + attention_mask
- **output:** docstrings, which are turned into labels (which are the input_ids of the docstrings).

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer

time: 973 ms (started: 2023-08-20 16:39:27 +00:00)


In [12]:
from transformers import AutoConfig

checkpoint = "Salesforce/codet5p-2b"

# Load the existing config
config = AutoConfig.from_pretrained(checkpoint)

# Modify the properties
config.decoder.decoder_start_token_id = config.bos_token_id


print("Before loading the model:")
print("BOS token ID:", config.bos_token_id)
print("Decoder start token ID:", config.decoder_start_token_id)



Downloading (…)lve/main/config.json:   0%|          | 0.00/5.07k [00:00<?, ?B/s]

Loading Salesforce/codet5p-2b requires to execute some code in that repo, you can inspect the content of the repository at https://hf.co/Salesforce/codet5p-2b. You can dismiss this prompt by passing `trust_remote_code=True`.
Do you accept? [y/N] y


Downloading (…)iguration_codet5p.py:   0%|          | 0.00/4.07k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-2b:
- configuration_codet5p.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Before loading the model:
BOS token ID: None
Decoder start token ID: None
time: 7.36 s (started: 2023-08-20 16:39:28 +00:00)


In [13]:

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, config=config, trust_remote_code=True)

model.config.pad_token_id = tokenizer.pad_token_id


prefix = "Summarize Python: "
max_input_length = 256
max_target_length = 128



Downloading (…)okenizer_config.json:   0%|          | 0.00/284 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

Downloading (…)/modeling_codet5p.py:   0%|          | 0.00/43.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-2b:
- modeling_codet5p.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/6.45G [00:00<?, ?B/s]

time: 1min 13s (started: 2023-08-20 16:39:35 +00:00)


In [14]:
def preprocess_examples(examples):
    codes = examples['code']
    docstrings = examples['docstring']

    inputs = [prefix + code for code in codes]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

    # Encode the summaries
    labels = tokenizer(docstrings, max_length=max_target_length, padding="max_length", truncation=True).input_ids

    # Prepare decoder_input_ids
    decoder_input_ids = model_inputs["input_ids"]  # Set decoder_input_ids as input_ids

    # Important: we need to replace the index of the padding tokens by -100
    labels_with_ignore_index = []
    for labels_example in labels:
        labels_example = [label if label != 0 else -100 for label in labels_example]
        labels_with_ignore_index.append(labels_example)

    model_inputs["labels"] = labels_with_ignore_index
    model_inputs["decoder_input_ids"] = decoder_input_ids

    return model_inputs








time: 743 µs (started: 2023-08-20 16:40:49 +00:00)


After defining the function, we can apply it quickly to the HuggingFace Dataset object using the .map() method with a default batch size of 1,000.

In [15]:
dataset = dataset.map(preprocess_examples, batched=True)

Map:   0%|          | 0/251820 [00:00<?, ? examples/s]

Map:   0%|          | 0/13914 [00:00<?, ? examples/s]

Map:   0%|          | 0/14918 [00:00<?, ? examples/s]

time: 1min 22s (started: 2023-08-20 16:40:49 +00:00)


In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'input_ids', 'attention_mask', 'labels', 'decoder_input_ids'],
        num_rows: 251820
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'input_ids', 'attention_mask', 'labels', 'decoder_input_ids'],
        num_rows: 13914
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'input_ids', 'attention_mask', 'labels', 'decoder_input_ids'],
        num_rows: 14918
    })
})

time: 2.55 ms (started: 2023-08-20 16:42:12 +00:00)


Next, let's set the format to "torch" and create PyTorch dataloaders.

In [17]:
from torch.utils.data import DataLoader

dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=8)
valid_dataloader = DataLoader(dataset['validation'], batch_size=4)
test_dataloader = DataLoader(dataset['test'], batch_size=4)

time: 2.33 ms (started: 2023-08-20 16:42:12 +00:00)


In [18]:

batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])
time: 41.7 ms (started: 2023-08-20 16:42:12 +00:00)


Let's verify an example, by decoding it back into text:



In [19]:
tokenizer.decode(batch['input_ids'][0])

'Summarize Python: def vs(*args, **kwargs):\n    """\n    exactly like v, but doesn\'t print variable names or file positions\n\n   .. seealso:: ss()\n    """\n    if not args:\n        raise ValueError("you didn\'t pass any arguments to print out")\n\n    with Reflect.context(args, **kwargs) as r:\n        instance = V_CLASS(r, stream, **kwargs)\n        instance.writeline(instance.value())<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoft

time: 2.99 ms (started: 2023-08-20 16:42:12 +00:00)


In [20]:
labels = batch['labels'][0]
tokenizer.decode([label for label in labels if label != -100])

"exactly like v, but doesn't print variable names or file positions\n\n   .. seealso:: ss()<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|

time: 9.03 ms (started: 2023-08-20 16:42:12 +00:00)


In [21]:
from transformers import DataCollatorForSeq2Seq


data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id,
    pad_to_multiple_of=None
)


time: 373 µs (started: 2023-08-20 16:42:12 +00:00)


In [22]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

time: 21.6 ms (started: 2023-08-20 16:42:12 +00:00)


In [23]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="finetuned-codet5p-codexglue-python-docstring-generator",  # change to a repo name of your choice
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    warmup_steps=100,
    num_train_epochs = 25,
    gradient_checkpointing=False,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=4,
    save_steps=5000,
    eval_steps=5000,
    logging_steps=5000,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=True,
)

time: 50.9 ms (started: 2023-08-20 16:42:12 +00:00)


In [25]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

time: 279 ms (started: 2023-08-20 16:43:11 +00:00)


In [26]:
model.config

CodeT5pConfig {
  "_commit_hash": "48dd1d613db426104ce52c0bb2dfd19352173fac",
  "_name_or_path": "Salesforce/codet5p-2b",
  "architectures": [
    "CodeT5pEncoderDecoderModel"
  ],
  "auto_map": {
    "AutoConfig": "Salesforce/codet5p-2b--configuration_codet5p.CodeT5pConfig",
    "AutoModel": "Salesforce/codet5p-2b--modeling_codet5p.CodeGenModel",
    "AutoModelForSeq2SeqLM": "Salesforce/codet5p-2b--modeling_codet5p.CodeT5pEncoderDecoderModel"
  },
  "decoder": {
    "_name_or_path": "codet5p-2b-decoder",
    "activation_function": "gelu_new",
    "add_cross_attention": true,
    "architectures": [
      "CodeT5pForCausalLM"
    ],
    "attn_pdrop": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 1,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "embd_pdrop": 0.0,
    "encoder_no_repeat_ngram_size":

time: 4.25 ms (started: 2023-08-20 16:43:16 +00:00)


In [28]:
print("BOS token ID:", model.config.bos_token_id)
print("Decoder start token ID:", model.config.decoder_start_token_id)


BOS token ID: None
Decoder start token ID: None
time: 1.06 ms (started: 2023-08-20 16:43:24 +00:00)


In [29]:
print("Decoder start token ID:", model.config.decoder.decoder_start_token_id)


Decoder start token ID: None
time: 952 µs (started: 2023-08-20 16:43:26 +00:00)


In [30]:
print(tokenizer.special_tokens_map)


{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}
time: 502 µs (started: 2023-08-20 16:43:28 +00:00)


In [31]:
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: ignored

time: 310 ms (started: 2023-08-20 16:43:40 +00:00)
