# Training

## Installs and imports

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [25]:
from datasets import load_dataset, concatenate_datasets, Dataset
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from tqdm.auto import tqdm
import torch
import random
import re

## Dataset

For the training the model, I will use "codeparrot/apps" dataset of Python code. I am concatenating both "train" and "test" parts of the dataset in order to have more samples in actual train data. I will split the whole dataset with the 0.9 rate, so 9000 samples goes to train data and 1000 goes to the test data.

In [5]:
dataset_1 = load_dataset("codeparrot/apps", split="train", trust_remote_code=True)
dataset_2 = load_dataset("codeparrot/apps", split="test", trust_remote_code=True)

dataset = concatenate_datasets([dataset_1, dataset_2])

Let's ensure the dataset was downloaded correctly

In [6]:
dataset

Dataset({
    features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
    num_rows: 10000
})

In [7]:
len(dataset["solutions"])

10000

In [8]:
dataset[1000]['solutions']

'["# cook your dish here\\nfrom math import ceil\\nfor _ in range(int(input())):\\n    n=int(input())\\n    a=list(map(int,input().split()))\\n    ans=ceil(n/min(a))\\n    print(int(ans))", "from math import *\\nfor i in range(int(input())):\\n    n=int(input())\\n    arr=[int(i) for i in input().split()]\\n    print(ceil(n/min(arr)))", "import math\\nt=int(input())\\nwhile t:\\n    t=t-1\\n    n=int(input())\\n    a=list(map(int,input().split()))\\n    print(math.ceil(n/min(a)))", "# cook your dish here\\nfor _ in range(int(input())):\\n    n=int(input())\\n    l=list(map(int,input().split()))\\n    mi=min(l)\\n    ans=n/mi\\n    if(ans==int(ans)):\\n        print(int(ans))\\n    else:\\n        print(int(ans)+1)", "# cook your dish here\\nfor _ in range(int(input())):\\n    n=int(input())\\n    l=list(map(int,input().split()))\\n    mi=min(l)\\n    ans=n/mi\\n    if(ans==int(ans)):\\n        print(int(ans))\\n    else:\\n        print(int(ans)+1)", "# cook your dish here\\nfrom math 

In [16]:
dataset_new = {
    "source": list(),
    "target": list()
}

def replace_with_length(match):
    number_of_additional_spaces = random.randint(len(match.group(0))+1, len(match.group(0))+10)
    return " " * number_of_additional_spaces
11
# Here I corrupt the code in two ways: I use both space-removing and space-adding techniques in order the model can not only add the required spaces, but also remove the ambiguous ones
def remove_formatting(code):
    corruption_choice = random.randint(0, 2)
    if corruption_choice == 0:
        unformatted_code = re.sub(r'\s+', '', code)
    elif corruption_choice == 1:
        unformatted_code = re.sub(r'\s+', ' ', code)
    else:
        unformatted_code = re.sub(r'\s+', replace_with_length, code)
    return unformatted_code

def prepare_data_pairs(dataset_split):
    data_pairs = []
    for item in tqdm(dataset_split):
        formatted_code = item['solutions'][2:-3]
        if formatted_code is None:
            continue
        unformatted_code = remove_formatting(formatted_code)
        dataset_new["source"].append(unformatted_code)
        dataset_new["target"].append(formatted_code)
    return data_pairs

prepare_data_pairs(dataset)

dataset_n = Dataset.from_dict(dataset_new)
dataset_n

  0%|          | 0/10000 [00:00<?, ?it/s]

Dataset({
    features: ['source', 'target'],
    num_rows: 10000
})

Now I split a dataset with the rate of 0.9

In [17]:
dataset_n = dataset_n.train_test_split(test_size=0.1)
train_dataset = dataset_n['train']
val_dataset = dataset_n['test']

print(train_dataset, val_dataset, sep="\n")

Dataset({
    features: ['source', 'target'],
    num_rows: 9000
})
Dataset({
    features: ['source', 'target'],
    num_rows: 1000
})


## Model

I chose the T5 model for this task. I will use "t5-base" version of the model as it is pretty small and can be trained pretty fast

In [18]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Now I prepare the data to be input to the model

In [19]:
def tokenize_function(examples):
    inputs = tokenizer(
        examples['source'],
        max_length=512,
        truncation=True,
        padding='max_length',
    )

    labels = tokenizer(
        examples['target'],
        max_length=512,
        truncation=True,
        padding='max_length',
    )

    inputs['labels'] = labels['input_ids']

    return inputs

train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['source', 'target']
)

val_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['source', 'target']
)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

And let's ensure that *train_dataset* and *val_dataset* have the appropriate format for the model

In [21]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9000
})

In [22]:
val_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [23]:
train_dataset.set_format(type='torch')
val_dataset.set_format(type='torch')

Training the model

In [28]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1000,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    predict_with_generate=True,
    report_to=[],
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss,Validation Loss
500,0.151,0.084175
1000,0.0914,0.056812
1500,0.0727,0.046165
2000,0.0675,0.041728
2500,0.0596,0.037841
3000,0.0495,0.037009
3500,0.0449,0.034679
4000,0.0501,0.032658
4500,0.0474,0.032616
5000,0.0448,0.032201


('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/spiece.model',
 './trained_model/added_tokens.json')

## Saving the trained model

In [30]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [31]:
!cp trained_model.zip /content/drive/MyDrive/

In [29]:
from google.colab import files
import shutil

shutil.make_archive("trained_model", 'zip', "trained_model")
# files.download("trained_model.zip")

'/content/trained_model.zip'