In [1]:
!pip install -q datasets
!pip install -q evaluate
!pip install -q rouge_score

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m378.9/491.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━

In [1]:
import os
import re
import json
import torch
import evaluate
import numpy as np
from tqdm.notebook import tqdm
from datasets import load_dataset, Dataset
from transformers import (
    GPT2TokenizerFast,
    GPT2LMHeadModel,
    GPT2Config,
    EarlyStoppingCallback,
    TrainerCallback,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the codeparrot/apps from HuggingFace

In [2]:
dataset = load_dataset('codeparrot/apps')
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
        num_rows: 5000
    })
})

In [4]:
sample_idx = 0  # Change this index to view different samples
print(f"Problem ID: {dataset['train'][sample_idx]['problem_id']}")
print(f"Difficulty: {dataset['train'][sample_idx]['difficulty']}")
print(f"Problem Statement:\n{dataset['train'][sample_idx]['question']}")
print(f"Solution:\n{dataset['train'][sample_idx]['solutions']}")

Problem ID: 0
Difficulty: interview
Problem Statement:
Polycarp has $n$ different binary words. A word called binary if it contains only characters '0' and '1'. For example, these words are binary: "0001", "11", "0" and "0011100".

Polycarp wants to offer his set of $n$ binary words to play a game "words". In this game, players name words and each next word (starting from the second) must start with the last character of the previous word. The first word can be any. For example, these sequence of words can be named during the game: "0101", "1", "10", "00", "00001".

Word reversal is the operation of reversing the order of the characters. For example, the word "0111" after the reversal becomes "1110", the word "11010" after the reversal becomes "01011".

Probably, Polycarp has such a set of words that there is no way to put them in the order correspondent to the game rules. In this situation, he wants to reverse some words from his set so that:  the final set of $n$ words still contains

In [5]:
# Check the number of test cases per problem
test_case_counts = [len(item['input_output']) for item in tqdm(dataset['train']) if 'input_output' in item]
print(f"Min number of test cases: {min(test_case_counts) if test_case_counts else 'N/A'}")
print(f"Max number of test cases: {max(test_case_counts) if test_case_counts else 'N/A'}")
print(f"Average number of test cases: {np.mean(test_case_counts) if test_case_counts else 'N/A'}")

  0%|          | 0/5000 [00:00<?, ?it/s]

Min number of test cases: 0
Max number of test cases: 23613166
Average number of test cases: 5749.3512


In [None]:
no_test_cases = sum(1 for item in dataset['train'] if 'input_output' not in item or len(item['input_output']) == 0)
no_solutions = sum(1 for item in dataset['test'] if 'solutions' not in item or len(item['solutions']) == 0)
print(f'Problems without test cases in train split: {no_test_cases}')
print(f'Problems without solutions in test split: {no_solutions}')

Problems without test cases in train split: 195
Problems without solutions in test split: 1235


# Split and Clean the Data

In [3]:
def clean_code(code):
    code = re.sub(r'#\s*Time:.*|#\s*Space:.*|#\s*@author:.*|#\s*@date:.*', '', code) # Remove comments that don't add value
    if 'def ' in code or 'class ' in code: # Skip imports, handle only function/class definitions
        # Try to find the first function or class definition
        first_def = code.find('def ')
        first_class = code.find('class ')

        # Find the earliest occurrence of either def or class
        start_idx = min(x for x in [first_def, first_class] if x >= 0) if first_def >= 0 or first_class >= 0 else 0
        code = code[start_idx:]

    # Remove trailing whitespace and ensure consistent newlines
    code = '\n'.join(line.rstrip() for line in code.strip().splitlines())
    return code

In [4]:
# Split the train dataset into train and validation at the problem level to avoid leakage
# and create (question, solution) pairs, one per solution, for training and validation
train_val_split = dataset['train'].train_test_split(test_size=0.1, seed=42)
num_of_solutions = 1 # Number of solutions to take per question

train_data = [{'question': sample['question'], 'solution': clean_code(solution)}
    for sample in tqdm(train_val_split['train'])
    for solution in json.loads(sample['solutions'])[-num_of_solutions:]
]
val_data = [
    {'question': sample['question'], 'solution': clean_code(solution)}
    for sample in tqdm(train_val_split['test'])
    for solution in json.loads(sample['solutions'])[-num_of_solutions:]
]
test_data = [
    {'question': sample['question'], 'solution': clean_code(solution)}
    for sample in tqdm(dataset['test']) if sample['solutions']
    for solution in json.loads(sample['solutions'])[-num_of_solutions:]
]

  0%|          | 0/4500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [5]:
processed_data_path = 'processed_data'
os.makedirs(processed_data_path, exist_ok=True)

for split_name, split_data in zip(['train', 'val', 'test'], [train_data, val_data, test_data]):
    output_file = os.path.join(processed_data_path, f'{split_name}.json')
    with open(output_file, 'w') as f: # Save processed data splits to files
        json.dump(split_data, f, indent=2)
print(f'Extracted {len(train_data)} train, {len(val_data)} validation, and {len(test_data)} test examples')

Extracted 4500 train, 500 validation, and 3765 test examples


# Retrain the Tokenizer

In [6]:
# Extract questions and solutions from the train split to create a domain-specific corpus
questions = [sample['question'] for sample in dataset['train']]
solutions = [sol for sample in dataset['train'] for sol in json.loads(sample['solutions'])]

In [8]:
# Initialize a base tokenizer and train a new 1 on our corpus
base_tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
new_tokenizer = base_tokenizer.train_new_from_iterator(
    questions + solutions, # Combine natural language and code
    vocab_size = 50257,  # Match model's original vocab size for compatibility
    new_special_tokens = ['[CODE]']
)
new_tokenizer.pad_token = new_tokenizer.eos_token
new_tokenizer.save_pretrained('apps_tokenizer')  # Save the retrained tokenizer

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

('apps_tokenizer/tokenizer_config.json',
 'apps_tokenizer/special_tokens_map.json',
 'apps_tokenizer/vocab.json',
 'apps_tokenizer/merges.txt',
 'apps_tokenizer/added_tokens.json',
 'apps_tokenizer/tokenizer.json')

In [7]:
new_tokenizer = GPT2TokenizerFast.from_pretrained('apps_tokenizer')
test_input = 'def solve(nums):\n    return sum(nums)'
encoded = new_tokenizer.encode(test_input)
decoded = new_tokenizer.decode(encoded)
print(f'Testing tokenizer:\n'
      f'Original: {test_input}\n'
      f'Encoded: {encoded}\n'
      f'Decoded: {decoded}\n'
      f'Vocabulary size: {new_tokenizer.vocab_size}')

Testing tokenizer:
Original: def solve(nums):
    return sum(nums)
Encoded: [311, 1175, 9, 621, 283, 273, 296, 501, 9, 621, 10]
Decoded: def solve(nums):
    return sum(nums)
Vocabulary size: 50257


# Tokenization for Auto-regression Task

In [8]:
def tokenize_function(example): # tokenization function
    text = example['question'] + '\n[CODE]\n' + example['solution'] + new_tokenizer.eos_token
    inputs = new_tokenizer(text, truncation=True, padding='max_length', max_length=512)
    question = new_tokenizer(example['question'] + '\n[CODE]\n', truncation=True, max_length=512)

    # Create labels - we only want to compute loss on the solution part
    solution_start = len(question['input_ids'])  # Find where the solution starts in the encoded sequence
    labels = [-100] * solution_start + inputs['input_ids'][solution_start:] # Set labels to -100 for question part (ignored in loss calculation)
    inputs['labels'] = labels[:512]
    return inputs

In [9]:
train_dataset_processed = Dataset.from_list(train_data)
val_dataset_processed = Dataset.from_list(val_data)

tokenized_train_dataset = train_dataset_processed.map(tokenize_function, batched=False, remove_columns=train_dataset_processed.column_names)
tokenized_val_dataset = val_dataset_processed.map(tokenize_function, batched=False, remove_columns=val_dataset_processed.column_names)
tokenized_val_dataset # Dynamic padding will be handled by DataCollator

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})

In [None]:
# labels = np.array(tokenized_val_dataset[3]['labels'])
# labels = np.where(labels != -100, labels, new_tokenizer.pad_token_id)
# print(new_tokenizer.decode(labels))

# Metrics

In [10]:
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')
meteor = evaluate.load('meteor')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
def preprocess_logits_for_metrics(logits, labels):
    '''
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/15
    '''
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [None]:
def compute_metrics(eval_preds):
    preds = eval_preds.predictions[0]
    labels = eval_preds.label_ids
    preds = np.where(preds != -100, preds, new_tokenizer.pad_token_id) # Replace -100 with pad token id
    labels = np.where(labels != -100, labels, new_tokenizer.pad_token_id) # Replace -100 with pad token id

    # Decode predictions and labels
    decoded_preds = new_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = new_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU, ROUGE, and exact match score
    bleu_results = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    meteor_results = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    # exact_match = sum(pred == label for pred, label in zip(decoded_preds, decoded_labels)) / len(decoded_preds)

    return {
        'bleu': bleu_results['bleu'],
        'rouge1': rouge_results['rouge1'],
        'rouge2': rouge_results['rouge2'],
        'rougeL': rouge_results['rougeL'],
        'meteor': meteor_results['meteor'],
        # 'exact_match': exact_match,
    }

In [14]:
class PerplexityCallback(TrainerCallback): # Define callback to compute perplexity from eval_loss
    def on_evaluate(self, args, state, control, metrics, **kwargs):
        if 'eval_loss' in metrics:
            perplexity = torch.exp(torch.tensor(metrics['eval_loss']))
            metrics['perplexity'] = perplexity.item()

# Training Setup

In [15]:
# config = GPT2Config.from_pretrained(
#     'gpt2',
#     vocab_size=new_tokenizer.vocab_size,
#     n_positions=1024,
#     # n_embd=768,
#     # n_layer=12,
#     # n_head=12
# )
# model = GPT2LMHeadModel(config) # Initialize a new model with this configuration
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(new_tokenizer))
total_params = sum(p.numel() for p in model.parameters())
print(f'Model initialized with {total_params / 1e6:.2f}M parameters')

Model initialized with 124.44M parameters


In [16]:
training_args = TrainingArguments(       # Define training arguments for fine-tuning
    output_dir='./results',              # Directory for checkpoints and logs
    num_train_epochs=20,                 #
    per_device_train_batch_size=32,      # Batch size per GPU
    per_device_eval_batch_size=32,       # Evaluation batch size
    learning_rate=2e-4,                  #
    weight_decay=0.01,                   # Regularization
    logging_strategy='epoch',            #
    eval_strategy='epoch',               # Evaluate after each epoch
    save_strategy='epoch',               # Save after each epoch
    load_best_model_at_end=True,         # Load the best model based on validation loss
    metric_for_best_model='eval_loss',   # Use validation loss for early stopping
    greater_is_better=False,             # Lower loss is better
    fp16=torch.cuda.is_available(),      # Enable mixed-precision training if a CUDA GPU is available (faster, less memory)
)

# Fine-tune the Model

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=new_tokenizer, mlm=False),
    processing_class=new_tokenizer,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
    callbacks=[PerplexityCallback, EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()  # Perform the fine-tuning
trainer.save_model('trained_model')  # Save the fine-tuned model

[34m[1mwandb[0m: Currently logged in as: [33m18520339[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss,Bleu,Rouge1,Rouge2,Rougel,Meteor,Unnamed: 8
1,4.9543,3.976755,0.273161,0.563107,0.226466,0.384646,0.387706,53.343636
2,3.6244,3.540423,0.291264,0.578471,0.241537,0.403468,0.413555,34.481499
3,3.2403,3.340668,0.284745,0.587121,0.247554,0.409686,0.424755,28.237995
4,3.0103,3.222893,0.280323,0.590837,0.25118,0.41702,0.428779,25.100643
5,2.8396,3.159079,0.273932,0.590115,0.252366,0.418318,0.430826,23.548899
6,2.7058,3.104282,0.275269,0.598313,0.25904,0.425082,0.433934,22.2932
7,2.5939,3.070092,0.280922,0.600739,0.257454,0.426773,0.439801,21.543888
8,2.4955,3.058115,0.281441,0.603888,0.259667,0.430511,0.440719,21.287403
9,2.409,3.047671,0.266718,0.597383,0.257909,0.428171,0.435731,21.066221
10,2.3349,3.047067,0.264642,0.60022,0.258386,0.428677,0.439291,21.053507


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


# Evaluation on Test Set

In [19]:
test_dataset_processed = Dataset.from_list(test_data)
tokenized_test_dataset = test_dataset_processed.map(tokenize_function, batched=False, remove_columns=test_dataset_processed.column_names)
trainer.evaluate(tokenized_test_dataset, metric_key_prefix='test')

Map:   0%|          | 0/3765 [00:00<?, ? examples/s]

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


{'test_loss': 2.6213130950927734,
 'test_bleu': 0.33308415419639403,
 'test_rouge1': 0.6623177856210263,
 'test_rouge2': 0.3221555979845476,
 'test_rougeL': 0.4829885926971944,
 'test_meteor': 0.5058737893190384,
 'test_runtime': 267.1236,
 'test_samples_per_second': 14.095,
 'test_steps_per_second': 0.442,
 'epoch': 14.0}