In [None]:
## CHECK COLAB GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun May 11 02:21:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P0             69W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%pip install -q emoji
%pip install -q datasets
%pip install -q evaluate
%pip install -q rouge_score

In [None]:
import os
import re
import json
import emoji
import torch
import evaluate
import numpy as np
from tqdm.notebook import tqdm
from datasets import load_dataset, Dataset
from transformers import (
    GPT2TokenizerFast,
    GPT2LMHeadModel,
    GPT2Config,
    EarlyStoppingCallback,
    TrainerCallback,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the codeparrot/apps from HuggingFace

In [None]:
dataset = load_dataset('codeparrot/apps')
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
        num_rows: 5000
    })
})

In [None]:
from collections import Counter
Counter(dataset['train']['difficulty'])
Counter(dataset['test']['difficulty'])

Counter({'interview': 3000, 'competition': 1000, 'introductory': 1000})

In [None]:
sample_idx = 10  # Change this index to view different samples
print(f"Problem ID: {dataset['train'][sample_idx]['problem_id']}")
print(f"Difficulty: {dataset['train'][sample_idx]['difficulty']}")
print(f"Problem Statement:\n{dataset['train'][sample_idx]['question']}")
print(f"Solution:\n{dataset['train'][sample_idx]['solutions']}")

Problem ID: 10
Difficulty: interview
Problem Statement:
Given a permutation $p$ of length $n$, find its subsequence $s_1$, $s_2$, $\ldots$, $s_k$ of length at least $2$ such that:  $|s_1-s_2|+|s_2-s_3|+\ldots+|s_{k-1}-s_k|$ is as big as possible over all subsequences of $p$ with length at least $2$.  Among all such subsequences, choose the one whose length, $k$, is as small as possible. 

If multiple subsequences satisfy these conditions, you are allowed to find any of them.

A sequence $a$ is a subsequence of an array $b$ if $a$ can be obtained from $b$ by deleting some (possibly, zero or all) elements.

A permutation of length $n$ is an array of length $n$ in which every element from $1$ to $n$ occurs exactly once.


-----Input-----

The first line contains an integer $t$ ($1 \le t \le 2 \cdot 10^4$) — the number of test cases. The description of the test cases follows.

The first line of each test case contains an integer $n$ ($2 \le n \le 10^5$) — the length of the permutation $p$.

In [None]:
# Check the number of test cases per problem
test_case_counts = [len(item['input_output']) for item in tqdm(dataset['train']) if 'input_output' in item]
print(f"Min number of test cases: {min(test_case_counts) if test_case_counts else 'N/A'}")
print(f"Max number of test cases: {max(test_case_counts) if test_case_counts else 'N/A'}")
print(f"Average number of test cases: {np.mean(test_case_counts) if test_case_counts else 'N/A'}")

  0%|          | 0/5000 [00:00<?, ?it/s]

Min number of test cases: 0
Max number of test cases: 23613166
Average number of test cases: 5749.3512


In [None]:
no_test_cases = sum(1 for item in dataset['train'] if 'input_output' not in item or len(item['input_output']) == 0)
no_solutions = sum(1 for item in dataset['test'] if 'solutions' not in item or len(item['solutions']) == 0)
print(f'Problems without test cases in train split: {no_test_cases}')
print(f'Problems without solutions in test split: {no_solutions}')

Problems without test cases in train split: 195
Problems without solutions in test split: 1235


# Split and Clean the Data

In [None]:
def clean_question(question):
    question = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)', '', question) # Remove URL
    return question.strip() # Remove extra whitespace

def clean_solution(solution):
    return solution

In [None]:
# Split the train dataset into train and validation at the problem level to avoid leakage
# and create (question, solution) pairs, one per solution, for training and validation
train_val_split = dataset['train'].train_test_split(test_size=0.1, seed=42)
num_of_solutions = 1 # Number of solutions to take per question

train_data = [{'question': clean_question(sample['question']), 'solution': clean_solution(solution)}
    for sample in tqdm(train_val_split['train'])
    for solution in json.loads(sample['solutions'])[:num_of_solutions]
]
val_data = [
    {'question': clean_question(sample['question']), 'solution': clean_solution(solution)}
    for sample in tqdm(train_val_split['test'])
    for solution in json.loads(sample['solutions'])[:num_of_solutions]
]
test_data = [
    {'question': clean_question(sample['question']), 'solution': clean_solution(solution)}
    for sample in tqdm(dataset['test']) if sample['solutions']
    for solution in json.loads(sample['solutions'])[:num_of_solutions]
]

  0%|          | 0/4500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
processed_data_path = 'processed_data'
os.makedirs(processed_data_path, exist_ok=True)

for split_name, split_data in zip(['train', 'val', 'test'], [train_data, val_data, test_data]):
    output_file = os.path.join(processed_data_path, f'{split_name}.json')
    with open(output_file, 'w') as f: # Save processed data splits to files
        json.dump(split_data, f, indent=2)
print(f'Extracted {len(train_data)} train, {len(val_data)} validation, and {len(test_data)} test examples')

Extracted 4500 train, 500 validation, and 3765 test examples


In [None]:
# Initialize a base tokenizer and train a new 1 on our corpus
new_tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
new_tokenizer.pad_token = new_tokenizer.eos_token

# Retrain the Tokenizer (Optional)

In [None]:
# # Extract questions and solutions from the train split to create a domain-specific corpus
# questions = [sample['question'] for sample in dataset['train']]
# solutions = [sol for sample in dataset['train'] for sol in json.loads(sample['solutions'])]

In [None]:
# new_tokenizer = base_tokenizer.train_new_from_iterator(
#     questions + solutions, # Combine natural language and code
#     vocab_size = 50257,  # Match model's original vocab size for compatibility
#     new_special_tokens = ['[CODE]']
# )
# new_tokenizer.pad_token = new_tokenizer.eos_token
# new_tokenizer.save_pretrained('apps_tokenizer')  # Save the retrained tokenizer

In [None]:
# test_input = 'def solve(nums):\n    return sum(nums)'
# encoded = new_tokenizer.encode(test_input)
# decoded = new_tokenizer.decode(encoded)
# print(f'Testing tokenizer:\n'
#       f'Original: {test_input}\n'
#       f'Encoded: {encoded}\n'
#       f'Decoded: {decoded}\n'
#       f'Vocabulary size: {new_tokenizer.vocab_size}')

Testing tokenizer:
Original: def solve(nums):
    return sum(nums)
Encoded: [4299, 8494, 7, 77, 5700, 2599, 198, 220, 220, 220, 1441, 2160, 7, 77, 5700, 8]
Decoded: def solve(nums):
    return sum(nums)
Vocabulary size: 50257


# Tokenization for Auto-regression Task

In [None]:
def tokenize_function(example): # tokenization function
    text = example['question'][:500] + '\n[CODE]\n' + example['solution'][:500] + new_tokenizer.eos_token
    inputs = new_tokenizer(text, truncation=True, padding='max_length', max_length=1024)

    question = new_tokenizer(example['question'][:500] + '\n[CODE]\n', truncation=True, max_length=1024)
    # Create labels - we only want to compute loss on the solution part
    solution_start = len(question['input_ids'])  # Find where the solution starts in the encoded sequence
    labels = [-100] * solution_start + inputs['input_ids'][solution_start:] # Set labels to -100 for question part (ignored in loss calculation)
    inputs['labels'] = labels[:1024]
    return inputs

In [None]:
train_dataset_processed = Dataset.from_list(train_data)
val_dataset_processed = Dataset.from_list(val_data)

tokenized_train_dataset = train_dataset_processed.map(tokenize_function, batched=False, remove_columns=train_dataset_processed.column_names)
tokenized_val_dataset = val_dataset_processed.map(tokenize_function, batched=False, remove_columns=val_dataset_processed.column_names)
tokenized_val_dataset # Dynamic padding will be handled by DataCollator

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})

# Metrics

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')
meteor = evaluate.load('meteor')

def preprocess_logits_for_metrics(logits, labels):
    '''
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/15
    '''
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

def compute_metrics(eval_preds):
    preds = eval_preds.predictions[0]
    labels = eval_preds.label_ids
    preds = np.where(preds != -100, preds, new_tokenizer.pad_token_id) # Replace -100 with pad token id
    labels = np.where(labels != -100, labels, new_tokenizer.pad_token_id) # Replace -100 with pad token id

    # Decode predictions and labels
    decoded_preds = new_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = new_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU, ROUGE, and exact match score
    bleu_results = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    meteor_results = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    # exact_match = sum(pred == label for pred, label in zip(decoded_preds, decoded_labels)) / len(decoded_preds)

    return {
        'bleu': bleu_results['bleu'],
        'rouge1': rouge_results['rouge1'],
        'rouge2': rouge_results['rouge2'],
        'rougeL': rouge_results['rougeL'],
        'meteor': meteor_results['meteor'],
        # 'exact_match': exact_match,
    }

class PerplexityCallback(TrainerCallback): # Define callback to compute perplexity from eval_loss
    def on_evaluate(self, args, state, control, metrics, **kwargs):
        if 'eval_loss' in metrics:
            perplexity = torch.exp(torch.tensor(metrics['eval_loss']))
            metrics['perplexity'] = perplexity.item()

# Training Setup

In [None]:
# config = GPT2Config.from_pretrained(
#     'gpt2',
#     vocab_size=new_tokenizer.vocab_size,
#     n_positions=1024,
#     # n_embd=768,
#     # n_layer=12,
#     # n_head=12
# )
# model = GPT2LMHeadModel(config) # Initialize a new model with this configuration
model = GPT2LMHeadModel.from_pretrained('gpt2')
# model.resize_token_embeddings(len(new_tokenizer))
total_params = sum(p.numel() for p in model.parameters())
print(f'Model initialized with {total_params / 1e6:.2f}M parameters')

Model initialized with 124.44M parameters


In [None]:
training_args = TrainingArguments(       # Define training arguments for fine-tuning
    output_dir='./gpt-2_codeparrots',              # Directory for checkpoints and logs
    num_train_epochs=20,                 #
    per_device_train_batch_size=16,      # Batch size per GPU
    per_device_eval_batch_size=16,       # Evaluation batch size
    learning_rate=2e-4,                  #
    # lr_scheduler_type='cosine',
    weight_decay=0.01,                   # Regularization
    logging_strategy='epoch',            #
    eval_strategy='epoch',               # Evaluate after each epoch
    save_strategy='epoch',               # Save after each epoch
    load_best_model_at_end=True,         # Load the best model based on validation loss
    metric_for_best_model='eval_loss',   # Use validation loss for early stopping
    greater_is_better=False,             # Lower loss is better
    fp16=torch.cuda.is_available(),      # Enable mixed-precision training if a CUDA GPU is available (faster, less memory)
    gradient_accumulation_steps=2,
)

# Fine-tune the Model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=new_tokenizer, mlm=False),
    # preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    # compute_metrics=compute_metrics,
    callbacks=[PerplexityCallback, EarlyStoppingCallback(early_stopping_patience=5)]
)
trainer.train()  # Perform the fine-tuning
trainer.save_model('/content/drive/MyDrive/NLP_Code_Generation/gpt2-codeparrots')  # Save the fine-tuned model

[34m[1mwandb[0m: Currently logged in as: [33mhtkhang0966[0m ([33mhtkhang0966-the-university-of-technology-sydney[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss,Unnamed: 3
1,2.1332,1.840584,6.300216
2,1.7825,1.770677,5.874832
3,1.6426,1.739372,5.693764
4,1.5259,1.727403,5.626024
5,1.4348,1.731408,5.648603
6,1.3472,1.744355,5.722209
7,1.2639,1.770017,5.870955
8,1.1894,1.780013,5.929935
9,1.121,1.810094,6.111021


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
