In [None]:
!pip install -q emoji
!pip install -q datasets
!pip install -q evaluate
!pip install -q rouge_score

In [None]:
import os
import re
import json
import emoji
import torch
import evaluate
import numpy as np
from tqdm.notebook import tqdm
from datasets import load_dataset, Dataset
from transformers import (
    BartTokenizerFast,
    BartForConditionalGeneration,
    BartConfig,
    EarlyStoppingCallback,
    TrainerCallback,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    RobertaTokenizer,
    T5ForConditionalGeneration
)
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the codeparrot/apps dataset from Hugging Face

In [None]:
# codeparrot/apps is loaded through a script hosted in the dataset repository.
# Opting in explicitly skips the interactive "[y/N]" confirmation, so the
# notebook can be executed top to bottom without manual input.
dataset = load_dataset('codeparrot/apps', trust_remote_code=True)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.63k [00:00<?, ?B/s]

apps.py:   0%|          | 0.00/4.95k [00:00<?, ?B/s]

The repository for codeparrot/apps contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/codeparrot/apps.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


train.jsonl:   0%|          | 0.00/107M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
        num_rows: 5000
    })
})

In [None]:
sample_idx = 428  # Change this index to view different samples
sample_row = dataset['train'][sample_idx]  # look the record up once, then print its fields
print(f"Problem ID: {sample_row['problem_id']}")
print(f"Difficulty: {sample_row['difficulty']}")
print(f"Problem Statement:\n{sample_row['question']}")
print(f"Solution:\n{sample_row['solutions']}")

Problem ID: 428
Difficulty: interview
Problem Statement:
We are given a 2-dimensional grid. "." is an empty cell, "#" is a wall, "@" is the starting point, ("a", "b", ...) are keys, and ("A", "B", ...) are locks.
We start at the starting point, and one move consists of walking one space in one of the 4 cardinal directions.  We cannot walk outside the grid, or walk into a wall.  If we walk over a key, we pick it up.  We can't walk over a lock unless we have the corresponding key.
For some 1 <= K <= 6, there is exactly one lowercase and one uppercase letter of the first K letters of the English alphabet in the grid.  This means that there is exactly one key for each lock, and one lock for each key; and also that the letters used to represent the keys and locks were chosen in the same order as the English alphabet.
Return the lowest number of moves to acquire all keys.  If it's impossible, return -1.
 

Example 1:
Input: ["@.a.#","###.#","b.A.B"]
Output: 8


Example 2:
Input: ["@..aA","..

# Split and Clean the Data

In [None]:
def clean_question(question):
    """Remove URLs from a problem statement and trim its ends.

    Args:
        question: Raw problem statement text.

    Returns:
        The text with every http/https URL deleted and leading/trailing
        whitespace stripped. Whitespace inside the text is left untouched.
    """
    url_pattern = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)'
    without_urls = re.sub(url_pattern, '', question)
    return without_urls.strip()

def clean_solution(solution):
    """Return the solution exactly as given; no cleaning is applied at present."""
    untouched = solution
    return untouched

In [None]:
## Smallest and largest number of reference solutions per training problem
# Each record stores its solutions as a JSON-encoded list, so decode before counting.
# min()/max() over the real counts replaces the old hard-coded 1000 sentinel,
# which would have capped the reported minimum.
solution_counts = [len(json.loads(sample['solutions'])) for sample in dataset['train']]
min_n_sols = min(solution_counts, default=0)  # default covers an empty split
max_n_sols = max(solution_counts, default=0)
print(min_n_sols, max_n_sols)

1 990


In [None]:
# Split the train dataset into train and validation at the problem level to avoid leakage
# and create (question, solution) pairs, one per solution, for training and validation
train_val_split = dataset['train'].train_test_split(test_size=0.1, seed=42)
max_num_of_solutions = 1 # Number of solutions to take per question


def build_qa_pairs(split, max_solutions):
    """Turn a dataset split into a flat list of question/solution records.

    Args:
        split: Iterable of records that carry a 'question' string and a
            'solutions' string holding a JSON-encoded list of solutions.
        max_solutions: Maximum number of solutions kept per problem.

    Returns:
        A list of {'question': ..., 'solution': ...} dicts, one per kept solution.
        Records whose 'solutions' field is empty are skipped, because an empty
        string cannot be decoded by json.loads.
    """
    pairs = []
    for sample in tqdm(split):
        if not sample['solutions']:
            continue
        question = clean_question(sample['question'])  # cleaned once per problem
        for solution in json.loads(sample['solutions'])[:max_solutions]:
            pairs.append({'question': question, 'solution': clean_solution(solution)})
    return pairs


train_data = build_qa_pairs(train_val_split['train'], max_num_of_solutions)
val_data = build_qa_pairs(train_val_split['test'], max_num_of_solutions)
test_data = build_qa_pairs(dataset['test'], max_num_of_solutions)

  0%|          | 0/4500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
processed_data_path = 'processed_data'
os.makedirs(processed_data_path, exist_ok=True)

# Write every processed split to <processed_data_path>/<split name>.json
splits_to_save = {'train': train_data, 'val': val_data, 'test': test_data}
for split_name, split_data in splits_to_save.items():
    output_file = os.path.join(processed_data_path, f'{split_name}.json')
    with open(output_file, 'w') as f:
        json.dump(split_data, f, indent=2)
print(f'Extracted {len(train_data)} train, {len(val_data)} validation, and {len(test_data)} test examples')

Extracted 4500 train, 500 validation, and 3765 test examples


In [None]:
## Tokenizer
# Load the tokenizer files published with the Salesforce/codet5-base checkpoint.
tokenizer_checkpoint = 'Salesforce/codet5-base'
new_tokenizer = RobertaTokenizer.from_pretrained(tokenizer_checkpoint)

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

'<pad>'

# Retrain the Tokenizer (Optional)

In [None]:
# # Extract questions and solutions from the train split to create a domain-specific corpus
# questions = [sample['question'] for sample in dataset['train']]
# solutions = [sol for sample in dataset['train'] for sol in json.loads(sample['solutions'])]

In [None]:
# # Initialize a base tokenizer and train a new one on our corpus
# base_tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')
# new_tokenizer = base_tokenizer.train_new_from_iterator(
#     questions + solutions, # Combine natural language and code
#     vocab_size = 50265  # Match model's original vocab size for compatibility
# )
# new_tokenizer.pad_token = new_tokenizer.eos_token
# new_tokenizer.save_pretrained('apps_tokenizer')  # Save the retrained tokenizer

In [None]:
# new_tokenizer = BartTokenizerFast.from_pretrained('apps_tokenizer')
# test_input = 'def solve(nums):\n    return sum(nums)'
# encoded = new_tokenizer.encode(test_input)
# decoded = new_tokenizer.decode(encoded)
# print(f'Testing tokenizer:\n'
#       f'Original: {test_input}\n'
#       f'Encoded: {encoded}\n'
#       f'Decoded: {decoded}\n'
#       f'Vocabulary size: {new_tokenizer.vocab_size}')

tokenizer pretrained

# Tokenization for Sequence-to-Sequence Task

In [None]:
def tokenize_function(example):
    """Tokenize a batch of question/solution pairs for sequence-to-sequence training.

    Args:
        example: Mapping with 'question' and 'solution' entries, each holding
            the texts of one batch.

    Returns:
        The tokenizer output for the questions, extended with a 'labels' entry
        holding the solution token ids. Both sides are truncated to 512 tokens,
        and any pad token id among the labels is replaced by -100 so the loss
        ignores it.
    """
    model_inputs = new_tokenizer(example['question'], truncation=True, max_length=512)
    target_encoding = new_tokenizer(example['solution'], truncation=True, max_length=512)

    pad_id = new_tokenizer.pad_token_id
    model_inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_encoding['input_ids']
    ]
    return model_inputs

In [None]:
train_dataset_processed = Dataset.from_list(train_data)
val_dataset_processed = Dataset.from_list(val_data)

tokenized_train_dataset = train_dataset_processed.map(tokenize_function, batched=True, remove_columns=train_dataset_processed.column_names)
tokenized_val_dataset = val_dataset_processed.map(tokenize_function, batched=True, remove_columns=val_dataset_processed.column_names)
tokenized_val_dataset # Dynamic padding will be handled by DataCollator

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})

# Metrics

In [None]:
# Text-overlap metrics from the `evaluate` library, loaded once and used by compute_metrics.
bleu, rouge, meteor = (evaluate.load(metric_name) for metric_name in ('bleu', 'rouge', 'meteor'))
# codebleu = evaluate.load("dvitel/codebleu")

def preprocess_logits_for_metrics(logits, labels):
    '''
    Reduce model outputs to token ids before they are collected for the metrics.

    Storing full logits for every evaluation batch can exhaust memory; see
    https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/15

    Args:
        logits: Model output tensor, or a tuple whose first element is that tensor.
        labels: Label tensor, passed through unchanged.

    Returns:
        (pred_ids, labels). A 3-D input is reduced with argmax over its last
        axis; any other input is returned as is.
    '''
    pred_ids = logits[0] if isinstance(logits, tuple) else logits
    if pred_ids.ndim == 3:
        pred_ids = pred_ids.argmax(dim=-1)
    return pred_ids, labels

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with BLEU, ROUGE and METEOR.

    Args:
        eval_preds: Object exposing `predictions` and `label_ids`.
            `predictions` may be a tuple whose first element holds the
            predictions (the shape produced by preprocess_logits_for_metrics),
            or the prediction array itself. 3-D input is treated as logits and
            reduced with argmax over the last axis.

    Returns:
        Dict with the keys 'bleu', 'rouge1', 'rouge2', 'rougeL' and 'meteor'.
    """
    preds = eval_preds.predictions
    # Unwrap only real tuples: indexing a bare array with [0] would silently
    # keep just the first row of the batch.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)
    labels = eval_preds.label_ids

    # -100 marks ignored positions and cannot be decoded, so map it to the pad id.
    pad_id = new_tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels
    decoded_preds = new_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = new_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU, ROUGE and METEOR (BLEU takes a list of references per prediction)
    bleu_results = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    meteor_results = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    # codebleu_results = codebleu.compute(predictions=decoded_preds, references=decoded_labels, lang="python")
    # exact_match = sum(pred == label for pred, label in zip(decoded_preds, decoded_labels)) / len(decoded_preds)

    return {
        'bleu': bleu_results['bleu'],
        'rouge1': rouge_results['rouge1'],
        'rouge2': rouge_results['rouge2'],
        'rougeL': rouge_results['rougeL'],
        'meteor': meteor_results['meteor'],
        # 'codebleu': codebleu_results['codebleu'],
        # 'exact_match': exact_match,
    }

class PerplexityCallback(TrainerCallback):
    """Trainer callback that adds perplexity, exp(eval_loss), to the evaluation metrics."""

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Insert 'eval_perplexity' into `metrics` when an 'eval_loss' entry is present.

        Args:
            args, state, control: Trainer objects supplied to the hook; unused here.
            metrics: Evaluation metrics dict, updated in place. Nothing happens
                when it is missing, empty, or has no 'eval_loss'.
        """
        if not metrics or 'eval_loss' not in metrics:
            return
        perplexity = torch.exp(torch.tensor(metrics['eval_loss']))
        # The key carries the same `eval_` prefix as the other evaluation metrics.
        # The notebook progress table builds its column titles from the text after
        # the first underscore, so an un-prefixed key shows up with a blank header.
        metrics['eval_perplexity'] = perplexity.item()

# Training Setup

In [None]:
# Load the pretrained CodeT5 checkpoint and report its total parameter count.
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')
total_params = sum(map(torch.numel, model.parameters()))
print(f'Model initialized with {total_params / 1e6:.2f}M parameters')

Model initialized with 222.88M parameters


In [None]:
# Mixed-precision training is enabled only when a CUDA GPU is available.
use_fp16 = torch.cuda.is_available()

training_args = Seq2SeqTrainingArguments(
    # Output location for checkpoints and logs
    output_dir='./codet5',

    # Optimisation
    num_train_epochs=20,
    learning_rate=2e-4,
    weight_decay=0.01,
    # lr_scheduler_type='cosine',

    # Batch sizes (per device)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # Log, evaluate and checkpoint once per epoch
    logging_strategy='epoch',
    eval_strategy='epoch',
    save_strategy='epoch',

    # Generation-based evaluation is switched off
    # predict_with_generate=True,
    # generation_max_length=512,

    # Restore the checkpoint with the lowest validation loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,

    fp16=use_fp16,
)

# Fine-tune the Model

In [None]:
# Collator that pads each batch dynamically
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer=new_tokenizer, model=model)

# Perplexity reporting plus early stopping with a patience of 3
trainer_callbacks = [PerplexityCallback, EarlyStoppingCallback(early_stopping_patience=3)]

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=seq2seq_collator,
    processing_class=new_tokenizer,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
    callbacks=trainer_callbacks,
)

# Fine-tune, then save the resulting model to 'trained_model'
trainer.train()
trainer.save_model('trained_model')

Epoch,Training Loss,Validation Loss,Bleu,Rouge1,Rouge2,Rougel,Meteor,Unnamed: 8
1,2.0458,1.635689,0.402132,0.675453,0.354127,0.571122,0.555073,5.132994
2,1.5344,1.485486,0.430373,0.683077,0.379391,0.587777,0.57198,4.417112
3,1.312,1.428971,0.445434,0.695919,0.398189,0.604416,0.594938,4.174402
4,1.1433,1.445505,0.446324,0.686798,0.394755,0.603386,0.587689,4.243996
5,1.0038,1.454895,0.451988,0.695587,0.40278,0.608974,0.596478,4.284033
6,0.8776,1.486456,0.447996,0.688596,0.397748,0.606372,0.592558,4.421397


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
