In [1]:
import json

def load_and_preprocess_data(file_path):
    """Loads and preprocesses the CodeNet dataset for training."""
    
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)  # Load all data at once

    preprocessed_data = []

    for i, entry in enumerate(data):
        
            preprocessed_data.append(entry)

    return preprocessed_data

# Test with 10 samples
train_file = "/kaggle/input/code-net-python/train.jsonl"
train_data_sample = load_and_preprocess_data(train_file)

print(train_data_sample[0])




{'src_id': 'p00001_s631177546', 'src': ['from', 'sys', 'import', 'stdin', 'NEW_LINE', 'x', '=', '[', 'int', '(', 'input', '(', ')', ')', 'for', 'i', 'in', 'range', '(', '10', ')', ']', 'NEW_LINE', 'x', '.', 'reverse', '(', ')', 'NEW_LINE', 'for', 'i', 'in', 'range', '(', '3', ')', ':', 'NEW_LINE', 'INDENT', 'print', '(', 'i', ')', 'NEW_LINE', 'DEDENT'], 'src_verdict': 'Wrong Answer', 'tgt': ['from', 'sys', 'import', 'stdin', 'NEW_LINE', 'x', '=', '[', 'int', '(', 'input', '(', ')', ')', 'for', 'i', 'in', 'range', '(', '10', ')', ']', 'NEW_LINE', 'x', '.', 'sort', '(', 'reverse', '=', 'True', ')', 'NEW_LINE', 'for', 'i', 'in', 'range', '(', '3', ')', ':', 'NEW_LINE', 'INDENT', 'print', '(', 'x', '[', 'i', ']', ')', 'NEW_LINE', 'DEDENT'], 'tgt_id': 'p00001_s854661751'}


In [2]:
from datasets import Dataset



dataset = Dataset.from_list(train_data_sample)


In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")  # Supports longer sequences

# Limit dataset to 20,000 samples
subset_size = 50000 
dataset_subset = dataset.select(range(min(len(dataset), subset_size)))  # Avoid errors if dataset is smaller

def tokenize_function(example):
    input_text = [' '.join(src) for src in example['src']]
    target_text = [' '.join(tgt) for tgt in example['tgt']]

    model_inputs = tokenizer(input_text, truncation=True, max_length=768, padding="max_length")  
    labels = tokenizer(target_text, truncation=True, max_length=768, padding="max_length")  

    # Replace padding token ID in labels with -100 to ignore in loss calculation
    labels["input_ids"] = [
        -100 if token == tokenizer.pad_token_id else token for token in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs





# Tokenize only the subset
tokenized_dataset = dataset_subset.map(tokenize_function, batched=True)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
val_file = "/kaggle/input/code-net-python/valid.jsonl"
val_data_sample = load_and_preprocess_data(val_file)
val_dataset = Dataset.from_list(val_data_sample)
val_subdataset = val_dataset.select(range(min(len(val_dataset), subset_size))) 
val_tokenized_dataset = val_subdataset.map(tokenize_function,batched=True)




Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
!pip install -U bitsandbytes
!pip install -U accelerate
!pip install -U transformers




In [6]:
!pip install peft




In [8]:
import os
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, TaskType

# Initialize model, tokenizer, and training arguments
model_name = "Salesforce/codet5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model with 8-bit quantization
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config={"load_in_8bit": True},  
    device_map="auto"
)

# Attach adapters using PEFT (LoRA)
lora_config = LoraConfig(
    r=8,  
    lora_alpha=32,
    target_modules=["q", "v"],  
    task_type=TaskType.SEQ_2_SEQ_LM,  
)
model = get_peft_model(model, lora_config)

# Training arguments (Disable evaluation at every epoch)
training_args = TrainingArguments(
    output_dir="/kaggle/working/PYFIXAI_training",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    learning_rate=5e-5,
    evaluation_strategy="no",  
    save_strategy="epoch",  
    save_total_limit=3,
    logging_dir="/kaggle/working/logs",
    logging_steps=10,
    load_best_model_at_end=False,  
    report_to="none",
    run_name="PYFIXAI",
    fp16=True,
)

# Data collator for seq2seq tasks
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8,
)





In [9]:
# Initialize Trainer (No eval dataset at start)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,  
)

# Function to save model after each epoch
def save_model(epoch):
    save_path = os.path.join(training_args.output_dir, f"epoch_{epoch}")
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Model saved at {save_path}")

# Training loop (No validation until last epoch)
for epoch in range(training_args.num_train_epochs):
    print(f"Training epoch {epoch + 1}/{training_args.num_train_epochs}")
    trainer.train(resume_from_checkpoint=True if epoch > 0 else None)
    save_model(epoch + 1)

# Evaluate ONLY at the last epoch
print("Evaluating model on validation set after final epoch...")
trainer.eval_dataset = val_tokenized_dataset  # 
eval_results = trainer.evaluate()
print(f"Final Validation Loss: {eval_results['eval_loss']}")


No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Training epoch 1/2


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,5.6828
20,5.5668
30,4.4957
40,4.1709
50,3.3942
60,2.25
70,1.4512
80,1.112
90,0.9816
100,0.5395


Model saved at /kaggle/working/PYFIXAI_training/epoch_1
Training epoch 2/2


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Step,Training Loss


Model saved at /kaggle/working/PYFIXAI_training/epoch_2
Evaluating model on validation set after final epoch...


Final Validation Loss: 0.08106527477502823
