In [1]:
!pip install transformers datasets accelerate peft




In [2]:
from datasets import load_dataset

# Load the Python split of the CodeSearchNet dataset
dataset = load_dataset("CodeSearchNet", "python")

print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetNotFoundError: Dataset 'CodeSearchNet' doesn't exist on the Hub or cannot be accessed.

In [3]:
from datasets import load_dataset

# Use the correct, community-provided dataset path
dataset = load_dataset("Nan-Do/instructional_code-search-net-python")

print(dataset)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a9aba9cddbadc4(…):   0%|          | 0.00/173M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/418545 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['INSTRUCTION', 'RESPONSE', 'SOURCE'],
        num_rows: 418545
    })
})


In [9]:
# Assuming you have already loaded the dataset with:
# dataset = load_dataset("Nan-Do/instructional_code-search-net-python")

# Print the column names of the 'train' split
print(dataset['train'].column_names)

['INSTRUCTION', 'RESPONSE', 'SOURCE']


In [10]:
# Install the necessary libraries
!pip install transformers datasets accelerate peft

# ----------------------------------------------------------------------------------------------------------------------
# STEP 1: LOAD THE DATASET
# ----------------------------------------------------------------------------------------------------------------------

from datasets import load_dataset

print("Loading dataset...")
# Make sure this dataset name is correct as per your successful load
dataset = load_dataset("Nan-Do/instructional_code-search-net-python")

# ----------------------------------------------------------------------------------------------------------------------
# STEP 2: PREPROCESS THE DATA
# ----------------------------------------------------------------------------------------------------------------------

from transformers import AutoTokenizer

print("Loading tokenizer and preprocessing data...")
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-small")

max_input_length = 256
max_target_length = 128

def preprocess_examples(examples):
    # Use the correct column names for the instruction and response
    inputs = [f"summarize: {instruction}" for instruction in examples["INSTRUCTION"]]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["RESPONSE"], max_length=max_target_length, truncation=True, padding="max_length")

    labels["input_ids"] = [[(label if label != tokenizer.pad_token_id else -100) for label in labels_list] for labels_list in labels["input_ids"]]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# A more robust way to handle column removal
column_names = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess_examples,
    batched=True,
    remove_columns=column_names
)

# ----------------------------------------------------------------------------------------------------------------------
# STEP 3: FINE-TUNE WITH LoRA
# ----------------------------------------------------------------------------------------------------------------------

import torch
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

print("Loading model and configuring LoRA...")
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-small")

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "k", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

training_args = TrainingArguments(
    output_dir="./codet5-commenter",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-4,
    num_train_epochs=3,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_steps=100,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

print("Starting fine-tuning...")
trainer.train()

# Save the final fine-tuned model and tokenizer
trainer.save_model("codet5_commenter_final")
tokenizer.save_pretrained("codet5_commenter_final")

print("Fine-tuning complete. Model saved!")

Loading dataset...
Loading tokenizer and preprocessing data...


Map:   0%|          | 0/418545 [00:00<?, ? examples/s]



Loading model and configuring LoRA...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

trainable params: 442,368 || all params: 60,934,656 || trainable%: 0.7260


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [11]:
# The corrected code snippet
# You don't need to re-run the previous cells that loaded the model or data,
# as long as they are still in your Colab session's memory.

import torch
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

print("Loading model and configuring LoRA...")
# Load the pre-trained CodeT5 model
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-small")

# Define the LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "k", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Define the training arguments with the corrected keyword
training_args = TrainingArguments(
    output_dir="./codet5-commenter",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-4,
    num_train_epochs=3,
    save_strategy="epoch",
    eval_strategy="epoch",  # Corrected keyword
    logging_steps=100,
    report_to="none"
)

# Create the Hugging Face Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Start fine-tuning
print("Starting fine-tuning...")
trainer.train()

# Save the final fine-tuned model and tokenizer
trainer.save_model("codet5_commenter_final")
tokenizer.save_pretrained("codet5_commenter_final")

print("Fine-tuning complete. Model saved!")

Loading model and configuring LoRA...
trainable params: 442,368 || all params: 60,934,656 || trainable%: 0.7260


KeyError: 'validation'

In [12]:
# Assuming your dataset is loaded as 'dataset'
print(dataset.keys())


dict_keys(['train'])


In [13]:
# The corrected code to work with a dataset that only has a 'train' split

# ... (rest of the code for loading and preprocessing) ...

print("Loading model and configuring LoRA...")
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-small")

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "k", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

training_args = TrainingArguments(
    output_dir="./codet5-commenter",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-4,
    num_train_epochs=3,
    # Set both strategies to "no" to disable evaluation and saving
    eval_strategy="no",
    save_strategy="no",
    logging_steps=100,
    report_to="none"
)

# Create the Hugging Face Trainer without an eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

print("Starting fine-tuning...")
trainer.train()

# Save the final fine-tuned model and tokenizer
trainer.save_model("codet5_commenter_final")
tokenizer.save_pretrained("codet5_commenter_final")

print("Fine-tuning complete. Model saved!")

Loading model and configuring LoRA...
trainable params: 442,368 || all params: 60,934,656 || trainable%: 0.7260
Starting fine-tuning...


Step,Training Loss
100,4.9986
200,3.4525
300,3.1428
400,3.0901
500,2.9954
600,2.941
700,2.8914
800,2.8737
900,2.8247
1000,2.8003


KeyboardInterrupt: 

In [14]:
# Create a small, randomized subset of the training data
# We'll use 10,000 examples for a much faster training run.
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(10000))

# Create the Hugging Face Trainer with the smaller dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
)

In [15]:
training_args = TrainingArguments(
    output_dir="./codet5-commenter",
    per_device_train_batch_size=16, # Increased batch size
    per_device_eval_batch_size=16,
    learning_rate=3e-4,
    num_train_epochs=3,
    eval_strategy="no",
    save_strategy="no",
    logging_steps=100,
    report_to="none"
)

In [16]:
# The corrected code to work with a dataset that only has a 'train' split

# ... (rest of the code for loading and preprocessing) ...

print("Loading model and configuring LoRA...")
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-small")

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "k", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Adjusting training arguments for a faster run
training_args = TrainingArguments(
    output_dir="./codet5-commenter",
    # Using a larger batch size for better GPU utilization
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-4,
    num_train_epochs=3,
    eval_strategy="no",
    save_strategy="no",
    logging_steps=100,
    report_to="none"
)

# Selecting a smaller, randomized subset of the training data
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(10000))

# Create the Hugging Face Trainer with the smaller dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
)

print("Starting fine-tuning...")
trainer.train()

# Save the final fine-tuned model and tokenizer
trainer.save_model("codet5_commenter_final")
tokenizer.save_pretrained("codet5_commenter_final")

print("Fine-tuning complete. Model saved!")

Loading model and configuring LoRA...
trainable params: 442,368 || all params: 60,934,656 || trainable%: 0.7260
Starting fine-tuning...


Step,Training Loss
100,4.86
200,3.3223
300,3.1532
400,3.0331
500,2.944
600,2.8847
700,2.8337
800,2.8567
900,2.7562
1000,2.8006


Fine-tuning complete. Model saved!
