<a href="https://colab.research.google.com/github/DHIVYASRI-D/Comparing-Transformer-Models-for-Token-Based-Code-Completion-in-Python/blob/main/3_codet5_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SETUP & INSTALL


In [None]:
!pip install transformers datasets evaluate accelerate

# IMPORT LIBRARIES

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import math

# LOAD DATA

In [None]:
!pip install -U datasets

In [None]:
# Load the Python configuration of CodeSearchNet, then keep only the first
# 1000 training rows and the first 200 validation rows.
dataset = load_dataset("code_search_net", "python")
train_data, val_data = (
    dataset[split_name].select(range(row_limit))
    for split_name, row_limit in (("train", 1000), ("validation", 200))
)

# Load tokenizer and model

In [None]:
# One checkpoint name drives both loaders: the tokenizer is fetched first,
# then the seq2seq model.
model_checkpoint = "Salesforce/codet5-base"
tokenizer, model = (
    loader.from_pretrained(model_checkpoint)
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

# Preprocess function (token-level shifting)

In [None]:
def preprocess(example):
    """Tokenize one function and build next-token inputs and labels.

    The source text in ``example["func_code_string"]`` is tokenized with the
    module-level ``tokenizer`` to exactly 128 ids (truncated or padded).

    * ``input_ids`` keeps every token in place except the final slot, which
      is overwritten with the pad id.
    * ``labels[i]`` is the token that followed ``input_ids[i]`` in the
      original sequence; pad positions become ``-100`` so they are excluded
      when the loss and accuracy are computed.
    * ``attention_mask`` is the tokenizer's mask with the final slot cleared,
      so it agrees with the pad id written there.

    Args:
        example: A dataset row containing a ``"func_code_string"`` entry.

    Returns:
        The tokenizer output with ``input_ids`` and ``attention_mask``
        replaced as described and a ``labels`` entry added.
    """
    pad_id = tokenizer.pad_token_id
    tokens = tokenizer(
        example["func_code_string"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    original_ids = tokens["input_ids"]
    inputs = original_ids[:-1] + [pad_id]

    # Shift left by one so position i is trained to predict token i + 1.
    shifted = original_ids[1:] + [pad_id]
    labels = [tok if tok != pad_id else -100 for tok in shifted]

    # The last input slot now always holds padding. When the code filled all
    # 128 positions the tokenizer's mask still flags that slot as a real
    # token, so clear it to keep mask and ids consistent.
    attention_mask = list(tokens["attention_mask"])
    attention_mask[-1] = 0

    tokens["input_ids"] = inputs
    tokens["attention_mask"] = attention_mask
    tokens["labels"] = labels
    return tokens

# Tokenize datasets

In [None]:
# Run `preprocess` over each split and drop the split's original columns, so
# only the fields returned by `preprocess` remain.
tokenized_train, tokenized_val = (
    split.map(preprocess, remove_columns=split.column_names)
    for split in (train_data, val_data)
)

# Data collator for padding

In [None]:
# Batch builder handed to the trainer; it receives both the tokenizer and the
# model instance.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

# Training setup

In [None]:
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (and later dropped the old spelling). The notebook installs transformers
# unpinned, so pick whichever keyword the installed version accepts.
_training_arg_names = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# One epoch of fine-tuning; evaluate and checkpoint at the end of each epoch.
training_args = Seq2SeqTrainingArguments(
    output_dir="./codet5-base-results",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    predict_with_generate=False,
    report_to="none",
    **{_eval_strategy_key: "epoch"},
)

# Trainer

In [None]:
import inspect

# Recent transformers releases deprecate the `tokenizer` argument of the
# trainer in favour of `processing_class`. Use the new name when the installed
# version exposes it and fall back to `tokenizer` otherwise.
_trainer_param_names = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_param_names else "tokenizer"
)

# Wire the model, training arguments, tokenized splits and collator together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

# Train

In [None]:
# Start fine-tuning. As the last expression of the cell, the value returned by
# `train()` is what the notebook displays.
trainer.train()

# Evaluation function

In [None]:
def evaluate_codeT5(model, tokenizer, eval_dataset):
    """Compute teacher-forced token accuracy and perplexity over a dataset.

    Each example is run through the model on its own. Accuracy counts argmax
    predictions that equal the label at positions whose label is not ``-100``.
    Perplexity is ``exp`` of the average loss per scored token across the
    whole dataset.

    Args:
        model: A ``torch.nn.Module`` called as
            ``model(input_ids=..., labels=...)`` (plus ``attention_mask=...``
            when the example provides one) whose output exposes ``logits``
            and ``loss``. The loss is assumed to be the mean over
            non-ignored label positions, as Hugging Face seq2seq models
            report it.
        tokenizer: Unused; kept so existing call sites keep working.
        eval_dataset: Iterable of examples, each with ``"input_ids"`` and
            ``"labels"`` lists and optionally ``"attention_mask"``.

    Returns:
        ``(accuracy, perplexity)``. When no label position is scored (empty
        dataset or every label is ``-100``) the result is
        ``(0.0, float("nan"))``.
    """
    model.eval()

    # Inputs must live on the same device as the model; after training on a
    # GPU the model is no longer on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    correct = 0
    total = 0
    total_loss = 0.0

    for example in eval_dataset:
        labels = torch.tensor(example["labels"], device=device).unsqueeze(0)
        mask = labels != -100
        n_tokens = int(mask.sum().item())
        if n_tokens == 0:
            # Nothing to score; the loss would be NaN and poison the total.
            continue

        model_inputs = {
            "input_ids": torch.tensor(example["input_ids"], device=device).unsqueeze(0),
            "labels": labels,
        }
        if "attention_mask" in example:
            # Keep padding out of attention, matching what training batches do.
            model_inputs["attention_mask"] = torch.tensor(
                example["attention_mask"], device=device
            ).unsqueeze(0)

        with torch.no_grad():
            outputs = model(**model_inputs)

        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += int(((predictions == labels) & mask).sum().item())
        total += n_tokens
        # Undo the per-example mean so long and short examples are weighted
        # by how many tokens they actually contribute.
        total_loss += outputs.loss.item() * n_tokens

    if total == 0:
        return 0.0, float("nan")

    accuracy = correct / total
    perplexity = math.exp(total_loss / total)
    return accuracy, perplexity

# Evaluate

In [None]:
# Score the current model on the tokenized validation split and report both
# metrics, one per line, with four decimal places.
accuracy, perplexity = evaluate_codeT5(model, tokenizer, tokenized_val)
print(
    f"Accuracy: {accuracy:.4f}",
    f"Perplexity: {perplexity:.4f}",
    sep="\n",
)

In [None]:
# Local folder that receives the fine-tuned artifacts.
save_dir = "./codeT5-base-finetuned"

# Write the model first, then the tokenizer, into that same folder.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Confirm where the files were written.
print(f"Model and tokenizer saved to {save_dir}")


In [None]:
# Mount Google Drive at /content/drive (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Second copy of the artifacts, this time under the mounted Drive folder.
save_dir = "/content/drive/MyDrive/fine-tuned-models/codeT5-base-finetuned"

# Model first, tokenizer second, both into the Drive folder.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Model and tokenizer saved permanently to {save_dir}")


# UI

In [None]:
!pip install gradio

In [None]:
import gradio as gr
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch

# Reload the fine-tuned checkpoint from Drive: tokenizer first, then the
# model, which is moved to the CPU and switched to eval mode.
model_dir = "/content/drive/MyDrive/fine-tuned-models/codeT5-base-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
model = model.to("cpu")
model = model.eval()

# Prediction function
def predict_codet5(input_code, max_tokens):
    """Generate a completion for a partial Python function using beam search.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        input_code: Partial source code; leading and trailing whitespace is
            stripped before it is placed in the prompt.
        max_tokens: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to text with special tokens
        removed.
    """
    prompt = f"# Python 3\n# Complete the following function:\n{input_code.strip()}\n"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_tokens,
        num_beams=5,
        early_stopping=True,
    )
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Without this return the function yields None and the UI callback that
    # calls it has nothing to show in its output box.
    return prediction



# Gradio UI: a code textbox and a max-token slider feed `predict_codet5`, and
# its return value is written to the output textbox when "Generate" is clicked.
with gr.Blocks() as demo:
    gr.Markdown("## Code Completion with Fine-Tuned CodeT5")
    code_input = gr.Textbox(label="Enter partial Python function", lines=5, placeholder="e.g., def add(a, b):")
    # Slider value is passed to `predict_codet5` as `max_tokens` (32-128, default 64).
    token_slider = gr.Slider(minimum=32, maximum=128, value=64, step=1, label="Max Tokens to Generate")
    output = gr.Textbox(label="Predicted Completion")

    btn = gr.Button("Generate")
    # Inputs are passed positionally: (code_input, token_slider) -> (input_code, max_tokens).
    btn.click(fn=predict_codet5, inputs=[code_input, token_slider], outputs=output)

# Start the app.
demo.launch()


# Checking: baseline with the raw (not fine-tuned) CodeT5-base model

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
import torch

# Baseline setup: fetch the published "Salesforce/codet5-base" tokenizer and
# model from Hugging Face, then move the model to the CPU in eval mode.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-base")
model = model.to("cpu")
model = model.eval()


In [None]:
def predict_codet5(input_code, max_tokens):
    """Sample a single completion for ``input_code`` from the loaded model.

    The stripped input is prefixed with ``"code completion: "``, tokenized
    with the module-level ``tokenizer`` and passed to ``model.generate`` with
    sampling enabled (top-k 50, top-p 0.95, temperature 0.7).

    Args:
        input_code: Partial source code to complete.
        max_tokens: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first returned sequence decoded to text with special tokens
        removed.
    """
    encoded = tokenizer(
        f"code completion: {input_code.strip()}",
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Sampling configuration, kept together so the decoding strategy is easy
    # to read at a glance.
    sampling_options = {
        "do_sample": True,
        "top_k": 50,
        "top_p": 0.95,
        "temperature": 0.7,
        "num_return_sequences": 1,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
    }

    generated = model.generate(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=max_tokens,
        **sampling_options,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)


In [None]:
# Gradio UI for the raw checkpoint: the code textbox and max-token slider feed
# `predict_codet5`, whose return value fills the output textbox on click.
with gr.Blocks() as demo:
    gr.Markdown("## Code Completion with Raw CodeT5-Base")
    code_input = gr.Textbox(label="Enter partial Python code", lines=5)
    # Slider value is passed to `predict_codet5` as `max_tokens` (16-128, default 64).
    token_slider = gr.Slider(minimum=16, maximum=128, value=64, step=1, label="Max Tokens")
    output = gr.Textbox(label="Predicted Completion")

    btn = gr.Button("Generate")
    # Inputs are passed positionally: (code_input, token_slider) -> (input_code, max_tokens).
    btn.click(fn=predict_codet5, inputs=[code_input, token_slider], outputs=output)

# Start the app.
demo.launch()
