<a href="https://colab.research.google.com/github/DHIVYASRI-D/Comparing-Transformer-Models-for-Token-Based-Code-Completion-in-Python/blob/main/4_gpt_neo_125M.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 Install dependencies

In [None]:
!pip install transformers datasets evaluate


Install and import libraries

In [None]:
!pip install transformers datasets --quiet

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
import torch


Load the raw CodeSearchNet Python dataset and take a fixed subset (same subset for fair comparison)

In [None]:
# Load the "python" configuration of CodeSearchNet, then keep only the first
# 1000 training rows and the first 200 validation rows so runs stay small.
dataset = load_dataset("code_search_net", "python")
small_train_dataset, small_val_dataset = (
    dataset[split_name].select(range(row_count))
    for split_name, row_count in (("train", 1000), ("validation", 200))
)


 Load Tokenizer & Model

In [None]:
# Checkpoint shared by the tokenizer and the causal language model.
model_ckpt = "EleutherAI/gpt-neo-125M"
model = AutoModelForCausalLM.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Padding fix: reuse the end-of-sequence token as the pad token (presumably
# because this checkpoint's tokenizer defines none -- confirm), and record the
# matching id on the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id


Tokenization


In [None]:
def tokenize_function(examples):
    """Tokenize function source code and build causal-LM labels.

    Every sequence is truncated/padded to exactly 128 tokens. Labels mirror
    ``input_ids`` except at padded positions, which are set to ``-100`` so the
    cross-entropy loss ignores them. Without this mask the model would be
    trained to emit the pad token over the whole padded tail of each short
    sample.

    Args:
        examples: A mapping with a ``"func_code_string"`` entry holding either
            one source string or a list of them (the ``batched=True`` case).

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    tokens = tokenizer(
        examples["func_code_string"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]

    def _mask_padding(ids, mask):
        # Mask by attention mask rather than by token id: this notebook reuses
        # EOS as the pad token, so id-based masking could hide real EOS tokens.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one list of ids per example.
        tokens["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example: a flat list of ids.
        tokens["labels"] = _mask_padding(input_ids, attention_mask)
    return tokens


def _tokenize_split(split):
    """Apply ``tokenize_function`` in batches, dropping the split's raw columns."""
    return split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names,
    )


tokenized_train = _tokenize_split(small_train_dataset)
tokenized_val = _tokenize_split(small_val_dataset)


Set Format for PyTorch

In [None]:
# Switch both tokenized splits to the "torch" output format.
for tokenized_split in (tokenized_train, tokenized_val):
    tokenized_split.set_format("torch")


Training Arguments and Trainer

In [None]:
import inspect

# Recent transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old keyword. The notebook installs an unpinned
# transformers, so pick whichever name the installed version accepts instead
# of failing with a TypeError.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# One epoch with a per-device batch size of 2; evaluate and checkpoint at the
# end of each epoch, log every 100 steps, and disable external reporting.
training_args = TrainingArguments(
    output_dir="./gptneo-results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
    **{_eval_strategy_arg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)


 Train

In [None]:
# Fine-tune the model using the Trainer configured in the previous cell.
trainer.train()


Evaluation Code

In [None]:
import torch
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import math
from datasets import load_dataset

def evaluate_gptneo(model, tokenizer, dataset, max_samples=200, device=None):
    """Compute next-token accuracy and perplexity of a causal LM.

    Each sample's ``"func_code_string"`` is encoded (truncated to 128 tokens)
    and the model predicts token ``i + 1`` from tokens ``0..i``. Every
    next-token position is scored, including the last one.

    Args:
        model: Causal language model whose forward output exposes ``.logits``.
        tokenizer: Tokenizer providing ``encode(..., return_tensors="pt")``.
        dataset: Dataset supporting ``len()`` and ``select()``, whose rows
            contain a ``"func_code_string"`` field.
        max_samples: Upper bound on the number of rows evaluated. Datasets
            with fewer rows are evaluated in full.
        device: Device to run on. Defaults to ``"cuda"`` when available,
            otherwise ``"cpu"``.

    Returns:
        A ``(accuracy, perplexity)`` tuple. Accuracy is the fraction of
        correctly predicted next tokens. Perplexity is ``exp`` of the mean
        per-token negative log-likelihood over all scored tokens.

    Raises:
        ValueError: If no sample contributes at least one target token.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model.eval()
    model.to(device)

    # Sum (rather than average) the token losses of each sample so the final
    # perplexity is weighted by token count, not by sample count.
    loss_fn = CrossEntropyLoss(reduction="sum")
    correct = 0
    total = 0
    total_loss = 0.0

    # Clamp to the dataset size so small datasets do not raise in select().
    n_samples = min(len(dataset), max_samples)
    for item in tqdm(dataset.select(range(n_samples))):
        input_ids = tokenizer.encode(
            item["func_code_string"],
            return_tensors="pt",
            truncation=True,
            max_length=128,
        ).to(device)

        if input_ids.size(1) < 2:
            continue  # need at least one (context, target) pair

        # Feeding tokens[:-1] yields logits already aligned with tokens[1:];
        # no further shift is needed.
        inputs = input_ids[:, :-1]
        labels = input_ids[:, 1:]

        with torch.no_grad():
            logits = model(input_ids=inputs).logits

        loss = loss_fn(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        total_loss += loss.item()

        predictions = logits.argmax(dim=-1)
        correct += (predictions == labels).sum().item()
        total += labels.numel()

    if total == 0:
        raise ValueError("No evaluable samples: every input had fewer than 2 tokens.")

    accuracy = correct / total
    perplexity = math.exp(total_loss / total)
    return accuracy, perplexity

# Build the evaluation set: reload the train split, hold out 10% of it with a
# fixed seed, and keep the first 200 held-out rows.
# NOTE(review): this rebinds `dataset` and `small_val_dataset`, and the holdout
# is carved from the *train* split rather than the "validation" split used
# earlier -- confirm this is the intended evaluation data.
dataset = load_dataset(
    "code_search_net", name="python", split="train"
).train_test_split(test_size=0.1, seed=42)
small_val_dataset = dataset["test"].select(range(200))

# Score the fine-tuned model and report both metrics to four decimals.
accuracy, perplexity = evaluate_gptneo(model, tokenizer, small_val_dataset)
print(f"Accuracy: {accuracy:.4f}")
print(f"Perplexity: {perplexity:.4f}")


Save

In [None]:
# Write the model and the tokenizer to the same local directory so they can be
# loaded together later.
model.save_pretrained("/content/gptneo-125M-codecompletion")
tokenizer.save_pretrained("/content/gptneo-125M-codecompletion")


Save to Google Drive

In [None]:
# Mount Google Drive at /content/drive (Colab-only module).
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Destination folder under the mounted Drive; model and tokenizer are saved
# side by side in it.
save_path = "/content/drive/MyDrive/token-completion-models/gptneo-125M"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
