<a href="https://colab.research.google.com/github/DHIVYASRI-D/Comparing-Transformer-Models-for-Token-Based-Code-Completion-in-Python/blob/main/1_Token_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets evaluate
!pip install accelerate
!pip install bitsandbytes  # optional: if you later want to use 8-bit model loading



In [None]:
import transformers
import datasets
import evaluate
import accelerate

# Smoke test: this line is only reached if every import above succeeded.
print("All libraries loaded successfully.")


All libraries loaded successfully.


In [None]:
from datasets import load_dataset

# Load the "python" configuration of the "code_search_net" dataset.
dataset = load_dataset(path="code_search_net", name="python")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# **1. CodeGPT-small-py**

### Tokenizer

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the CodeGPT-small-py checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/CodeGPT-small-py")
# Reuse the end-of-sequence token as the padding token; the tokenization
# step below pads every example to a fixed length and needs a pad token.
tokenizer.pad_token = tokenizer.eos_token  # For GPT-like models


### Tokenization

In [None]:
def tokenize(example):
    """Tokenize one dataset row and attach causal-LM labels.

    The row's code tokens are joined with single spaces and encoded to a
    fixed length of 128 tokens (truncated or padded as needed).

    Args:
        example: A single dataset row holding a ``"func_code_tokens"``
            list of strings.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry: a copy of
        ``input_ids`` in which every padding position is replaced by -100.
    """
    code_line = " ".join(example["func_code_tokens"])
    encoded = tokenizer(code_line, truncation=True, max_length=128, padding="max_length")
    # The pad token is the EOS token, so padding cannot be recognised by its
    # id. Use the attention mask instead and mark padded positions with -100,
    # the label value the language-modeling loss ignores; otherwise the loss
    # would also be computed on padding.
    encoded["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


# Labels are produced inside `tokenize`, so a single pass over the data suffices.
tokenized_dataset = dataset.map(tokenize, batched=False)


### Model

In [None]:
from transformers import AutoModelForCausalLM

# Start from the pretrained CodeGPT-small-py weights.
model = AutoModelForCausalLM.from_pretrained("microsoft/CodeGPT-small-py")

# Size the embedding matrix to the tokenizer's vocabulary. The resize call is
# kept as the final expression so the resulting layer is shown as cell output.
codegpt_vocab_size = len(tokenizer)
model.resize_token_embeddings(codegpt_vocab_size)


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50002, 768)

### Training Setup

In [None]:

from transformers import TrainingArguments, Trainer

# Work on small slices: the first 1000 training rows and the first 200
# validation rows.
train_subset = tokenized_dataset["train"].select(range(1000))
eval_subset = tokenized_dataset["validation"].select(range(200))

training_args = TrainingArguments(
    # Output locations
    output_dir="./codegpt-small-results",
    logging_dir="./logs",
    # Schedule and batch sizes
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluate and save once per epoch; log every 100 steps
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    # No external experiment-tracking integration
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)

### Train

In [None]:
# Run fine-tuning; as the last expression, the returned TrainOutput is displayed as the cell result.
trainer.train()


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,1.2066,1.217278


TrainOutput(global_step=500, training_loss=1.450552734375, metrics={'train_runtime': 68.2908, 'train_samples_per_second': 14.643, 'train_steps_per_second': 7.322, 'total_flos': 65323008000000.0, 'train_loss': 1.450552734375, 'epoch': 1.0})

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import math

# Load the fine-tuned model from the Trainer's output directory and a fresh
# copy of the base tokenizer.
model_path = "./codegpt-small-results/checkpoint-500"  # adjust if needed
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("microsoft/CodeGPT-small-py")
# Re-apply the padding setup used during training so the reloaded tokenizer
# (and the copy saved to Drive afterwards) is configured the same way.
tokenizer.pad_token = tokenizer.eos_token

# Move model to correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def evaluate_accuracy_and_perplexity(model, tokenizer, dataset, max_samples=200):
    """Compute next-token accuracy and perplexity of a causal LM.

    Logits at position ``i`` are scored against the label at position
    ``i + 1`` (a causal LM predicts the *next* token). Positions whose label
    is -100, or that the row's attention mask marks as padding, are excluded
    from both metrics.

    Args:
        model: Causal language model. It is called as
            ``model(input_ids, attention_mask=...)`` and must return an
            object exposing ``logits`` with shape (batch, seq_len, vocab).
        tokenizer: Unused; kept so existing call sites keep working.
        dataset: Object supporting ``len()``, ``.select(indices)`` and
            iteration over dict rows. Each row needs ``"input_ids"`` and
            ``"labels"``; an optional ``"attention_mask"`` marks padding
            with 0.
        max_samples: Upper bound on the number of rows evaluated; clamped to
            the dataset size.

    Returns:
        ``(accuracy, perplexity)``. Accuracy is the fraction of scored tokens
        whose arg-max prediction is correct. Perplexity is ``exp`` of the
        mean negative log-likelihood per scored token. If no token is scored
        the result is ``(0, nan)``.
    """
    model.eval()

    # Run on whatever device the model already lives on instead of relying on
    # a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # Never ask for more rows than the dataset holds.
    num_samples = min(max_samples, len(dataset))

    correct = 0
    total = 0
    nll_sum = 0.0

    for sample in dataset.select(range(num_samples)):
        input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model_device)
        labels = torch.tensor(sample["labels"]).unsqueeze(0).to(model_device)

        attention_mask = sample.get("attention_mask")
        if attention_mask is not None:
            attention_mask = torch.tensor(attention_mask).unsqueeze(0).to(model_device)
            # Padding may share its id with a real token (e.g. pad == EOS),
            # so rely on the mask to drop it from the metrics.
            labels = labels.masked_fill(attention_mask == 0, -100)

        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits

        # Align predictions with targets: position i predicts token i + 1.
        shift_logits = logits[:, :-1, :]
        shift_labels = labels[:, 1:]
        mask = shift_labels != -100
        num_tokens = mask.sum().item()
        if num_tokens == 0:
            continue

        target_logits = shift_logits[mask].float()
        target_ids = shift_labels[mask]
        # Sum (not mean) so that the final average is per token, not per row.
        nll_sum += torch.nn.functional.cross_entropy(
            target_logits, target_ids, reduction="sum"
        ).item()
        correct += (target_logits.argmax(dim=-1) == target_ids).sum().item()
        total += num_tokens

    if total == 0:
        return 0, float("nan")
    return correct / total, math.exp(nll_sum / total)

# Score the reloaded checkpoint on the validation split; max_samples is left at the function's default of 200.
accuracy, perplexity = evaluate_accuracy_and_perplexity(model, tokenizer, tokenized_dataset["validation"])
print(f"Accuracy: {accuracy:.4f}")
print(f"Perplexity: {perplexity:.4f}")


Accuracy: 0.2515
Perplexity: 3.3812


In [None]:
from google.colab import drive

# Destination folder on Google Drive for the model and its tokenizer.
save_path = "/content/drive/MyDrive/token-completion-models/codegpt-small-py"

# Mount Drive first; the destination lives under the mount point.
drive.mount('/content/drive')

model.save_pretrained(save_path)
# Kept as the final expression so the tuple of written tokenizer files is
# shown as the cell output.
tokenizer.save_pretrained(save_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


('/content/drive/MyDrive/token-completion-models/codegpt-small-py/tokenizer_config.json',
 '/content/drive/MyDrive/token-completion-models/codegpt-small-py/special_tokens_map.json',
 '/content/drive/MyDrive/token-completion-models/codegpt-small-py/vocab.json',
 '/content/drive/MyDrive/token-completion-models/codegpt-small-py/merges.txt',
 '/content/drive/MyDrive/token-completion-models/codegpt-small-py/added_tokens.json',
 '/content/drive/MyDrive/token-completion-models/codegpt-small-py/tokenizer.json')