<a href="https://colab.research.google.com/github/AaravAsthana/AI-Based-Code-Auto-Completion-Bug-Detection/blob/main/Code_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AI based auto complete for coding

### Setup and Istallation

In [7]:
!pip install -q transformers datasets torch scikit-learn tqdm accelerate bitsandbytes peft


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
import torch
import transformers
import datasets

print("Torch version:", torch.__version__)
print("Transformers version:", transformers.__version__)
print("Datasets version:", datasets.__version__)

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)



Torch version: 2.5.1+cu121
Transformers version: 4.50.0
Datasets version: 3.5.0
Using device: cuda


### Data Preparation

In [9]:
# Import required modules (libraries already installed in previous steps)
from datasets import load_dataset
from transformers import AutoTokenizer

# 1. Load the CodeSearchNet dataset (Python subset)
# Note: CodeSearchNet provides several fields; here we focus on the code itself.
dataset = load_dataset("code_search_net", "python")

# (Optional) Add a dummy label if your task requires one and the dataset doesn't provide it.
# For instance, for classification tasks like bug detection, you might need labels.
def add_dummy_label(example):
    example["label"] = 0  # Dummy value; replace with actual labels if available.
    return example

dataset = dataset.map(add_dummy_label)

# 2. Initialize the CodeBERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# 3. Define a preprocessing function to tokenize the code.
def preprocess_function(examples):
    # 'func_code_string' is the key containing the code in the dataset.
    return tokenizer(
        examples["func_code_string"],
        padding="max_length",       # Pad all sequences to the model's max length.
        truncation=True,            # Truncate sequences longer than max_length.
        max_length=256              # Adjust max_length as needed.
    )

# 4. Apply the preprocessing function to the entire dataset.
# The 'batched=True' argument processes multiple examples at once.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 5. Set the dataset format to PyTorch tensors for training.
# We include the 'label' column along with tokenizer outputs.
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

# 6. (Optional) Inspect a sample to verify the data format.
print(tokenized_datasets["train"][0])


Map:   0%|          | 0/412178 [00:00<?, ? examples/s]

Map:   0%|          | 0/22176 [00:00<?, ? examples/s]

Map:   0%|          | 0/23107 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Map:   0%|          | 0/412178 [00:00<?, ? examples/s]

Map:   0%|          | 0/22176 [00:00<?, ? examples/s]

Map:   0%|          | 0/23107 [00:00<?, ? examples/s]

{'label': tensor(0), 'input_ids': tensor([    0,  9232,   992,   119,  1343,  1215, 42005,  1640, 13367,  3256,
        50118,  1437,  1437,  1437,  1437,  1437,  1437,  1437,   128, 17809,
        50118,  1437,  1437,  1437,  1437,  1437,  1437,  1437, 14910,  1588,
         1001, 19348,   154,  1002,    13,     5,   992,   119,  1343, 21021,
         2187, 50118,  1437,  1437,  1437,  1437,  1437,  1437,  1437,   128,
        17809, 50118,  1437,  1437,  1437,  1437,  1437,  1437,  1437,  1403,
            4, 30529, 48672,  1215, 13033,  1536, 43048, 50118,  1437,  1437,
         1437,  1437,  1437,  1437,  1437,  6740,     4, 49320,     4, 31931,
            4, 48696,  4892,  3894, 46398, 45803,   448, 21461,   254, 49382,
        27645, 50118,  1437,  1437,  1437,  1437,  1437,  1437,  1437,  1403,
            4, 46796,  5457,   992,   119,  1343,     4, 48522,  1640, 13367,
            4,  1517,  1872, 48759, 23589,  1215, 43935,    29,   108, 45587,
        50118,  1437,  1437,  

### Create Dataloaders

In [10]:
from torch.utils.data import DataLoader, random_split

# 1. Determine dataset sizes
total_size = len(tokenized_datasets["train"])
train_size = int(0.8 * total_size)  # 80% for training
val_size = total_size - train_size  # 20% for validation

# 2. Split the dataset into train and validation sets
train_dataset, val_dataset = random_split(tokenized_datasets["train"], [train_size, val_size])

# 3. Define batch size
batch_size = 16  # Adjust based on GPU memory

# 4. Create DataLoaders
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

# 5. Verify DataLoader
batch = next(iter(train_dataloader))
print({key: value.shape for key, value in batch.items()})  # Check batch structure


{'label': torch.Size([16]), 'input_ids': torch.Size([16, 256]), 'attention_mask': torch.Size([16, 256])}


### Model Setup

In [11]:
from transformers import AutoModelForSequenceClassification
import torch

# 1. Load the pre-trained CodeBERT model with a classification head (2 labels)
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=2)

# 2. Move the model to the GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 3. Optionally, print the model architecture to verify it's loaded correctly
print(model)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

### Training Configuration and Training Loop

In [1]:
import torch
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm  # For a progress bar

# Training parameters
num_epochs = 5
learning_rate = 5e-5

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Calculate total training steps (number of batches per epoch multiplied by number of epochs)
num_training_steps = len(train_dataloader) * num_epochs

# Set up a linear learning rate scheduler
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,  # No warm-up steps in this example
    num_training_steps=num_training_steps
)

# Ensure model is in training mode and moved to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# Define the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Initialize a progress bar for the training loop
progress_bar = tqdm(range(num_training_steps))

# Training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    total_loss = 0

    # Iterate over batches in the training DataLoader
    for batch in train_dataloader:
        # Move each batch to the device (GPU or CPU)
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass: Get model outputs (logits)
        outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])

        # Compute the loss between the outputs and the true labels
        loss = loss_fn(outputs.logits, batch["label"])
        total_loss += loss.item()

        # Backward pass: Compute gradients
        loss.backward()

        # Update model parameters
        optimizer.step()

        # Update the learning rate scheduler
        lr_scheduler.step()

        # Reset gradients for the next step
        optimizer.zero_grad()

        # Update the progress bar
        progress_bar.update(1)

    avg_loss = total_loss / len(train_dataloader)
    print(f"Average loss for epoch {epoch + 1}: {avg_loss:.4f}")

# Save the fine-tuned model for later use
torch.save(model.state_dict(), "codebert_finetuned.pth")
print("Training complete! Model saved as 'codebert_finetuned.pth'.")


NameError: name 'model' is not defined