In [None]:
# Install dependencies
!pip install evaluate
!pip install nltk
!pip install peft
!pip install trl
!pip install sacrebleu
!pip install codebleu
!pip install tree-sitter-python

## Import Libraries

In [11]:
import pandas as pd
import re
import json
import torch
import evaluate
import nltk
from datasets import Dataset
from nltk.metrics.distance import edit_distance
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer,
    DataCollatorForSeq2Seq, BitsAndBytesConfig, pipeline
)
from peft import LoraConfig, get_peft_model, TaskType,prepare_model_for_kbit_training
from trl import SFTTrainer

## Prepare data for fine-tuning

In [15]:

# Load dataset (Ensure it has 'prompt' and 'response' columns)
df = pd.read_json("fine_tuning_dataset.jsonl",lines=True)  

# Select 100K samples for fine-tuning
df = df[['prompt', 'completion']].dropna().sample(n=10000, random_state=42)

# Convert to Hugging Face dataset format
dataset = Dataset.from_pandas(df)

# Train-Test Split (90% Train, 10% Test)
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

# Load tokenizer
model_name = "codellama/CodeLlama-7b-Instruct-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Prevents padding issues

# Formatting function (ensures structured input-output)
def format_data(example):
    return {
        "input_ids": tokenizer(example["prompt"], padding="max_length", truncation=True, max_length=256)["input_ids"],
        "attention_mask": tokenizer(example["prompt"], padding="max_length", truncation=True, max_length=256)["attention_mask"],
        "labels": tokenizer(example["completion"], padding="max_length", truncation=True, max_length=256)["input_ids"]
    }

# Apply tokenization & formatting
tokenized_dataset = dataset.map(format_data, batched=True)


Train size: 45000, Test size: 5000


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [16]:
train_dataset = train_dataset.map(format_data, remove_columns=["prompt", "completion"])
test_dataset = test_dataset.map(format_data, remove_columns=["prompt", "completion"])

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

## Set model config

In [13]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

# Load 4-bit quantized model
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else {"": "cpu"}

# ✅ Load 4-bit quantized model and explicitly assign device
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)

# ✅ Move model to CUDA if it's not properly assigned
if torch.cuda.is_available():
    model = model.to(torch.cuda.current_device())

# Prepare model for fine-tuning
model = prepare_model_for_kbit_training(model)

# Attach LoRA adapters
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Ensure LoRA is correctly attached


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 8,388,608 || all params: 6,746,935,296 || trainable%: 0.1243


## Train the model

In [None]:
from transformers import Trainer, DataCollatorForSeq2Seq

# Define data collator (Handles padding for efficiency)
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True)
training_args = TrainingArguments(
    output_dir="./codellama-7b-instruct-lora",
    per_device_train_batch_size=16,  # Keep batch small due to memory constraints
    gradient_accumulation_steps=4,  # Increase accumulation to simulate a bigger batch
    learning_rate=2e-4,  # Lower LR for stability
    warmup_steps=100,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=1000,  # Save less frequently to reduce overhead
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=1000,  # Evaluate less frequently to speed up training
    logging_dir="./logs",
    optim="adamw_bnb_8bit",  # Optimized for QLoRA
    fp16 = True,  # BF16 instead of FP16 for efficiency
    torch_compile=False,  # ✅ Disable for better speed
    report_to="none",
    remove_unused_columns=False,
    gradient_checkpointing=True,  # ✅ Speeds up training by reducing memory usage
)
# Initialize Trainer
import torch
if torch.cuda.get_device_capability(0)[0] >= 8:  # A100, H100, L40, etc.
    model.enable_input_require_grads()
    from accelerate import dispatch_model
    model = dispatch_model(model,device_map=device_map)
    print("✅ Flash Attention 2 Enabled!")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator)

# Start fine-tuning
trainer.train()

# Save the fine-tuned model
trainer.save_model("codellama-7b-instruct-lora-finetuned")


## Evaluating Fine-tuned Model

In [46]:
 # ✅ Frees up unused memory

# Load Fine-Tuned Model
MODEL_NAME = "codellama-7b-instruct-lora/checkpoint-2109"  # Update with actual model path
device_map ={"": "cpu"}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME,device_map=device_map)

# Load Test Dataset



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
df = pd.read_json("fine_tuning_dataset.jsonl",lines=True)  

# Select 100K samples for fine-tuning
df = df[['prompt', 'completion']].dropna().sample(n=20, random_state=42)

# Convert to Hugging Face dataset format
test_dataset = Dataset.from_pandas(df)

# Load Metrics


In [25]:
def calculate_perplexity(text):
    """Computes perplexity for a given text."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256).to("cpu")
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss.item()
    return math.exp(loss)

# Compute Perplexity on test set
perplexities = [calculate_perplexity(example["prompt"]) for example in test_dataset]
avg_perplexity = sum(perplexities) / len(perplexities)

# --------------------------
# 4. BLEU Score Calculation
# --------------------------
bleu_metric = evaluate.load("bleu")

def calculate_bleu(reference, prediction):
    """Computes BLEU score for a prediction against a reference."""
    if isinstance(reference, list):  # Convert tokenized text back to string
        reference = " ".join(reference)
    if isinstance(prediction, list):
        prediction = " ".join(prediction)

    return bleu_metric.compute(predictions=[prediction], references=[[reference]])["bleu"]

# --------------------------
# 5. Pass@k (Execution-Based Metric)
# --------------------------
def pass_at_k(completions, reference_outputs, k=3):
    """Computes Pass@k metric."""
    correct = 0
    for i in range(len(completions)):
        if reference_outputs[i] in completions[i][:k]:  # If the correct answer is in the top-k predictions
            correct += 1
    return correct / len(completions)

# --------------------------
# 6. Functional Testing (Unit Test Execution)
# --------------------------
def evaluate_execution(generated_code, expected_output):
    """Executes generated code and checks if output matches expected."""
    try:
        local_vars = {}
        exec(generated_code, {}, local_vars)  # Execute in isolated namespace
        return local_vars.get("output") == expected_output  # Assumes function stores result in `output`
    except Exception:
        return False  # Fail on exceptions

# --------------------------
# 7. Run Evaluation
# --------------------------
generated_outputs = []
bleu_scores = []
execution_results = []

for example in test_dataset:
    # Tokenize input
    inputs = tokenizer(example["prompt"], return_tensors="pt", truncation=True, max_length=256).to("cpu")
    
    # Generate CodeLlama Output
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)  # ✅ Use max_new_tokens

    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)  # ✅ Convert tokens to text
    generated_outputs.append(generated_code)

    # Compute BLEU Score
    bleu_score = calculate_bleu(example["completion"], generated_code)
    bleu_scores.append(bleu_score)

    # Functional Testing (Check execution correctness)
    execution_results.append(evaluate_execution(generated_code, example["completion"]))

# Compute Pass@3
pass_at_k_score = pass_at_k(generated_outputs, [ex["completion"] for ex in test_dataset], k=3)

# Compute Average BLEU & Execution Accuracy
avg_bleu = sum(bleu_scores) / len(bleu_scores)
execution_accuracy = sum(execution_results) / len(execution_results)

# --------------------------
# 8. Print Final Results
# --------------------------
print("\n--- Fine-Tuned CodeLlama Evaluation Results ---")
print(f"Perplexity (Lower is better): {avg_perplexity:.3f}")
print(f"BLEU Score (Higher is better): {avg_bleu:.3f}")
print(f"Pass@3 (Higher is better): {pass_at_k_score:.3f}")
print(f"Execution Accuracy (Higher is better): {execution_accuracy:.3f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o


--- Fine-Tuned CodeLlama Evaluation Results ---
Perplexity (Lower is better): 9427028.416
BLEU Score (Higher is better): 0.103
Pass@3 (Higher is better): 0.000
Execution Accuracy (Higher is better): 0.000


In [45]:
from transformers import AutoTokenizer

MODEL_PATH = "codellama-7b-instruct-lora/checkpoint-2109"
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")  # Base tokenizer
tokenizer.save_pretrained(MODEL_PATH)

print("✅ Tokenizer saved successfully!")


✅ Tokenizer saved successfully!


In [47]:
inputs = tokenizer("<QUESTION>what is dynamic programming</QUESTION>", return_tensors="pt", truncation=True, max_length=256).to("cpu")
    
    # Generate CodeLlama Output
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=256)  # ✅ Use max_new_tokens

generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
generated_code

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<QUESTION>what is dynamic programming</QUESTION>'