In [None]:
# ============================
# LIGHTWEIGHT GPT MODEL FINE-TUNING FOR API GENERATION
# ============================

!pip install -q transformers datasets accelerate evaluate
# Add this RIGHT AFTER the pip install, BEFORE any other imports
from google.colab import drive
drive.mount('/content/drive')
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import json
import numpy as np
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from datetime import datetime
import time


# ============================
# MODEL SELECTION - CHOOSE YOUR LIGHTWEIGHT MODEL
# ============================
# Catalogue of selectable checkpoints, one row per model:
#   (selection key, Hugging Face hub id, context length used here, description)
# NOTE(review): microsoft/codebert-base looks like an encoder-style checkpoint;
# confirm it behaves sensibly with AutoModelForCausalLM before selecting it.
_MODEL_TABLE = (
    # 82M parameters (smallest, fastest)
    ("distilgpt2", "distilgpt2", 512,
     "Smallest & fastest, good for quick experiments"),
    # 124M parameters
    ("gpt2", "gpt2", 1024,
     "Small GPT-2, balanced speed and performance"),
    # 355M parameters
    ("gpt2-medium", "gpt2-medium", 1024,
     "Medium GPT-2, better quality but slower"),
    # 110M parameters
    ("codeparrot-small", "codeparrot/codeparrot-small", 512,
     "Specialized for code, very lightweight"),
    # 125M parameters
    ("microsoft-codebert", "microsoft/codebert-base", 512,
     "Trained on code, good for API tasks"),
)

# Public lookup: selection key -> {"name", "context_length", "description"}.
MODEL_OPTIONS = {
    key: {"name": hub_id, "context_length": ctx_len, "description": blurb}
    for key, hub_id, ctx_len, blurb in _MODEL_TABLE
}

# SELECT YOUR MODEL HERE
SELECTED_MODEL = "distilgpt2"  # Must be one of the MODEL_OPTIONS keys
MODEL_CONFIG = MODEL_OPTIONS[SELECTED_MODEL]
MODEL_NAME, MAX_LENGTH = MODEL_CONFIG["name"], MODEL_CONFIG["context_length"]

# Banner summarising the selection.
_RULE = "=" * 60
print(_RULE)
print(f"ü§ñ Selected Model: {MODEL_NAME}")
print(f"üìù Description: {MODEL_CONFIG['description']}")
print(f"üìè Max Context Length: {MAX_LENGTH}")
print(_RULE)

# ============================
# CONFIGURATION
# ============================
# Paths
# All inputs and outputs are kept under one Google Drive folder.
_DRIVE_ROOT = "/content/drive/MyDrive"
DATASET_PATH = f"{_DRIVE_ROOT}/API-Pack-ALL-CLEANED"
OUTPUT_DIR = f"{_DRIVE_ROOT}/{SELECTED_MODEL}-finetuned-api"
CHECKPOINT_DIR = f"{_DRIVE_ROOT}/{SELECTED_MODEL}-checkpoints"
BEST_MODEL_DIR = f"{_DRIVE_ROOT}/{SELECTED_MODEL}-best-model"

# Training preset selector. Options: "test", "fast", "full"
SPEED_MODE = "fast"

# Each preset fixes the share of each split to use plus the main
# optimisation hyper-parameters.
configs = {
    # 1% of the data, single epoch: smoke test
    "test": dict(
        train_fraction=0.01,
        val_fraction=0.01,
        num_epochs=1,
        batch_size=8,
        eval_steps=500,
        save_steps=500,
        learning_rate=5e-5,
    ),
    # 10% of the data for quick training; smaller batch for GPT models
    "fast": dict(
        train_fraction=0.1,
        val_fraction=0.05,
        num_epochs=3,
        batch_size=4,
        eval_steps=200,
        save_steps=200,
        learning_rate=3e-5,
    ),
    # Full dataset
    "full": dict(
        train_fraction=1.0,
        val_fraction=0.1,
        num_epochs=5,
        batch_size=4,
        eval_steps=100,
        save_steps=100,
        learning_rate=2e-5,
    ),
}

config = configs[SPEED_MODE]
_train_pct = config["train_fraction"] * 100
_val_pct = config["val_fraction"] * 100
print(f"\n‚öôÔ∏è  Training Mode: {SPEED_MODE.upper()}")
print(f"   Data: {_train_pct:.0f}% training, {_val_pct:.0f}% validation")
print(f"   Epochs: {config['num_epochs']}")

# Make sure every output location exists before training starts.
for _target_dir in (OUTPUT_DIR, CHECKPOINT_DIR, BEST_MODEL_DIR):
    os.makedirs(_target_dir, exist_ok=True)

# Prefer the GPU when one is visible; report what was picked.
_cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_ready else "cpu")
print(f"üñ•Ô∏è  Device: {device}")
if _cuda_ready:
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {_gpu_props.total_memory / 1e9:.1f} GB")

# ============================
# LOAD AND PREPARE DATASET
# ============================
print("\nüì¶ Loading dataset...")
dataset = load_from_disk(DATASET_PATH)

# Sample dataset based on configuration
original_train_size = len(dataset["train"])
original_val_size = len(dataset["validation"])

train_size = int(original_train_size * config["train_fraction"])
val_size = int(original_val_size * config["val_fraction"])

dataset["train"] = dataset["train"].shuffle(seed=42).select(range(train_size))
dataset["validation"] = dataset["validation"].shuffle(seed=42).select(range(val_size))

print(f"üìä Dataset sizes:")
print(f"   Training: {len(dataset['train']):,} samples")
print(f"   Validation: {len(dataset['validation']):,} samples")

# ============================
# LOAD TOKENIZER AND MODEL
# ============================
print(f"\nüî§ Loading tokenizer for {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# GPT models need padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"ü§ñ Loading model {MODEL_NAME}...")
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to(device)

# Enable gradient checkpointing for memory efficiency
if hasattr(model, "gradient_checkpointing_enable"):
    model.gradient_checkpointing_enable()
    print("‚úÖ Gradient checkpointing enabled")

# Print model size
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"üìè Model size: {total_params/1e6:.1f}M parameters ({trainable_params/1e6:.1f}M trainable)")

# ============================
# DATA PREPROCESSING FOR GPT
# ============================
def preprocess_for_gpt(examples):
    """
    Turn a batch of source/target pairs into causal-LM training features.

    Each pair is rendered with the instruction template
    "### Instruction: {source}\\n### Response: {target}<eos>", tokenized,
    truncated/padded to MAX_LENGTH, and given labels equal to the input ids
    with padding positions set to -100 so the loss ignores them.

    Args:
        examples: batch mapping with parallel "source" and "target" sequences
            (the layout `datasets.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output (input_ids, attention_mask) plus a "labels" entry.
    """
    eos = tokenizer.eos_token
    prompts = [
        f"### Instruction: {source}\n### Response: {target}{eos}"
        for source, target in zip(examples["source"], examples["target"])
    ]

    # Tokenize
    model_inputs = tokenizer(
        prompts,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt"
    )

    # For language modeling, labels are the same as input_ids
    labels = model_inputs["input_ids"].clone()

    # Mask padding by attention mask rather than by token id: when the pad
    # token is the EOS token (set up that way earlier in this file), an
    # id-based mask would also hide the real end-of-sequence token appended
    # above, so the model would never be trained to stop.
    # NOTE(review): a data collator that rebuilds labels from input_ids would
    # override these labels -- confirm which collator the trainer uses.
    labels[model_inputs["attention_mask"] == 0] = -100
    model_inputs["labels"] = labels

    return model_inputs

print("\nüîÑ Tokenizing datasets...")
tokenized_datasets = dataset.map(
    preprocess_for_gpt,
    batched=True,
    num_proc=2,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing"
)

# ============================
# CUSTOM METRICS
# ============================
import evaluate
import re

_RESPONSE_MARKER = "### Response:"
_BLEU_SAMPLE_SIZE = 10  # BLEU is computed on the first N examples only, for speed
_metric_cache = {}      # loaded `evaluate` metrics, so they are loaded once


def _extract_response(text):
    """Return the text after the last response marker, or `text` if absent."""
    if _RESPONSE_MARKER in text:
        return text.split(_RESPONSE_MARKER)[-1].strip()
    return text


def compute_metrics(eval_preds):
    """Compute exact-match and BLEU between predicted and reference responses.

    Args:
        eval_preds: pair ``(predictions, labels)``. ``predictions`` may be raw
            logits of shape (batch, seq, vocab), already-reduced token ids of
            shape (batch, seq), or a tuple whose first element is one of
            those. ``labels`` are token ids with -100 at ignored positions.

    Returns:
        dict with "exact_match" (fraction of examples whose extracted response
        equals the reference) and "bleu" (sacreBLEU score scaled to 0..1,
        computed on the first ``_BLEU_SAMPLE_SIZE`` examples). BLEU is 0.0
        when it cannot be computed; the failure is printed, not hidden.
    """
    predictions, labels = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits must be reduced to token ids before they can be decoded.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # In a causal LM, the output at position i is the guess for token i + 1.
    # When predictions are per-position outputs aligned with the inputs (same
    # shape as labels), drop the last prediction and the first label so that
    # both arrays describe the same tokens.
    if predictions.shape == labels.shape and labels.shape[-1] > 1:
        predictions = predictions[:, :-1]
        labels = labels[:, 1:]

    pad_id = tokenizer.pad_token_id
    scored = labels != -100
    labels = np.where(scored, labels, pad_id)
    # Negative ids (e.g. -100 padding) cannot be decoded.
    predictions = np.where(predictions >= 0, predictions, pad_id)
    if predictions.shape == labels.shape:
        # Blank out whatever the model emitted at positions the labels ignore.
        predictions = np.where(scored, predictions, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    generated_responses = [_extract_response(text) for text in decoded_preds]
    expected_responses = [_extract_response(text) for text in decoded_labels]

    if not generated_responses:
        # Nothing was evaluated; avoid dividing by zero below.
        return {"exact_match": 0.0, "bleu": 0.0}

    exact_matches = sum(
        1
        for gen, exp in zip(generated_responses, expected_responses)
        if gen.strip() == exp.strip()
    )

    # Simple BLEU calculation (best effort: a failure must not abort training,
    # but it is reported because this score drives best-model selection).
    try:
        bleu = _metric_cache.get("sacrebleu")
        if bleu is None:
            bleu = _metric_cache["sacrebleu"] = evaluate.load("sacrebleu")
        bleu_result = bleu.compute(
            predictions=generated_responses[:_BLEU_SAMPLE_SIZE],
            references=[[exp] for exp in expected_responses[:_BLEU_SAMPLE_SIZE]],
        )
        bleu_score = bleu_result['score'] / 100
    except Exception as exc:
        print(f"WARNING: BLEU computation failed ({exc!r}); reporting 0.0")
        bleu_score = 0.0

    return {
        "exact_match": exact_matches / len(generated_responses),
        "bleu": bleu_score
    }

# ============================
# CHECKPOINT MANAGEMENT
# ============================
# JSON side-car file that records the best score and evaluation history.
checkpoint_info_path = os.path.join(CHECKPOINT_DIR, "checkpoint_info.json")

class GPTTrainerWithCheckpoints(Trainer):
    """Trainer that additionally tracks the best BLEU checkpoint.

    After every checkpoint save that has an ``eval_bleu`` value, the metrics
    are appended to a history stored at ``checkpoint_info_path``. Whenever
    BLEU improves on the best value seen so far (including a value restored
    from a previous run's file), the model and tokenizer are written to
    ``BEST_MODEL_DIR``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.best_metric = float('-inf')
        self.checkpoint_info = {
            "best_metric": self.best_metric,
            "best_checkpoint": None,
            "training_history": []
        }

        # Resume bookkeeping from an earlier run when its file is readable.
        if os.path.exists(checkpoint_info_path):
            try:
                with open(checkpoint_info_path, 'r') as f:
                    stored = json.load(f)
            except (OSError, json.JSONDecodeError) as exc:
                print(f"WARNING: ignoring unreadable {checkpoint_info_path}: {exc}")
                stored = None

            if isinstance(stored, dict):
                # update() keeps the defaults above for keys the file lacks,
                # so later appends to "training_history" cannot KeyError.
                self.checkpoint_info.update(stored)
                restored = self.checkpoint_info.get("best_metric")
                if isinstance(restored, (int, float)):
                    self.best_metric = restored
                else:
                    self.checkpoint_info["best_metric"] = self.best_metric
                if not isinstance(self.checkpoint_info.get("training_history"), list):
                    self.checkpoint_info["training_history"] = []

    def _latest_eval_metrics(self):
        """Return the logged eval metrics for the current step, if any."""
        for entry in reversed(self.state.log_history):
            if "eval_bleu" in entry:
                # Only trust an evaluation made at the step being saved.
                if entry.get("step") == self.state.global_step:
                    return dict(entry)
                return None
        return None

    def _save_checkpoint(self, model, trial, metrics=None):
        """Save the checkpoint, then update best-model bookkeeping.

        Args:
            model: model being checkpointed (forwarded to the parent).
            trial: hyper-parameter search trial (forwarded to the parent).
            metrics: evaluation metrics for this step, when the caller
                provides them; otherwise they are looked up in the log
                history.
        """
        import inspect

        # Some transformers releases define the parent hook without a
        # `metrics` parameter; forward it only when it is accepted.
        parent_save = super()._save_checkpoint
        if "metrics" in inspect.signature(parent_save).parameters:
            parent_save(model, trial, metrics=metrics)
        else:
            parent_save(model, trial)

        if not metrics:
            metrics = self._latest_eval_metrics()

        if metrics and "eval_bleu" in metrics:
            self.checkpoint_info["training_history"].append({
                "step": self.state.global_step,
                "metrics": metrics,
                "timestamp": datetime.now().isoformat()
            })

            if metrics["eval_bleu"] > self.best_metric:
                self.best_metric = metrics["eval_bleu"]
                self.checkpoint_info["best_metric"] = self.best_metric
                self.checkpoint_info["best_checkpoint"] = self.state.global_step

                print(f"\nüèÜ New best model! BLEU: {self.best_metric:.4f}")
                self.model.save_pretrained(BEST_MODEL_DIR)
                # Prefer `processing_class` where it exists; fall back to the
                # older `tokenizer` attribute.
                text_processor = getattr(self, "processing_class", None)
                if text_processor is None:
                    text_processor = getattr(self, "tokenizer", None)
                if text_processor is not None:
                    text_processor.save_pretrained(BEST_MODEL_DIR)

            # Save checkpoint info
            with open(checkpoint_info_path, 'w') as f:
                json.dump(self.checkpoint_info, f, indent=2)

# ============================
# TRAINING SETUP
# ============================
print("\n‚öôÔ∏è  Setting up training...")

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # GPT uses causal LM, not masked LM
    pad_to_multiple_of=8  # Efficient padding
)

# Evaluate and save on the same step schedule so every checkpoint has metrics;
# the best checkpoint is chosen by BLEU (higher is better).
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=config["num_epochs"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"] * 2,
    eval_steps=config["eval_steps"],
    save_steps=config["save_steps"],
    warmup_steps=100,
    learning_rate=config["learning_rate"],
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_bleu",
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision on GPU
    gradient_checkpointing=True,
    gradient_accumulation_steps=2,  # Accumulate gradients for larger effective batch
    dataloader_num_workers=2,
    report_to="none",
    remove_unused_columns=False,
)


def _logits_to_token_ids(logits, labels):
    """Reduce logits to predicted token ids before they are accumulated.

    Without this, evaluation keeps a (batch, seq, vocab) float tensor for
    every batch, which quickly exhausts memory. `labels` is unused but is
    part of the hook's signature.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Newer transformers releases take the tokenizer as `processing_class`; older
# ones only know `tokenizer`. Pick whichever this installation accepts.
import inspect
_trainer_init_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_init_params else "tokenizer"
)

# Initialize trainer
trainer = GPTTrainerWithCheckpoints(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_logits_to_token_ids,
    callbacks=[
        # Stop when BLEU has not improved by at least 0.001 for 3 evaluations.
        EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.001
        )
    ],
    **{_tokenizer_kwarg: tokenizer},
)

# ============================
# TRAIN
# ============================
# Banner with the run's key settings.
_train_rule = "=" * 60
print("\n" + _train_rule)
print("üèãÔ∏è  STARTING GPT FINE-TUNING")
print(_train_rule)
print(f"Model: {MODEL_NAME}")
print(f"Training samples: {len(tokenized_datasets['train']):,}")
print(f"Batch size: {config['batch_size']} (effective: {config['batch_size']*2})")
print(f"Epochs: {config['num_epochs']}")
print(f"Learning rate: {config['learning_rate']}")
print(_train_rule)

start_time = time.time()

try:
    # Run the optimisation loop.
    train_result = trainer.train()

    # Persist the final weights and the trainer state.
    print("\nüíæ Saving final model...")
    trainer.save_model(OUTPUT_DIR)
    trainer.save_state()

    # Wall-clock duration in minutes, measured from just before train().
    training_time = (time.time() - start_time) / 60
    print(f"\n‚úÖ Training completed in {training_time:.1f} minutes!")

    # One last pass over the validation split.
    print("\nüìä Final evaluation...")
    eval_results = trainer.evaluate()

    # Report only the evaluation entries, without their "eval_" prefix.
    print("\nüìà Final Metrics:")
    for key, value in eval_results.items():
        if not key.startswith("eval_"):
            continue
        print(f"   {key.replace('eval_', '')}: {value:.4f}")

except KeyboardInterrupt:
    # Manual stop: keep whatever has been learned so far.
    print("\n‚ö†Ô∏è  Training interrupted!")
    trainer.save_model(os.path.join(CHECKPOINT_DIR, "interrupted"))
    print("Checkpoint saved.")

# ============================
# TEST THE FINE-TUNED MODEL
# ============================
print("\n" + "="*60)
print("üß™ TESTING FINE-TUNED GPT MODEL")
print("="*60)

# Load best model.
# BEST_MODEL_DIR is created up front by the setup code, so the directory
# existing says nothing about whether a model was ever saved into it. Check
# for the config file that `save_pretrained` writes instead; loading from an
# empty directory would fail.
_best_model_config = os.path.join(BEST_MODEL_DIR, "config.json")
if os.path.isfile(_best_model_config):
    print(f"Loading best model from {BEST_MODEL_DIR}")
    test_model = AutoModelForCausalLM.from_pretrained(BEST_MODEL_DIR)
    test_tokenizer = AutoTokenizer.from_pretrained(BEST_MODEL_DIR)
else:
    # No best model was saved: fall back to the model object still in memory.
    print(f"Loading final model from {OUTPUT_DIR}")
    test_model = model
    test_tokenizer = tokenizer

test_model = test_model.to(device)
test_model.eval()

# Test generation function
def generate_api_code(instruction, max_length=256):
    """Generate API code for a natural-language instruction.

    The instruction is wrapped in the same "### Instruction / ### Response"
    template used for fine-tuning, and a completion is sampled from the model
    (temperature 0.7, nucleus sampling with top_p 0.9).

    Args:
        instruction: natural-language description of the code to produce.
        max_length: cap on the total sequence length passed to ``generate``
            (prompt tokens included).

    Returns:
        The generated response text, with the prompt and any follow-on
        template turn removed, stripped of surrounding whitespace.
    """
    prompt = f"### Instruction: {instruction}\n### Response:"

    inputs = test_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    inputs = inputs.to(device)
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = test_model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            pad_token_id=test_tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9,
        )

    # The returned sequence starts with the prompt tokens; keep only what was
    # generated after them. Splitting the decoded text on the last
    # "### Response:" would pick up a later, hallucinated turn whenever the
    # model keeps writing template blocks.
    completion_ids = outputs[0][prompt_token_count:]
    completion = test_tokenizer.decode(completion_ids, skip_special_tokens=True)

    # Cut the completion at the first follow-on template marker, if any.
    for marker in ("### Instruction:", "### Response:"):
        completion = completion.split(marker)[0]

    return completion.strip()

# Test examples
# Fixed prompts used as a quick smoke test of the fine-tuned model.
test_examples = [
    "Create a Python function to fetch user data from GitHub API",
    "Write code to post a message to Slack webhook",
    "Generate a request to OpenWeatherMap API for current weather"
]

print("\nüìù Testing with examples:\n")
for _number, _instruction in enumerate(test_examples, start=1):
    # Show only the first 200 characters of each generation.
    _preview = generate_api_code(_instruction)[:200]
    print(f"Example {_number}:")
    print(f"Instruction: {_instruction}")
    print(f"Generated: {_preview}...")
    print("-" * 40)

# Closing summary of where the artefacts were written.
print("\n‚úÖ Fine-tuning complete!")
print("üìÅ Models saved:")
print(f"   - Best model: {BEST_MODEL_DIR}")
print(f"   - Final model: {OUTPUT_DIR}")

# ============================
# INTERACTIVE TESTING
# ============================
print("\n" + "="*60)
print("üéÆ INTERACTIVE API CODE GENERATION")
print("Type 'quit' to exit")
print("="*60)

# Read instructions until the user types 'quit'. End-of-input (no interactive
# stdin) and Ctrl-C also end the session cleanly instead of raising.
while True:
    try:
        user_input = input("\nüìù Enter API instruction (or 'quit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        print()
        break

    if user_input.lower() == 'quit':
        break

    # Ignore empty lines and prompt again.
    if not user_input:
        continue

    print("\n‚öôÔ∏è  Generating...")
    generated = generate_api_code(user_input)
    print(f"\nüéØ Generated Code:\n{generated}")