In [None]:
# Check GPU availability.
# Runs the `nvidia-smi` shell command through IPython's `!` escape; run this
# first so a missing accelerator is noticed before any model is loaded.
!nvidia-smi

In [None]:
import json
import os

# Kaggle data path. The last path component is the dataset folder name, and it
# must match the name the upload instructions below tell the user to choose.
data_path = "/kaggle/input/mitre-datset"

if os.path.exists(data_path):
    # List everything in the dataset folder with its size in MiB.
    print("üìÇ Files in data folder:")
    for entry_name in os.listdir(data_path):
        size_mb = os.path.getsize(os.path.join(data_path, entry_name)) / (1024**2)
        print(f"   {entry_name}: {size_mb:.1f} MB")

    # Verify JSONL: count the lines of each expected split file.
    for filename in ['train.jsonl', 'val.jsonl', 'test.jsonl']:
        filepath = os.path.join(data_path, filename)
        if os.path.exists(filepath):
            # Separate name for the handle so it does not shadow a loop variable.
            with open(filepath, 'r', encoding='utf-8') as handle:
                count = sum(1 for _ in handle)
            print(f"‚úÖ {filename}: {count:,} examples")
        else:
            print(f"‚ùå {filename}: NOT FOUND")
else:
    print("‚ùå Dataset not found!")
    print("üìå Upload data: Click 'Add Data' > 'Upload' > 'New Dataset'")
    # Derive the name from data_path so the instructions can never disagree with
    # the folder this notebook actually reads (the text used to be hard-coded
    # to a differently spelled name).
    print(f"   Name it: {os.path.basename(data_path)}")
    print("   Upload: train.jsonl, val.jsonl, test.jsonl")

In [None]:
# Install required packages
print("üì¶ Installing dependencies...")
!pip install -q transformers datasets accelerate peft
print("‚úÖ Dependencies installed!")

In [None]:
# Cell 4: Configuration for Kaggle
# Every value a reader might want to tune lives in this cell; later cells only
# read these names.
import torch

# --- Model -------------------------------------------------------------------
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

# --- Kaggle paths ------------------------------------------------------------
DATA_PATH = "/kaggle/input/mitre-datset"
TRAIN_FILE = f"{DATA_PATH}/train.jsonl"
VAL_FILE = f"{DATA_PATH}/val.jsonl"
OUTPUT_DIR = "/kaggle/working/checkpoints"
FINAL_MODEL_DIR = "/kaggle/working/fine_tuned_model"

# --- Training settings ---------------------------------------------------------
# MAX_LENGTH is passed to the tokenizer as the truncation limit; tokenization
# itself is done with padding=False.
MAX_LENGTH = 4096
BATCH_SIZE = 4            # per-device batch size
GRAD_ACCUM_STEPS = 4      # effective batch = BATCH_SIZE * GRAD_ACCUM_STEPS = 16
NUM_EPOCHS = 5
LEARNING_RATE = 3e-4

# --- Summary -------------------------------------------------------------------
_config_summary = [
    "‚úÖ Configuration loaded (Kaggle)",
    f"   Model: {MODEL_NAME}",
    f"   MAX_LENGTH: {MAX_LENGTH} tokens (optimized for 7-log chunks)",
    f"   Batch size: {BATCH_SIZE} (increased for smaller chunks!)",
    f"   Effective batch: {BATCH_SIZE * GRAD_ACCUM_STEPS}",
    f"   Dynamic padding: Each batch pads to its longest example",
    f"   Epochs: {NUM_EPOCHS}",
    f"   LoRA: r=32, alpha=64 (increased capacity)",
    f"   Estimated time: ~3-4 hours (faster with optimized batch size!)",
]
print("\n".join(_config_summary))

In [None]:
# Cell 5: Load model WITHOUT quantization (T4 has 15GB, should fit)
print("\nüîÑ Loading model...")

from transformers import AutoModelForCausalLM, AutoTokenizer

# Weights are loaded in FP16 (no 4-bit quantization) and placed by device_map="auto".
# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repository to run in this kernel - confirm it is actually needed for MODEL_NAME.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # Use FP16 instead of 4-bit
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Only fall back to EOS when the tokenizer defines no pad token of its own.
# Overwriting an existing pad token makes padding indistinguishable from a real
# end-of-sequence token, and the change is persisted when the tokenizer is saved.
# NOTE(review): presumably this tokenizer already ships a pad token - confirm.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"‚úÖ Model loaded: {MODEL_NAME}")
print(f"üìä GPU Memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB / 15 GB")

In [None]:
# Cell 6: Configure LoRA (REQUIRED!)
print("\nüîÑ Configuring LoRA...")

from peft import LoraConfig, get_peft_model

# Gradient checkpointing is switched on, and the generation cache is switched
# off because the two are not meant to be used together during training.
model.gradient_checkpointing_enable()
model.config.use_cache = False

# Freeze every base-model weight; only the LoRA adapters added below will train.
# Every 1-D parameter is additionally upcast to fp32 (this covers layer-norm
# weights and any other 1-D tensors such as bias vectors).
for base_param in model.parameters():
    base_param.requires_grad = False
    if base_param.ndim != 1:
        continue
    base_param.data = base_param.data.to(torch.float32)

# Called after freezing and before the adapters are attached.
model.enable_input_require_grads()

# Adapters are attached to the four attention projection modules.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=attention_projections,
    r=32,
    lora_alpha=64,       # alpha = 2 * r
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("‚úÖ LoRA configured")
print(f"üìä GPU Memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB / 15 GB")

In [None]:
# Cell 7: Load dataset
print("\nüîÑ Loading dataset...")

from datasets import load_dataset

# Map split names to the JSONL files configured in the configuration cell.
split_files = {'train': TRAIN_FILE, 'validation': VAL_FILE}
dataset = load_dataset('json', data_files=split_files)

# Report split sizes and show the first training record as a sanity check.
n_train = len(dataset['train'])
n_val = len(dataset['validation'])
print(f"‚úÖ Dataset loaded:")
print(f"   Training: {n_train} examples")
print(f"   Validation: {n_val} examples")
print(f"\nüìã Sample entry:")
print(dataset['train'][0])

In [None]:
# Cell 7b: Analyze actual token lengths in your data
print("\nüîç Analyzing token lengths in your dataset...")
print("This helps determine if MAX_LENGTH needs adjustment\n")

import random
import numpy as np

# Fixed seed so the same examples are sampled on every run.
random.seed(42)

train_split = dataset['train']
sample_size = min(1000, len(train_split))
sample_indices = random.sample(range(len(train_split)), sample_size)


def _untruncated_token_count(example):
    """Return the token count of one example rendered in the training prompt layout."""
    formatted = f"""{example['instruction']}

### Input:
{example['input']}

### Response:
{example['output']}"""
    # truncation=False so the real length is measured, not the capped one.
    return len(tokenizer(formatted, truncation=False)['input_ids'])


token_lengths = np.array([_untruncated_token_count(train_split[i]) for i in sample_indices])

# Statistics
longest = token_lengths.max()
mean_length = token_lengths.mean()
p99 = np.percentile(token_lengths, 99)

print(f"üìä Token Length Statistics ({sample_size} examples):")
print(f"   Min:     {token_lengths.min():,} tokens")
print(f"   Max:     {longest:,} tokens")
print(f"   Mean:    {mean_length:,.0f} tokens")
print(f"   Median:  {np.median(token_lengths):,.0f} tokens")
print(f"   95th %:  {np.percentile(token_lengths, 95):,.0f} tokens")
print(f"   99th %:  {p99:,.0f} tokens")

# Check if MAX_LENGTH is sufficient
if not longest > MAX_LENGTH:
    print(f"\n‚úÖ MAX_LENGTH ({MAX_LENGTH}) is sufficient for all examples")
    recommended = int(p99)
    print(f"   Recommended MAX_LENGTH: {recommended} (99th percentile)")
else:
    print(f"\n‚ö†Ô∏è WARNING: Some examples exceed MAX_LENGTH ({MAX_LENGTH})!")
    over_limit = (token_lengths > MAX_LENGTH).sum()
    print(f"   {over_limit} examples ({over_limit/sample_size*100:.1f}%) will be truncated")
    print(f"   Consider increasing MAX_LENGTH to {int(p99)} (99th percentile)")

# Compare padding every example to MAX_LENGTH with the sampled mean length.
print(f"\nüí° BENEFITS OF DYNAMIC PADDING:")
print(f"   Without: Every example padded to {MAX_LENGTH} = {MAX_LENGTH:,} tokens/example")
print(f"   With:    Average padding to {mean_length:,.0f} tokens/example")
print(f"   Savings: {(1 - mean_length/MAX_LENGTH)*100:.1f}% less computation!")

In [None]:
# Cell 8: Format and tokenize dataset
print("\nüîÑ Formatting and tokenizing dataset...")

# The formatting code below reads the columns: instruction, input, output
train_columns = dataset['train'].column_names
print(f"Dataset columns: {train_columns}")

def format_prompt(example):
    """Render one record as a single training string.

    Args:
        example: mapping with 'instruction', 'input' and 'output' entries.

    Returns:
        The instruction, an "### Input:" section and an "### Response:" section,
        with a blank line before each section header.
    """
    instruction = example['instruction']
    model_input = example['input']
    response = example['output']
    return f"{instruction}\n\n### Input:\n{model_input}\n\n### Response:\n{response}"

def tokenize_function(examples):
    """Tokenize a batch of records and build response-only training labels.

    Reads the notebook-level names `tokenizer`, `MAX_LENGTH` and `format_prompt`.

    Args:
        examples: batch mapping with parallel lists under 'instruction',
            'input' and 'output' (the shape `dataset.map(..., batched=True)`
            passes in).

    Returns:
        The tokenizer output for the batch (unpadded, truncated to MAX_LENGTH)
        with an added "labels" entry. Each label list has exactly the same
        length as its `input_ids`; positions up to and including the
        "### Response:" marker are -100 so they are ignored by the loss.
    """
    response_marker = "### Response:"

    # Format each example into one prompt string.
    texts = [
        format_prompt({'instruction': inst, 'input': inp, 'output': out})
        for inst, inp, out in zip(
            examples['instruction'],
            examples['input'],
            examples['output']
        )
    ]

    # No padding here: sequences stay ragged and are padded per batch later.
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False
    )

    labels = []
    for text, input_ids in zip(texts, tokenized['input_ids']):
        # NOTE(review): find() returns the FIRST occurrence, so a marker that
        # appears inside the instruction/input text would start the loss early.
        marker_pos = text.find(response_marker)

        if marker_pos == -1:
            # Fallback: no marker found, train on every token.
            labels.append(list(input_ids))
            continue

        # Token count of everything up to and including the marker.
        # NOTE(review): the prefix is tokenized on its own, so the boundary may be
        # off by a token if the tokenizer merges across the marker's end - confirm.
        prefix_text = text[:marker_pos + len(response_marker)]
        prefix_ids = tokenizer(prefix_text, add_special_tokens=False)['input_ids']

        # Clamp to the (possibly truncated) sequence length. Without this, an
        # example truncated before its response produced MORE labels than
        # input_ids, which breaks batching.
        masked_count = min(len(prefix_ids), len(input_ids))
        labels.append([-100] * masked_count + list(input_ids[masked_count:]))

    # NOTE(review): no EOS token is appended after the response, so the model is
    # not explicitly shown where a response ends - confirm this is intended.
    tokenized["labels"] = labels

    return tokenized

print("\nüîÑ Tokenizing...")
# Drop the raw text columns so only the tokenizer outputs and labels remain.
raw_columns = dataset['train'].column_names
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=raw_columns)

n_train_tokenized = len(tokenized_dataset['train'])
n_val_tokenized = len(tokenized_dataset['validation'])
print(f"‚úÖ Dataset tokenized")
print(f"   Training: {n_train_tokenized} examples")
print(f"   Validation: {n_val_tokenized} examples")

In [None]:
# Cell 9: Set up training arguments with better evaluation
print("\nüîÑ Setting up training arguments...")

from transformers import TrainingArguments
import numpy as np

# Batch geometry and run length, all taken from the configuration cell.
batching_options = dict(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
)

# Optimizer, schedule and regularization.
optimization_options = dict(
    learning_rate=LEARNING_RATE,
    optim="adamw_torch",
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=1.0,
    label_smoothing_factor=0.1,
)

# Mixed precision and memory saving.
memory_options = dict(
    fp16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

# Logging, evaluation and checkpointing cadence (all counted in steps).
monitoring_options = dict(
    logging_steps=25,
    logging_first_step=True,
    logging_nan_inf_filter=True,
    report_to="none",
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,                 # cap on the number of checkpoints kept on disk
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,            # lower eval_loss is better
)

# Data loading.
dataloader_options = dict(
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
)

training_args = TrainingArguments(
    **batching_options,
    **optimization_options,
    **memory_options,
    **monitoring_options,
    **dataloader_options,
)

effective_batch = BATCH_SIZE * GRAD_ACCUM_STEPS
# Rough estimate only: floor(examples / effective batch) * epochs.
approx_total_steps = len(tokenized_dataset['train']) // effective_batch * NUM_EPOCHS

print("‚úÖ Training arguments configured")
print(f"   Batch size: {BATCH_SIZE} per device (optimized for 7-log chunks)")
print(f"   Effective batch size: {effective_batch}")
print(f"   Total steps: ~{approx_total_steps}")
print(f"   Learning rate: {LEARNING_RATE} (cosine schedule with 10% warmup)")
print(f"   Weight decay: 0.01, Label smoothing: 0.1")
print(f"   Eval frequency: every 100 steps (more frequent monitoring)")
print(f"   üéØ Loss computed ONLY on Response (instruction/input masked)")

In [None]:
# Cell 10: Create Trainer with early stopping
print("\nüîÑ Creating trainer...")

# DataCollatorForLanguageModeling stays imported only so the name remains
# available to any code that still refers to it; it is no longer used here.
from transformers import (
    Trainer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
)

# The tokenized dataset already carries a per-example "labels" column in which
# the prompt positions are -100. DataCollatorForLanguageModeling(mlm=False) is
# the wrong tool for that: it does not pad "labels" (ragged label lists cannot
# be turned into a tensor) and it rebuilds labels from input_ids, discarding
# the response-only mask. DataCollatorForSeq2Seq pads input_ids/attention_mask
# to the longest example in the batch and pads the existing labels with -100.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,              # dynamic padding: longest example in each batch
    label_pad_token_id=-100,   # padded label positions are ignored by the loss
)

# Add early stopping to prevent overfitting
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=5,      # stop after 5 evaluations without improvement
    early_stopping_threshold=0.005  # minimum change that counts as an improvement
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    callbacks=[early_stopping],
)

print("‚úÖ Trainer created with early stopping")
print(f"   Early stopping patience: 5 evaluations")
print(f"   Will stop if loss doesn't improve by 0.005")
print(f"   Label smoothing: 0.1 (better generalization)")
print(f"   Response-only loss: Ignores instruction/input tokens")

print(f"üìä GPU Memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB / 15 GB")

In [None]:
# Cell 11: Start training with progress tracking
import time

banner = "=" * 80

# Echo the run configuration once so it is captured next to the training log.
for summary_line in (
    "\nüöÄ Starting training...\n",
    "üí° OPTIMIZED TRAINING CONFIGURATION:",
    f"   Data: 7-log chunks (~2,200 tokens each)",
    f"   MAX_LENGTH: {MAX_LENGTH} tokens (plenty of headroom)",
    f"   Batch size: {BATCH_SIZE} (4x faster than before!)",
    f"   Effective batch: {BATCH_SIZE * GRAD_ACCUM_STEPS}",
    f"   NUM_EPOCHS: {NUM_EPOCHS}",
    f"   LoRA: r={32}, alpha={64}",
    f"   Learning rate: {LEARNING_RATE} with cosine schedule",
    f"   Label smoothing: 0.1 (better generalization)",
    f"   üéØ Response-only loss (ignores instruction/input)",
    f"\n‚è±Ô∏è Estimated time: ~3-4 hours (faster with optimized batching!)\n",
    banner,
):
    print(summary_line)

# Wall-clock timing around the whole training run.
start_time = time.time()
train_result = trainer.train()
elapsed_time = time.time() - start_time

print("\n" + banner)
print("‚úÖ TRAINING COMPLETED")
print(banner)
print(f"‚è±Ô∏è Total time: {elapsed_time/3600:.2f} hours ({elapsed_time/60:.1f} minutes)")
print(f"üìä Final training loss: {train_result.training_loss:.4f}")
print(f"üìä Total steps: {train_result.global_step}")

# Final pass over the validation split; perplexity is exp(eval_loss).
print("\nüîÑ Running final evaluation...")
final_metrics = trainer.evaluate()
final_eval_loss = final_metrics['eval_loss']
print(f"üìä Final validation loss: {final_eval_loss:.4f}")
print(f"üìä Perplexity: {np.exp(final_eval_loss):.2f}")
print(banner)

In [None]:
# Cell 12: Save the fine-tuned model
print("\nüíæ Saving model...")

# Write the model first, then the tokenizer, into the same output folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(FINAL_MODEL_DIR)

print(f"‚úÖ Model saved to: {FINAL_MODEL_DIR}")
print("\nüì• Download model:")
print("   See next cell to create download link")

In [None]:
# Cell 13: Download model from Kaggle
import os
import shutil

print("üì¶ Creating downloadable zip...")

# Zip the model with the standard library instead of shelling out to `zip`:
#   * the archive is rebuilt from scratch on every run (`zip -r` updates an
#     existing archive in place, so files from an earlier run could linger);
#   * no variable is interpolated into a shell command line;
#   * entries are stored relative to the model folder's parent, so the archive
#     unpacks to a single "<model folder>/" directory rather than re-creating
#     the absolute path.
model_parent, model_folder = os.path.split(os.path.abspath(FINAL_MODEL_DIR))
shutil.make_archive(
    "fine_tuned_model",      # ".zip" is appended -> fine_tuned_model.zip in the cwd
    "zip",
    root_dir=model_parent,
    base_dir=model_folder,
)

print(f"‚úÖ Model zipped: fine_tuned_model.zip")
print(f"   Size: {os.path.getsize('fine_tuned_model.zip') / (1024**2):.1f} MB")
print("\nüì• Download: Check the 'Output' section on the right")
print("   Click on fine_tuned_model.zip to download")