# Phase 2: Fine-Tuning LawBot with Qwen2.5-1.5B

## Objectives:
1. Load Qwen2.5-1.5B-Instruct model
2. Apply 4-bit QLoRA using Unsloth
3. Fine-tune on legal Q&A data
4. Evaluate model performance
5. Save adapter weights


In [None]:
# Install unsloth for fast fine-tuning
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

# Import libraries
from unsloth import is_bfloat16_supported
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer

# Report the runtime environment up front so a missing GPU is obvious
# before any model is loaded.
has_cuda = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")
if has_cuda:
    # Only device index 0 is reported.
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")


## Step 1: Load and Prepare Data


In [None]:
import json

# Load train and validation data
def load_jsonl(filename):
    """Read a JSON Lines file into a list of parsed records.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        filename: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# Read both splits into plain Python lists.
train_data = load_jsonl('../data/processed/train.jsonl')
val_data = load_jsonl('../data/processed/val.jsonl')

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")

# Re-serialise each split as JSON Lines under /tmp (one record per line,
# non-ASCII characters kept as-is) so the datasets library can read them.
for tmp_path, records in (('/tmp/train.jsonl', train_data),
                          ('/tmp/val.jsonl', val_data)):
    with open(tmp_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

# Build the dataset object with a 'train' and a 'val' split from the temp files.
dataset = load_dataset(
    'json',
    data_files={'train': '/tmp/train.jsonl', 'val': '/tmp/val.jsonl'},
)
print(dataset)


## Step 2: Load Qwen2.5-1.5B Model with QLoRA


In [None]:
# Load unsloth/Qwen2.5-1.5B-Instruct with 4-bit weights (load_in_4bit=True).
# NOTE(review): dtype=None presumably lets Unsloth choose the compute dtype
# for the current GPU - confirm against the Unsloth documentation.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=True,
    dtype=None,
    max_seq_length=2048,
    model_name="unsloth/Qwen2.5-1.5B-Instruct",
)

# Attach LoRA adapters to the listed projection modules; everything below is
# passed straight through to Unsloth's get_peft_model.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    random_state=3407,
    use_gradient_checkpointing=True,
)

# The sequence length printed here is the same literal passed to
# from_pretrained above.
print(f"Model loaded: {model.config.name_or_path}")
print(f"Max sequence length: {2048}")


## Step 3: Prepare Dataset Format


In [None]:
def format_instruction(data):
    """Render one record as a single-turn Qwen2.5 chat training string.

    Args:
        data: Mapping with an "instruction" entry (the user turn) and an
            "output" entry (the assistant turn).

    Returns:
        The user turn followed by the assistant turn, each wrapped in
        ``<|im_start|>`` / ``<|im_end|>`` markers and newline-terminated.

    Raises:
        KeyError: If "instruction" or "output" is missing from ``data``.
    """
    user_turn = data["instruction"]
    assistant_turn = data["output"]

    segments = (
        f"<|im_start|>user\n{user_turn}<|im_end|>\n",
        f"<|im_start|>assistant\n{assistant_turn}<|im_end|>\n",
    )
    return "".join(segments)

# Add a "text" column holding the rendered chat string for every record.
dataset = dataset.map(lambda example: dict(text=format_instruction(example)))

# Show the first training example as a quick visual check of the template.
print("Sample formatted text:")
print(dataset["train"][0]["text"])


## Step 4: Fine-Tuning Configuration


In [None]:
# Supervised fine-tuning on the pre-rendered "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    # Name the text column explicitly: with packing=False, some TRL releases
    # raise a ValueError when neither `dataset_text_field` nor
    # `formatting_func` is supplied.
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        # 2 sequences per step x 4 accumulation steps = 8 sequences per
        # optimizer update on each device.
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=50,
        num_train_epochs=3,
        learning_rate=2e-4,
        # Exactly one mixed-precision mode is enabled: bf16 when Unsloth
        # reports support for it, fp16 otherwise.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="../models/adapters",
        # Save and evaluate on the same (per-epoch) schedule so that
        # load_best_model_at_end can pick the checkpoint by eval_loss.
        # NOTE(review): newer transformers releases rename
        # `evaluation_strategy` to `eval_strategy` - confirm against the
        # installed version.
        save_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        save_total_limit=3,
    ),
)

print("Trainer initialized successfully")


## Step 5: Train Model


In [None]:
# Train the model
# Runs the fine-tuning loop configured on `trainer`; the value returned by
# train() is not kept.
trainer.train()

print("Training completed!")


## Step 6: Save Fine-Tuned Model


In [None]:
# Persist the model first and then the tokenizer, both into the same adapter
# directory, so the folder can be loaded on its own later.
for artifact in (model, tokenizer):
    artifact.save_pretrained("../models/adapters/lawbot_qwen_adapter")

print("Adapter weights saved successfully!")


## Step 7: Evaluate Model Performance


In [None]:
from rouge_score import rouge_scorer
from sacrebleu import BLEU
import json

# Shared metric objects used by evaluate_model.
# ROUGE-1 / ROUGE-2 / ROUGE-L with stemming enabled.
# NOTE(review): stemming presumably targets English text - confirm it suits
# the language of the legal Q&A data.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# BLEU is only used through sentence_score(); sacreBLEU recommends
# effective_order=True for sentence-level scoring (and logs a warning
# otherwise), so that n-gram orders with no matches do not zero out the
# score of a short sentence.
bleu_scorer = BLEU(effective_order=True)

def evaluate_model(model, tokenizer, dataset, num_samples=10):
    """Generate answers for the first samples and score them with ROUGE and BLEU.

    The model is switched to Unsloth inference mode, then for each sample a
    single-turn chat prompt is built from its instruction, a completion of up
    to 512 new tokens is generated, and the completion is compared with the
    reference answer. Scoring uses the module-level ``scorer`` (ROUGE) and
    ``bleu_scorer`` (BLEU) objects.

    Args:
        model: Model passed to ``FastLanguageModel.for_inference`` and used
            for ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        dataset: Sliceable sequence of mappings with "instruction" and
            "output" entries (e.g. the list returned by ``load_jsonl``).
        num_samples: Number of leading samples to evaluate.

    Returns:
        A list with one dict per evaluated sample containing truncated
        'instruction' (100 chars), 'generated' and 'ground_truth' (200 chars
        each), the 'rouge1' / 'rouge2' / 'rougeL' F-measures, and 'bleu'
        scaled to the 0-1 range.
    """
    results = []

    FastLanguageModel.for_inference(model)

    for sample in dataset[:num_samples]:
        prompt = f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
        # Follow the model's own device instead of assuming "cuda", so the
        # inputs always land where the weights are.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        outputs = model.generate(**inputs, max_new_tokens=512, use_cache=True)

        # Decode only the newly generated tokens: slicing off the prompt is
        # more robust than splitting the decoded text on the assistant
        # marker, which could also occur inside the generated answer.
        prompt_length = inputs["input_ids"].shape[1]
        completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=False)

        # Keep the text up to the first end-of-turn marker.
        generated_text = completion.split("<|im_end|>")[0]
        ground_truth = sample['output']

        # Calculate ROUGE scores (reference first, prediction second)
        rouge_scores = scorer.score(ground_truth, generated_text)

        # Calculate BLEU score (hypothesis first, list of references second)
        bleu_score = bleu_scorer.sentence_score(generated_text, [ground_truth])

        results.append({
            'instruction': sample['instruction'][:100],
            'generated': generated_text[:200],
            'ground_truth': ground_truth[:200],
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure,
            # sacreBLEU reports 0-100; rescale to match the 0-1 ROUGE values.
            'bleu': bleu_score.score / 100.0
        })

    return results

# Score the first 20 validation samples.
eval_results = evaluate_model(model, tokenizer, val_data, num_samples=20)

# Average every metric over the evaluated samples.
metric_names = ('rouge1', 'rouge2', 'rougeL', 'bleu')
avg_scores = {
    name: sum(row[name] for row in eval_results) / len(eval_results)
    for name in metric_names
}
avg_rouge1, avg_rouge2, avg_rougeL, avg_bleu = (
    avg_scores[name] for name in metric_names
)

print(f"\nEvaluation Results:")
print(f"Average ROUGE-1: {avg_rouge1:.4f}")
print(f"Average ROUGE-2: {avg_rouge2:.4f}")
print(f"Average ROUGE-L: {avg_rougeL:.4f}")
print(f"Average BLEU: {avg_bleu:.4f}")

# Persist the averages plus the first five per-sample records for manual review.
report = {
    'avg_scores': avg_scores,
    'detailed_results': eval_results[:5],
}
with open('../data/processed/evaluation_results.json', 'w') as report_file:
    json.dump(report, report_file, indent=2)

print("\nEvaluation results saved to data/processed/evaluation_results.json")


## Summary

Phase 2 completed successfully! In this notebook we:
1. ✅ Loaded Qwen2.5-1.5B-Instruct model
2. ✅ Applied QLoRA with 4-bit quantization
3. ✅ Fine-tuned on legal Q&A data (3 epochs)
4. ✅ Evaluated with ROUGE and BLEU metrics
5. ✅ Saved adapter weights

**Deliverables:**
- `models/adapters/lawbot_qwen_adapter/` - Fine-tuned adapter weights
- `data/processed/evaluation_results.json` - Performance metrics
- Training history with validation loss tracking
