# Self-Supervised Fine-Tuning of Llama-2-7B on Audit Reports

This notebook adapts **Llama-2-7B** to the domain of professional audit reports using self-supervised fine-tuning (continued pretraining).

**Objective**: Enhance Llama-2's domain fluency for audit documentation.
**Model**: `NousResearch/Llama-2-7b-hf` (Non-gated Llama 2 base model).
**Method**: QLoRA (4-bit quantization + LoRA) on a T4 GPU.

## 1. Setup and Installation
**IMPORTANT**: After installation, restart the runtime (Runtime > Restart session).

In [None]:
# Install dependencies
# -q keeps pip's output short; -U upgrades packages that are already installed.
!pip install -q -U torch torchvision torchaudio transformers peft datasets bitsandbytes trl pdfplumber accelerate

# Reminder printed for the user: the notebook expects a runtime restart before
# the following cells are executed.
print("Installation complete. Please RESTART the runtime now.")

In [None]:
import os
import glob
import pdfplumber
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel
import re

# Fix PyTorch's random seed so repeated runs start from the same RNG state.
torch.manual_seed(42)

In [None]:
import sys
from pathlib import Path

# Decide where the source PDFs live.
# On Colab they are read from Google Drive, which has to be mounted first;
# everywhere else a local ./Data folder is used.
if 'google.colab' in sys.modules:
    from google.colab import drive

    DATA_DIR = Path('/content/drive/MyDrive/Data')
    try:
        drive.mount('/content/drive')
    except Exception as mount_error:
        # Best effort: keep the notebook running, but report the failure
        # instead of claiming that the drive was mounted.
        print(f"WARNING: Google Drive mount failed ({mount_error}); "
              f"DATA_DIR {DATA_DIR} may not be available.")
    else:
        print(f"Mounted Google Drive. DATA_DIR set to: {DATA_DIR}")
else:
    DATA_DIR = Path("./Data")
    print(f"Using local Data directory: {DATA_DIR}")

## 2. Data Preparation
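Extract text from the PDFs in `DATA_DIR`, drop likely page headers and footers, normalize whitespace, and mask e-mail addresses.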

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in the PDF at *pdf_path* as one string.

    Pages for which pdfplumber yields no text are skipped. On pages with more
    than two lines, the first line is dropped when it is shorter than 50
    characters and the last line when it is shorter than 20 characters
    (presumably running headers / page numbers). Pages are joined with a
    blank line between them.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            raw = page.extract_text()
            if not raw:
                continue
            rows = raw.split('\n')
            # Heuristic cleaning, only applied when the page has enough lines
            if len(rows) > 2:
                first = 1 if len(rows[0]) < 50 else 0
                last = len(rows) - 1 if len(rows[-1]) < 20 else len(rows)
                rows = rows[first:last]
            page_texts.append("\n".join(rows))
    return "\n\n".join(page_texts)

def clean_data(text):
    """Collapse whitespace runs in *text* and mask e-mail addresses.

    Every run of whitespace becomes a single space, leading/trailing
    whitespace is removed, and anything that looks like an e-mail address is
    replaced by the literal token ``[EMAIL]``.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    trimmed = single_spaced.strip()
    return re.sub(r'[\w\.-]+@[\w\.-]+', '[EMAIL]', trimmed)

# Collect the PDFs to process. DATA_DIR is defined by the setup cell; when that
# cell has not been run the name is missing, so fall back to the local folder.
try:
    pdf_files = glob.glob(str(DATA_DIR / "*.pdf"))
except NameError:
    pdf_files = glob.glob("./Data/*.pdf")
# glob returns files in filesystem order; sort so that the document order (and
# therefore the later seeded train/test split) is the same on every run.
pdf_files = sorted(pdf_files)

raw_texts = []
print(f"Found {len(pdf_files)} PDFs.")
for pdf_file in pdf_files:
    try:
        raw_text = extract_text_from_pdf(pdf_file)
        cleaned_text = clean_data(raw_text)
    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole load,
        # but the message has to say which file failed.
        print(f"Error processing {pdf_file}: {e}")
        continue
    # Very short extractions (500 characters or fewer) are not kept
    if len(cleaned_text) > 500:
        raw_texts.append(cleaned_text)
    else:
        print(f"Skipped {pdf_file}: only {len(cleaned_text)} characters extracted.")

print(f"Loaded {len(raw_texts)} documents.")

## 3. Dataset & Tokenizer (Llama 2)
We use `NousResearch/Llama-2-7b-hf`.

In [None]:
# Create Dataset
# A 90/10 split needs at least two documents; fail early with a clear message
# instead of an obscure error from train_test_split.
if len(raw_texts) < 2:
    raise ValueError(
        f"Need at least 2 documents to build a train/test split, got {len(raw_texts)}. "
        "Check DATA_DIR and the PDF extraction step."
    )
# from_dict expects a mapping of column name -> list of values
dataset = Dataset.from_dict({"text": raw_texts})
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Load Llama 2 Tokenizer
model_id = "NousResearch/Llama-2-7b-hf"
# NOTE(review): trust_remote_code=True allows code shipped in the hub repo to
# run locally; confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# The tokenizer is given EOS as its padding token, with padding on the right
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def chunk_and_tokenize(examples, chunk_size=1024):
    """Tokenize a batch of documents and pack the tokens into fixed-size chunks.

    All documents in the batch are tokenized without truncation, their token
    ids are concatenated into one stream, and the stream is cut into
    consecutive chunks of ``chunk_size`` tokens.

    Args:
        examples: Batch mapping with a ``"text"`` list, as passed by
            ``Dataset.map(..., batched=True)``.
        chunk_size: Number of tokens per training sequence (default 1024,
            the value previously hard-coded).

    Returns:
        A dict with ``"input_ids"`` and ``"labels"``; both hold the same
        chunks. Tokens left over after the last full chunk are dropped. A
        batch shorter than one chunk is padded with the EOS id to exactly one
        chunk. An empty batch produces no chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    tokens = tokenizer(examples["text"], truncation=False, return_attention_mask=False)["input_ids"]
    concatenated_tokens = [tok for doc in tokens for tok in doc]
    total_length = len(concatenated_tokens)

    # Nothing to pack: avoid emitting a chunk made purely of padding
    if total_length == 0:
        return {"input_ids": [], "labels": []}

    if total_length >= chunk_size:
        # Keep only whole chunks; the remainder is discarded
        total_length = (total_length // chunk_size) * chunk_size
    else:
        # Too short for a single chunk: pad up to chunk_size with EOS
        concatenated_tokens += [tokenizer.eos_token_id] * (chunk_size - total_length)
        total_length = chunk_size

    chunks = [concatenated_tokens[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    return {
        "input_ids": chunks,
        # Independent copies so the two columns never share list objects
        "labels": [list(chunk) for chunk in chunks],
    }

# Apply the chunking function to every split in batches. The original columns
# of the train split are removed so only the fields returned by
# chunk_and_tokenize remain.
source_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(chunk_and_tokenize, batched=True, remove_columns=source_columns)

## 4. QLoRA Setup for Llama 2

In [None]:
# 4-bit NF4 quantisation, computing in float16, without double quantisation
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the quantised base model and let the library place it on the devices
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Gradient checkpointing is switched on before the k-bit training preparation
base_model.gradient_checkpointing_enable()
kbit_ready_model = prepare_model_for_kbit_training(base_model)

# LoRA Config for Llama 2: adapters on the attention and MLP projections
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared model with the LoRA adapters and report how much is trainable
model = get_peft_model(kbit_ready_model, peft_config)
model.print_trainable_parameters()

## 5. Training

In [None]:
torch.cuda.empty_cache()

# Hyperparameters for the run, grouped by purpose
training_options = {
    # where checkpoints go and how often they are written
    "output_dir": "./audit-llama2-finetuned",
    "save_strategy": "epoch",
    # batch of 1 with 8 accumulation steps -> 8 sequences per optimizer step per device
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    # optimisation
    "learning_rate": 2e-4,
    "num_train_epochs": 3,
    "optim": "paged_adamw_8bit",
    "fp16": True,
    # logging and periodic evaluation
    "logging_steps": 10,
    "eval_strategy": "steps",
    "eval_steps": 20,
    "report_to": "none",
}
training_args = TrainingArguments(**training_options)

# mlm=False: the collator is used for causal (not masked) language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()

## 6. Evaluation & Advanced Metrics

In [None]:
# Basic Perplexity
import math

# Perplexity is the exponential of the loss reported by the trainer's evaluation
eval_results = trainer.evaluate()
eval_loss = eval_results['eval_loss']
perplexity = math.exp(eval_loss)
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# Advanced: Cosine Similarity & Forgetting
from torch.nn.functional import cosine_similarity
import numpy as np

# The analysis below compares the model with and without its adapters. Say so
# up front when the in-memory model is not a PeftModel, instead of silently
# doing nothing.
if not isinstance(model, PeftModel):
    print("WARNING: model is not a PeftModel; the base-vs-fine-tuned "
          "comparison below is not meaningful.")

def get_sentence_embedding(model, tokenizer, text):
    """Return a mean-pooled embedding of *text* from the model's last hidden layer.

    The model is put into eval mode (and left there). The text is tokenized
    with truncation to 512 tokens, moved to the device of the model's first
    parameter, and run without gradient tracking.

    Args:
        model: Model that accepts the tokenizer output plus
            ``output_hidden_states=True`` and returns ``hidden_states``.
        tokenizer: Callable producing "pt" tensors with a ``.to(device)`` method.
        text: The sentence to embed.

    Returns:
        The last hidden state averaged over dimension 1.
    """
    model.eval()
    # Follow the model's own device rather than assuming a GPU is present
    device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    return outputs.hidden_states[-1].mean(dim=1)

def _enable_adapters_if_supported(model):
    """Best-effort call to ``model.enable_adapters()``; does nothing if unavailable."""
    try:
        model.enable_adapters()
    except (AttributeError, ValueError):
        # Method missing, or (presumably) no adapter registered through this API
        pass


def analyze_model_performance(model, tokenizer):
    """Print two quick diagnostics for the fine-tuned model.

    1. "Domain Consistency": for two audit sentences, the cosine similarity
       between the embedding with adapters active and the embedding with
       adapters disabled, averaged over the sentences. When the model has no
       ``disable_adapter`` context manager the model is its own baseline, so
       1.0 is recorded and a note is printed. Errors raised while computing
       the baseline embedding are no longer converted into a silent 1.0.
    2. "General Perplexity": exp(loss) of the model on two general-knowledge
       sentences joined by a newline.

    Args:
        model: The model to analyse (expected to be PEFT-wrapped).
        tokenizer: Tokenizer matching the model.
    """
    print("--- Advanced Analysis ---")
    audit_sents = ["The audit committee oversees financial reporting.", "Material misstatements can arise from fraud."]
    general_sents = ["The cat sat on the mat.", "Paris is the capital of France."]

    can_disable_adapters = hasattr(model, "disable_adapter")
    if not can_disable_adapters:
        print("Note: model has no disable_adapter(); it is compared with itself (similarity 1.0).")

    # Cosine Similarity
    sims = []
    for text in audit_sents:
        _enable_adapters_if_supported(model)
        ft_emb = get_sentence_embedding(model, tokenizer, text)
        if can_disable_adapters:
            with model.disable_adapter():
                base_emb = get_sentence_embedding(model, tokenizer, text)
            sims.append(cosine_similarity(ft_emb, base_emb).item())
        else:
            sims.append(1.0)

    print(f"Avg Domain Consistency: {np.mean(sims):.4f}")

    # General Perplexity
    print("Checking General Knowledge...")
    _enable_adapters_if_supported(model)
    # Place the inputs on the model's own device rather than assuming a GPU
    device = next(model.parameters()).device
    enc = tokenizer("\n".join(general_sents), return_tensors="pt").to(device)
    with torch.no_grad():
        loss = model(enc.input_ids, labels=enc.input_ids).loss
    print(f"General Perplexity: {torch.exp(loss).item():.2f}")

# Run the analysis on the current in-memory model and tokenizer.
analyze_model_performance(model, tokenizer)

In [None]:
# Save Model
from pathlib import Path

# Write to Google Drive only when it is actually mounted; on a local run (or
# after a failed mount) fall back to a folder next to the notebook.
drive_root = Path("/content/drive/MyDrive")
model_subdir = Path("Self_Supervised_finetuning_Model") / "audit-llama2-7b-qlora"
if drive_root.is_dir():
    save_path = str(drive_root / model_subdir)
else:
    save_path = str(Path(".") / model_subdir)
    print(f"Google Drive not mounted; saving locally to {save_path}")

trainer.save_model(save_path)
# Store the tokenizer next to the weights so the folder can be reloaded on its own
tokenizer.save_pretrained(save_path)
print(f"Model saved to {save_path}")