# Self-Supervised Fine-Tuning of Mistral-7B on Audit Reports

This notebook demonstrates how to adapt a Large Language Model (Mistral-7B) to the domain of professional audit reports using self-supervised fine-tuning (continued pretraining). 

**Objective**: Enhance the model's domain fluency, vocabulary, and stylistic consistency for audit documentation.
**Method**: Causal Language Modeling (Next-Token Prediction) on raw text extracted from PDF reports.
**Hardware**: Optimized for a T4 GPU (Google Colab free tier compatible) using QLoRA (4-bit quantization + LoRA).

## 1. Setup and Installation
We need to install the necessary libraries for PDF extraction, efficient model loading, and training.

**IMPORTANT**: After running the installation cell below, you MUST restart the runtime/session (Runtime > Restart session) for the updates to take effect, then run the cells starting from the imports.

In [1]:
# Install all key dependencies including PyTorch components to ensure version compatibility
!pip install -q -U torch torchvision torchaudio transformers peft datasets bitsandbytes trl pdfplumber accelerate

print("Installation complete. Please RESTART the runtime (Runtime > Restart session) to apply changes, then run the next cells.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m915.7/915.7 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.3/188.3 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m107.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9

In [2]:
import os
import glob
import pdfplumber
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import re

# Set seed for reproducibility
torch.manual_seed(42)

<torch._C.Generator at 0x7869967edeb0>

In [3]:
import sys
from pathlib import Path

# Check if running in Colab
if 'google.colab' in sys.modules:
    from google.colab import drive
    try:
        drive.mount('/content/drive')
    except:
        pass
    # Update DATA_DIR to point to mounted Drive
    # Make sure you have uploaded the Data folder to your Google Drive root
    DATA_DIR = Path('/content/drive/MyDrive/Data')
    print(f"Mounted Google Drive. DATA_DIR set to: {DATA_DIR}")
else:
    DATA_DIR = Path("./Data")
    print(f"Not running in Colab. Using local Data directory: {DATA_DIR}")

Mounted at /content/drive
Mounted Google Drive. DATA_DIR set to: /content/drive/MyDrive/Data


## 2. Data Preparation

We will extract text from the PDF audit reports located in the `Data` directory. 

**Cleaning Steps**:
- Extract text using `pdfplumber`.
- Remove potential headers and footers (heuristic: very short lines at top/bottom of pages).
- Normalize whitespace.
- Anonymize sensitive patterns (placeholder implementation).

In [4]:
def extract_text_from_pdf(pdf_path):
    text_content = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text
            text = page.extract_text()
            if not text:
                continue
            
            lines = text.split('\n')
            
            # Basic Heuristic: Remove first and last lines if they likely resemble headers/footers (e.g., page numbers or short titles)
            # Adjust this logic based on your specific PDF layout
            if len(lines) > 2:
                # Remove header if short (arbitrary length < 50 chars as a heuristic)
                if len(lines[0]) < 50:
                    lines = lines[1:]
                # Remove footer if short and looks like page number
                if len(lines) > 0 and len(lines[-1]) < 20:
                    lines = lines[:-1]
            
            page_text = "\n".join(lines)
            text_content.append(page_text)
    
    full_text = "\n\n".join(text_content)
    return full_text

def clean_data(text):
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Placeholder for anonymization (e.g., replace emails, phone numbers)
    # This regex is a simple example and should be expanded for real production use
    text = re.sub(r'[\w\.-]+@[\w\.-]+', '[EMAIL]', text)
    
    return text

# Main Data Loading Loop
try:
    data_dir = DATA_DIR
except NameError:
    data_dir = "./Data"
    
print(f"Searching for PDFs in: {data_dir}")
pdf_files = glob.glob(str(data_dir / "*.pdf"))

raw_texts = []
for pdf_file in pdf_files:
    print(f"Processing {pdf_file}...")
    try:
        raw_text = extract_text_from_pdf(pdf_file)
        cleaned_text = clean_data(raw_text)
        if len(cleaned_text) > 500: # Only keep documents with substantial content
            raw_texts.append(cleaned_text)
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")

print(f"\nSuccessfully loaded {len(raw_texts)} documents.")

Searching for PDFs in: /content/drive/MyDrive/Data
Processing /content/drive/MyDrive/Data/Annual_Review_of_Audit_Quality_2025.pdf...
Processing /content/drive/MyDrive/Data/BDO_LLP_Audit_Quality_Inspection_and_Supervision_2025.pdf...
Processing /content/drive/MyDrive/Data/Deloitte_LLP_Audit_Quality_Inspection_and_Supervision_2025.pdf...
Processing /content/drive/MyDrive/Data/Ernst__Young_LLP_Audit_Quality_Inspection_and_Supervision_2025.pdf...
Processing /content/drive/MyDrive/Data/Forvis_Mazars_LLP_Audit_Quality_Inspection_and_Supervision_2025.pdf...
Processing /content/drive/MyDrive/Data/KPMG_LLP_Audit_Quality_Inspection_and_Supervision_2025.pdf...
Processing /content/drive/MyDrive/Data/PricewaterhouseCoopers_LLP_Audit_Quality_Inspection_and_Supervision_2025.pdf...
Processing /content/drive/MyDrive/Data/Annual_Review_of_Audit_Quality_2024_7yhxTsi.pdf...
Processing /content/drive/MyDrive/Data/Tier_1_Firms__Overview_2023.pdf...
Processing /content/drive/MyDrive/Data/FRC_Audit_Quality_In

## 3. Dataset Tokenization and Chunking

We need to process the text into chunks suitable for the model's context window. 
- **Context Window**: 1024 tokens (Reduced from 2048 to save VRAM).
- **Overlap**: No overlap in packing strategy.
- **Format**: Prepare as a Hugging Face Dataset.

In [5]:
# Create HF Dataset
dataset = Dataset.from_dict({"text": raw_texts})

# Split into train and validation
dataset = dataset.train_test_split(test_size=0.1, seed=42)
print(dataset)

# Load Tokenizer
model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # Mistral has no pad token by default

def chunk_and_tokenize(examples):
    # Flatten texts into a single long string of tokens
    chunk_size = 1024 # Reduced from 2048 to save VRAM
    
    # Basic tokenization without padding/truncated
    tokens = tokenizer(examples["text"], truncation=False, return_attention_mask=False)["input_ids"]
    
    # Flatten list of lists into one big list of tokens
    concatenated_tokens = [tok for doc in tokens for tok in doc]
    
    # Calculate total length divisible by chunk_size
    # We drop the small remainder at the very end of the entire dataset
    total_length = len(concatenated_tokens)
    if total_length >= chunk_size:
        total_length = (total_length // chunk_size) * chunk_size
    else:
        # Handle highly unlikely case where entire dataset < chunk_size tokens
        # Pad to chunk_size
        concatenated_tokens += [tokenizer.eos_token_id] * (chunk_size - total_length)
        total_length = chunk_size

    # Split by chunks of max_len
    result = {
        "input_ids": [concatenated_tokens[i : i + chunk_size] for i in range(0, total_length, chunk_size)],
        "labels": [concatenated_tokens[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    }
    
    return result

# Apply processing
tokenized_dataset = dataset.map(
    chunk_and_tokenize,
    batched=True,
    remove_columns=dataset["train"].column_names,
    desc="Chunking and Tokenizing"
)

print(f"Train chunks: {len(tokenized_dataset['train'])}")
print(f"Test chunks: {len(tokenized_dataset['test'])}")

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 38
    })
    test: Dataset({
        features: ['text'],
        num_rows: 5
    })
})


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Chunking and Tokenizing:   0%|          | 0/38 [00:00<?, ? examples/s]

Chunking and Tokenizing:   0%|          | 0/5 [00:00<?, ? examples/s]

Train chunks: 488
Test chunks: 58


## 4. Model Loading with QLoRA

We load Mistral-7B in 4-bit quantization to fit on a T4 GPU.
Then we attach LoRA adapters for parameter-efficient fine-tuning.

In [6]:
# 4-bit Quantization Config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, # or bfloat16 if supported by hardware
    bnb_4bit_use_double_quant=False,
)

# Load Base Model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA Configuration
peft_config = LoraConfig(
    r=16, # Rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

trainable params: 41,943,040 || all params: 7,283,675,136 || trainable%: 0.5758


## 5. Training

We use the basic `Trainer` with `DataCollatorForLanguageModeling`. 
The objective is purely self-supervised next-token prediction.

In [7]:

# Clear cache before training
torch.cuda.empty_cache()

# Data Collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./audit-mistral-finetuned",
    per_device_train_batch_size=1, # Reduced to 1 to fit T4 VRAM
    gradient_accumulation_steps=8, # Increased to 8 to maintain effective batch size
    learning_rate=2e-4,
    logging_steps=10,
    num_train_epochs=3, # Increased to 3 epochs
    save_strategy="epoch",
    eval_strategy="steps", # Evaluate more frequently
    eval_steps=10,
    fp16=True,
    optim="paged_adamw_8bit", # Memory efficient optimizer
    report_to="none"
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

# Start Training
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
10,2.041756,1.821471
20,1.909383,1.722224
30,1.843072,1.622767
40,1.711572,1.553089
50,1.670609,1.48217
60,1.618916,1.416355
70,1.363736,1.375976
80,1.24824,1.336936
90,1.290128,1.30672
100,1.201266,1.269688


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=183, training_loss=1.3194929198489163, metrics={'train_runtime': 11170.02, 'train_samples_per_second': 0.131, 'train_steps_per_second': 0.016, 'total_flos': 6.433634912934298e+16, 'train_loss': 1.3194929198489163, 'epoch': 3.0})

## 6. Evaluation

We calculate Perplexity as a quantitative metric of how well the model predicts the domain text.

In [8]:
import math

eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 3.20


## 7. Inference

Test the model's generation capabilities on an audit-related prompt.

In [None]:
# Save the model (adapters only)
#trainer.save_model("/content/drive/MyDrive/Self_Supervised_finetuning_Model/audit-mistral-7b-qlora")

# Inference Prompt
prompt = "how audit is done and what meatrics y use"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Key Audit Matter: Revenue Recognition
The Group reported revenue of $4,520 million for the year ended 31 December 2024 (2023: $4,105 million).
The risk is that revenue may be overstated due to the pressure management may feel to achieve performance targets. We focused on the risk of cut-off and the valuation of unbilled revenue at year-end.
Our audit procedures included:
•  Agreement of the cut-off policy with management.
•  A cut-off compliance testing programme.
•  Evaluating the completeness of revenue cut-off procedures for the largest revenue stream.
•  A valuation of unbilled revenue at year-end for a sample of material contracts, including consideration of the impact of post-balance sheet events.
Further details of our work on revenue can be found in Section 2 of this report.
Further information on the firm’s internal procedures which support the audit work on revenue can be found in the firm’s Quality Control procedures in Appendix A. 10 KPMG LLP – Audit Quality Inspection (Ju

In [10]:
trainer.save_model("/content/drive/MyDrive/Self_Supervised_finetuning_Model/audit-mistral-7b-qlora")


# Cosine Similarity | General Perplexity

In [13]:
from torch.nn.functional import cosine_similarity
import numpy as np
import torch
from peft import PeftModel

# --- FIX: Ensure Adapters are Active ---
# If the model is just a base model, we attach the trained adapters we just formatted
if not isinstance(model, PeftModel):
    # If we just finished training, the model might still be wrapped in the trainer
    # We can try to use the model directly if it already has peft config
    if hasattr(model, "peft_config"):
         # It's already a PeftModel, just maybe not typed correctly or in a weird state
         pass
    else:
        # Worst case: Load adapters from the checkpoint we just saved (Safety net)
        # This assumes you ran trainer.save_model() in the previous cell
        adapter_path = "/content/drive/MyDrive/Self_Supervised_finetuning_Model/audit-mistral-7b-qlora"
        try:
            model = PeftModel.from_pretrained(model, adapter_path)
            print("Loaded adapters from disk.")
        except:
            print("Could not load adapters from disk. Using current model state.")

# 1. Define Helper to get Embeddings
def get_sentence_embedding(model, tokenizer, text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to("cuda")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    last_hidden_state = outputs.hidden_states[-1]
    return last_hidden_state.mean(dim=1) 

# 2. Define Metrics Calculation
def analyze_model_performance(model, tokenizer):
    print("--- Starting Advanced Analysis ---\n")
    
    audit_sentences = [
        "The audit committee is responsible for overseeing the financial reporting process.",
        "Material misstatements can arise from fraud or error in the financial statements.",
        "We conducted our audit in accordance with International Standards on Auditing (UK)."
    ]
    
    general_sentences = [
        "The cat sat on the mat and looked out the window at the rain.",
        "The capital of France is Paris, which is known for its culture and history.",
        "Photosynthesis is the process by which plants use sunlight to create energy."
    ]
    
    print("Calculating Cosine Similarity (Base vs Fine-Tuned)...")
    similarities = []
    
    for text in audit_sentences:
        # 1. Get Fine-Tuned Embedding
        # Instead of model.enable_adapters(), we use the PEFT context manager if available, 
        # or just assume the current state is the fine-tuned state.
        try:
             model.enable_adapters()
        except:
             pass # If it fails, we assume adapters are already active or merged
             
        ft_emb = get_sentence_embedding(model, tokenizer, text)
        
        # 2. Get Base Model Embedding
        # We try to disable adapters. If we can't, we skip this metric to avoid crashing.
        try:
            with model.disable_adapter():
                base_emb = get_sentence_embedding(model, tokenizer, text)
            
            sim = cosine_similarity(ft_emb, base_emb).item()
            similarities.append(sim)
            print(f"  Sentence: '{text[:40]}...' | Similarity: {sim:.4f}")
        except Exception as e:
            print(f"  Could not separate Base/Adapter embeddings: {e}")
            similarities.append(1.0) # Fallback

    avg_similarity = np.mean(similarities)
    print(f"Average Domain Representation Consistency: {avg_similarity:.4f}")
    
    print("\nCalculating General Knowledge Retention (Perplexity)...")
    encodings = tokenizer("\n\n".join(general_sentences), return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model(encodings.input_ids, labels=encodings.input_ids)
        general_loss = outputs.loss
    
    general_perplexity = torch.exp(general_loss).item()
    print(f"Perplexity on General Topics: {general_perplexity:.2f}")

    return avg_similarity, general_perplexity

analyze_model_performance(model, tokenizer)

--- Starting Advanced Analysis ---

Calculating Cosine Similarity (Base vs Fine-Tuned)...
  Sentence: 'The audit committee is responsible for o...' | Similarity: 0.7966
  Sentence: 'Material misstatements can arise from fr...' | Similarity: 0.7899
  Sentence: 'We conducted our audit in accordance wit...' | Similarity: 0.7636
Average Domain Representation Consistency: 0.7833

Calculating General Knowledge Retention (Perplexity)...
Perplexity on General Topics: 5.72


(np.float64(0.7833432952562968), 5.718234062194824)