# Fine-tuning Mistral-7B-Instruct with QLoRA on YouTube Transcripts

This notebook performs QLoRA fine-tuning on `mistralai/Mistral-7B-Instruct-v0.3` using a dataset derived from YouTube video transcripts.

**Steps:**
1. Installs necessary libraries.
2. Sets up Hugging Face Hub authentication.
3. Loads and prepares the dataset (`train.jsonl` and `test.jsonl`).
4. Configures the QLoRA parameters and loads the base model in 4-bit.
5. Sets up the `SFTTrainer` from the TRL library.
6. Runs the fine-tuning process.
7. Saves the trained LoRA adapter locally.
8. (Optional) Pushes the adapter to the Hugging Face Hub.
9. (Optional) Performs basic evaluation (Perplexity, ROUGE-L).

## 1. Setup & Installs

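Install the required libraries. `bitsandbytes` must be compatible with Colab's GPU environment (usually a T4 or A100); the command below installs the latest releases without pinning, so pin versions explicitly if you hit an incompatibility or need a reproducible setup.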

In [2]:
!pip install -q transformers datasets accelerate peft trl bitsandbytes sentencepiece py7zr torch ninja huggingface_hub evaluate rouge_score pyyaml

## 2. Hugging Face Hub Authentication

Log in to Hugging Face Hub to save the adapter and potentially download gated models. You'll need a User Access Token with `write` permissions.

Get your token here: https://huggingface.co/settings/tokens

In [1]:
from huggingface_hub import login, notebook_login
# Authenticate this session with the Hugging Face Hub before any model or
# adapter transfer happens in later cells.
# Use notebook_login() for interactive login in Colab/Jupyter
# or login("YOUR_HF_TOKEN") if running in a script
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 3. Load and Prepare Dataset

Upload your `train.jsonl` file (generated by `data_gen.py`) and the matching `test.jsonl` split to your Colab session; the loading cell below requires both files. You can do this using the file browser on the left panel.

Alternatively, if you've pushed it to a Hugging Face dataset repository, you can load it directly from there.

In [2]:
import os
import torch
from datasets import load_dataset, DatasetDict # Import DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import evaluate # For ROUGE score
import numpy as np

# --- Configuration ---
# Model and Tokenizer
# Hub ID of the base checkpoint to fine-tune.
base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Dataset paths (ensure these files are uploaded to Colab)
# Both paths are relative to the current working directory.
train_dataset_path = "train.jsonl"
test_dataset_path = "test.jsonl" # Path to the test split

# Option 2: Load from Hugging Face Hub (replace with your repo ID if you pushed the dataset)
# dataset_hub_id = "your_username/your_dataset_repo_name"
# dataset_files = {"train": "train.jsonl", "test": "test.jsonl"}

# QLoRA config
# Pick the 4-bit compute dtype from the hardware instead of hard-coding
# bfloat16: GPUs without bf16 support (e.g. a Colab T4) should compute in
# float16, which also matches the fp16 fallback used for training.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4", # Recommended
    bnb_4bit_compute_dtype=compute_dtype, # bfloat16 when supported, float16 otherwise
    bnb_4bit_use_double_quant=True, # Recommended
)

# LoRA config
# Adapters are attached to the four attention projections only. The MLP
# projections ("gate_proj", "up_proj", "down_proj") are optional extras that
# can be appended to target_modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    r=8,                # LoRA rank
    lora_alpha=16,      # scaling factor
    lora_dropout=0.05,  # dropout applied inside the LoRA layers
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Training arguments
# These module-level values are fed into TrainingArguments / SFTTrainer in a
# later cell.
output_dir = "./mistral-qlora-adapter" # Local directory to save adapter
# 2 samples per device x 8 accumulation steps = 16 samples per optimizer step
# (per device).
per_device_train_batch_size = 2
gradient_accumulation_steps = 8
# num_train_epochs = 1.0 # Can use epochs or max_steps
max_steps = 300 # Adjust based on dataset size and desired training time (~200-400 recommended)
learning_rate = 2e-4
optim = "paged_adamw_32bit" # Recommended optimizer for QLoRA
logging_steps = 25
save_steps = 50 # Save checkpoints periodically
max_grad_norm = 0.3
# NOTE(review): with lr_scheduler_type "constant" the warmup ratio is
# presumably ignored ("constant_with_warmup" would apply it) -- confirm.
warmup_ratio = 0.03
lr_scheduler_type = "constant" # Or "cosine", "linear"
# May be overridden to "no" / None further down when no eval split is loaded.
evaluation_strategy = "steps" # Evaluate during training using the test set
eval_steps = 50             # Evaluate every N steps
# report_to="tensorboard" # Or wandb

# SFT Trainer specific
# NOTE(review): the formatted sample holds instruction + transcript chunk +
# answer, so a 512-token cap may truncate answers if chunks are themselves
# close to 512 tokens -- verify against data_gen.py.
max_seq_length = MAX_CHUNK_TOKENS = 512 # Defined in data_gen.py, ensure consistency
packing = False # Set to True if you want to pack sequences, requires more memory

# Hugging Face Hub repo ID (optional)
# Placeholder value; pushing to the Hub will fail until this is replaced.
hf_hub_repo_id = "your_username/mistral-7b-instruct-youtube-qlora" # CHANGE THIS to your HF username/repo name

# --- Load Dataset ---
# Both splits start out as None so later cells can tell that loading failed.
train_dataset = None
eval_dataset = None

try:
    # Collect whichever of the two required local files is absent.
    missing_files = [
        path
        for path in (train_dataset_path, test_dataset_path)
        if not os.path.exists(path)
    ]
    if missing_files:
        raise FileNotFoundError(f"Dataset file(s) not found. Please upload: {', '.join(missing_files)}")

    print(f"Loading dataset from local files: {train_dataset_path}, {test_dataset_path}")
    # Read both JSONL files in one call; the result is keyed by split name.
    dataset = load_dataset('json', data_files={'train': train_dataset_path, 'test': test_dataset_path})
    train_dataset, eval_dataset = dataset['train'], dataset['test']
    print(f"Datasets loaded: Train size={len(train_dataset)}, Eval size={len(eval_dataset)}")

    # Option to load from the Hub instead of local files:
    #     dataset = load_dataset(dataset_hub_id, data_files=dataset_files)
    #     train_dataset = dataset['train']
    #     eval_dataset = dataset['test']
    #     print(f"Datasets loaded from Hub: Train size={len(train_dataset)}, Eval size={len(eval_dataset)}")

except Exception as e:
    # Best-effort: the error is reported and execution continues with both
    # splits left as None.
    print(f"Error loading dataset: {e}")
    # exit()

# Switch off in-training evaluation when there is nothing to evaluate on;
# otherwise the evaluation_strategy / eval_steps defined earlier are kept.
if eval_dataset is not None:
    print("Evaluation dataset loaded. Evaluation during training is enabled.")
else:
    evaluation_strategy = "no"
    eval_steps = None
    print("Warning: No evaluation dataset loaded. Disabling evaluation during training.")

# --- Format dataset for SFTTrainer ---
# Target layout (Mistral Instruct):
#   <s>[INST] Instruction [/INST] Answer </s>

def format_instruction(sample):
    """Render one dataset record as a single Mistral-Instruct training string.

    Args:
        sample: Mapping with the keys 'instruction', 'input' (the transcript
            chunk) and 'output' (the LLM-generated answer).

    Returns:
        The string "<s>[INST] {instruction}\\n---\\n{input}\\n--- [/INST] {output} </s>".

    Raises:
        KeyError: If any of the three keys is missing from ``sample``.
    """
    # NOTE(review): the literal "<s>" may duplicate the BOS token if the
    # tokenizer also prepends one during tokenization -- confirm.
    question, transcript, answer = (
        sample[key] for key in ("instruction", "input", "output")
    )

    # The '---' separators fence the transcript off from the instruction.
    user_turn = f"{question}\n---\n{transcript}\n---"

    return f"<s>[INST] {user_turn} [/INST] {answer} </s>"

print("Dataset formatting function defined.")
# Preview the first formatted training record, when one is available.
has_training_rows = train_dataset is not None and len(train_dataset) > 0
if not has_training_rows:
    print("Train dataset is empty or not loaded, cannot show example.")
else:
    print("\nExample formatted training sample:")
    print(format_instruction(train_dataset[0]))

ModuleNotFoundError: No module named 'datasets'

## 4. Load Model and Tokenizer with QLoRA Config

Load the base model (`Mistral-7B-Instruct-v0.3`) with 4-bit quantization using the `BitsAndBytesConfig`. We also load the corresponding tokenizer.

In [None]:
# Tokenizer: reuse EOS as the padding token and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")

# Base model, quantized to 4-bit through bnb_config.
print(f"Loading base model: {base_model_name} with 4-bit quantization...")
model_load_options = {
    "quantization_config": bnb_config,
    "device_map": "auto",       # Automatically map layers to GPU
    "trust_remote_code": True,  # Necessary for some models
    # torch_dtype is not passed here; the compute dtype is set in bnb_config
}
model = AutoModelForCausalLM.from_pretrained(base_model_name, **model_load_options)
print("Base model loaded.")

# --- Sanity Check: Find LoRA Target Modules ---
# Uncomment the following lines to see all linear layer names
# This helps verify the `target_modules` in LoraConfig
# print("\nModel Architecture:")
# print(model)
# print("\nFinding potential LoRA target modules (Linear layers):")
# linear_layers = set()
# for name, module in model.named_modules():
#     if isinstance(module, torch.nn.Linear):
#          #Focus on layers typically targeted by LoRA in transformers
#          if any(layer_name in name for layer_name in ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']):
#              # Get the last part of the name (e.g., 'q_proj')
#              layer_name = name.split('.')[-1]
#              linear_layers.add(layer_name)
# print(f"Found linear layer names: {linear_layers}")
# print(f"Using target modules: {peft_config.target_modules}")
# print("Ensure these match the typical layers for Mistral architecture.")

# --- Prepare model for k-bit training ---
# Cast layer norms and head to fp32 for stability
# model = prepare_model_for_kbit_training(model) # TRL's SFTTrainer handles this

# --- Create PEFT Model ---
# Note: SFTTrainer can also handle PEFT model creation if peft_config is passed
# Creating it explicitly here for clarity
# print("\nApplying LoRA adapter to the base model...")
# model = get_peft_model(model, peft_config)
# print("LoRA adapter applied.")
# model.print_trainable_parameters()

# Configure cache usage (optional, but recommended)
# Turn the cache flag off on the model config for the training run.
model.config.use_cache = False # Important for training stability with gradient checkpointing
# model.config.pretraining_tp = 1 # If you face tensor parallelism issues
print("Model prepared for training.")

## 5. Configure SFTTrainer

We use the `SFTTrainer` from the TRL library, which simplifies the process of supervised fine-tuning for instruction-following tasks.

In [None]:
# Decide the mixed-precision mode BEFORE building TrainingArguments.
# TrainingArguments validates bf16/fp16 against the hardware when it is
# constructed, and SFTTrainer sets itself up from those arguments, so a
# fallback applied afterwards is too late to help on GPUs without bf16 (e.g. T4).
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
use_fp16 = cuda_available and not use_bf16

if use_bf16:
    print("\nBF16 is supported. Training will use BF16.")
elif use_fp16:
    print("\nBF16 is NOT supported. Falling back to FP16; ensure compute_dtype in BitsAndBytesConfig is appropriate (e.g., float16).")
else:
    print("\nNo CUDA device detected. Mixed-precision training is disabled.")

# NOTE(review): the keyword names below (evaluation_strategy here, and
# max_seq_length / tokenizer / packing on SFTTrainer) depend on the installed
# transformers / TRL versions -- confirm against the versions pip resolved.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    # num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    fp16=use_fp16, # Fallback when the GPU lacks bf16 support
    bf16=use_bf16, # Preferred on GPUs that support it (e.g. A100)
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True, # Speeds up training by grouping similar length sequences
    lr_scheduler_type=lr_scheduler_type,
    # Evaluation settings ("no" / None when no eval dataset was loaded)
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    # report_to=report_to,
    # Pushing to Hub options
    # push_to_hub=True, # Set to True to push model/adapter during training
    # hub_model_id=hf_hub_repo_id, # Repository name on Hugging Face Hub
    # hub_strategy="checkpoint", # Push on every save
    # hub_token=os.getenv("HF_TOKEN") # Use token stored in environment or login()
)

print("Training Arguments configured.")

# The LoRA adapter is created by the trainer from peft_config, and each record
# is turned into text by format_instruction.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset, # Pass evaluation dataset here
    peft_config=peft_config, # Pass PEFT config here
    # dataset_text_field="text", # Use if you pre-formatted into a 'text' column
    formatting_func=format_instruction, # Pass the formatting function
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

print("SFTTrainer initialized.")
# model.print_trainable_parameters()

## 6. Start Fine-tuning

Launch the training process. This will take some time depending on the dataset size, `max_steps`, and the Colab GPU assigned (a T4 is slower than an A100). The default settings aim to finish in under 2 hours on a T4.

In [None]:
# Run the fine-tuning loop; the returned object carries the training metrics.
print("Starting fine-tuning...")
train_result = trainer.train()
print("Fine-tuning finished.")

# --- Log Training Metrics ---
# Add the training-set size to the metrics dict, then hand it to the trainer's
# log_metrics / save_metrics helpers under the "train" split name.
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
print("Training metrics saved.")

## 7. Save Adapter Locally

Save the trained QLoRA adapter weights to the specified output directory.

In [None]:
# Write the trained adapter into output_dir.
print(f"Saving LoRA adapter to {output_dir}...")
trainer.save_model(output_dir) # Saves the adapter config and weights
print(f"Adapter saved locally to {output_dir}")

# Optional: Save the tokenizer as well (good practice)
# It goes into the same directory as the adapter.
tokenizer.save_pretrained(output_dir)
print(f"Tokenizer saved locally to {output_dir}")

## 8. (Optional) Push Adapter to Hugging Face Hub

Push the trained adapter and tokenizer to your Hugging Face Hub repository for easy sharing and loading later.

In [None]:
# hf_hub_repo_id must point at your own repository before this cell is run.
push_to_hub = True # Set to False if you don't want to push

if not push_to_hub:
    print("Skipping push to Hugging Face Hub.")
else:
    print(f"Pushing adapter and tokenizer to Hugging Face Hub repo: {hf_hub_repo_id}...")
    try:
        # Adapter first, then the tokenizer, both into the same repo.
        # NOTE(review): `use_auth_token` may be deprecated in favour of `token`
        # in newer library releases -- confirm against the installed version.
        trainer.model.push_to_hub(hf_hub_repo_id, use_auth_token=True)
        tokenizer.push_to_hub(hf_hub_repo_id, use_auth_token=True)
        print("Successfully pushed to Hub.")
    except Exception as push_error:
        # Best-effort: report the failure and let the notebook continue.
        print(f"Error pushing to Hub: {push_error}")

## 9. (Optional) Evaluation

Perform evaluation on the held-out test set (if created) to calculate Perplexity and ROUGE scores.

In [None]:
import math

# Evaluate the fine-tuned adapter on the held-out split:
#   1. perplexity from the trainer's evaluation loss;
#   2. ROUGE between greedy generations and the reference answers.
# Everything, including the ROUGE computation, stays inside the
# `if eval_dataset:` branch so that the `else` below really means
# "no evaluation dataset".
if eval_dataset:
    print("\nStarting evaluation on the test set...")

    # Eval-only metrics are collected here so that the file written for the
    # "eval" split does not get filled with the training metrics dict.
    eval_metrics = {}

    # --- Perplexity ---
    try:
        eval_metrics = trainer.evaluate()
        perplexity = math.exp(eval_metrics["eval_loss"])
        print(f"Evaluation Loss: {eval_metrics['eval_loss']:.4f}")
        print(f"Perplexity: {perplexity:.4f}")
        eval_metrics["eval_perplexity"] = perplexity
        # Also mirrored into the training-run metrics dict, as before.
        metrics["eval_perplexity"] = perplexity
        trainer.log_metrics("eval", eval_metrics)
        trainer.save_metrics("eval", eval_metrics)
    except Exception as e:
        print(f"Could not calculate perplexity during evaluation: {e}")

    # --- ROUGE Score (More involved for generative tasks) ---
    # Requires generating predictions and comparing them to references, so
    # responses for the eval set are generated manually here.
    print("\nCalculating ROUGE score (this may take a while)...")
    rouge_scorer = evaluate.load('rouge')

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # Make sure dropout (including LoRA dropout) is off while generating.
    trainer.model.eval()

    all_preds = []
    all_labels = []

    eval_batch_size = 4 # Adjust based on GPU memory
    num_eval_samples = len(eval_dataset)

    # Batched generation with a decoder-only model needs LEFT padding so each
    # prompt ends right where generation starts. The tokenizer was set to
    # right padding for training, so switch temporarily and always restore.
    original_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        for i in range(0, num_eval_samples, eval_batch_size):
            # Slicing a Dataset yields a dict of columns (name -> list of
            # values), not a list of row dicts, so rows are rebuilt with zip.
            batch = eval_dataset[i:i + eval_batch_size]

            # Prompt = everything up to and including [/INST]; matches the
            # layout produced by the training formatting function.
            prompts = [
                f"<s>[INST] {instruction}\n---\n{context}\n--- [/INST]"
                for instruction, context in zip(batch['instruction'], batch['input'])
            ]
            labels = list(batch['output']) # The reference answers

            inputs = tokenizer(
                prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_seq_length,
            ).to(device)

            with torch.no_grad():
                outputs = trainer.model.generate(
                    **inputs,
                    max_new_tokens=max_seq_length, # Allow generating up to max length
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    do_sample=False, # Greedy decoding
                )

            # Decode only the newly generated tokens. With left padding every
            # row's prompt occupies the first `prompt_len` positions.
            prompt_len = inputs["input_ids"].shape[1]
            decoded = tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)

            all_preds.extend(text.strip() for text in decoded)
            all_labels.extend(labels)

            if (i // eval_batch_size + 1) % 10 == 0: # Print progress every 10 batches
                done = min(i + eval_batch_size, num_eval_samples)
                print(f"Generated predictions for {done}/{num_eval_samples} samples...")
    finally:
        tokenizer.padding_side = original_padding_side

    # Compute ROUGE
    try:
        rouge_results = rouge_scorer.compute(
            predictions=all_preds,
            references=all_labels
        )
        print("\nROUGE Scores:")
        print(rouge_results)

        # Add ROUGE-L to metrics
        if 'rougeL' in rouge_results:
            metrics["eval_rougeL"] = rouge_results['rougeL']
            eval_metrics["eval_rougeL"] = rouge_results['rougeL']
            trainer.log_metrics("eval", {"rougeL": rouge_results['rougeL']})
            # Save the combined eval metrics (loss/perplexity plus ROUGE-L).
            trainer.save_metrics("eval", eval_metrics)

    except Exception as e:
        print(f"Could not compute ROUGE scores: {e}")
        print("Example Prediction:", all_preds[0] if all_preds else "N/A")
        print("Example Label:", all_labels[0] if all_labels else "N/A")

else:
    print("\nNo evaluation dataset provided. Skipping evaluation.")

print("\n--- Training and Evaluation Complete ---")
print(f"Adapter saved in: {output_dir}")
if push_to_hub and hf_hub_repo_id:
    print(f"Adapter pushed to: https://huggingface.co/{hf_hub_repo_id}")

# Clean up memory (important in Colab)
# del model
# del trainer
# import gc
# torch.cuda.empty_cache()
# gc.collect()

## 10. Download Adapter

If you want to download the adapter directly from Colab, you can zip the output directory.

In [None]:
import shutil

# The archive is named after the last path component of output_dir;
# make_archive appends the ".zip" suffix itself.
adapter_zip_name = os.path.basename(output_dir)

if not os.path.isdir(output_dir):
    print(f"Output directory {output_dir} not found. Cannot create zip file.")
else:
    print(f"Zipping adapter directory: {output_dir} -> {adapter_zip_name}.zip")
    shutil.make_archive(adapter_zip_name, 'zip', output_dir)
    print(f"Adapter zipped to {adapter_zip_name}.zip")
    # The zip file can now be downloaded from the Colab file browser.