<a href="https://colab.research.google.com/github/Blopinpg1/finetuning-_with_loRA/blob/main/finetuning__with_loRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#SETUP AND CONFIG

In [None]:
# @title Install the Dependencies and Set Everything Up {"display-mode": "form"}

!pip install transformers datasets accelerate bitsandbytes -q

# - transformers: For models and tokenizers
# - datasets: To easily load and process our training data
# - accelerate: A library from Hugging Face to simplify training on any infrastructure (like the Colab GPU)
# - bitsandbytes: For quantization to make training more memory-efficient

# Import the required libraries
import pprint
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    pipeline,
    logging
)
from datasets import load_dataset
from google.colab import output
import pprint
import peft

# Suppress verbose output from transformers (only errors are reported)
logging.set_verbosity_error()

# Clear the Colab cell output produced so far (pip install / import noise)
output.clear()

print("ü§ò The setup is complete.")

#FINDING AND PREPARING THE DATA

In [None]:
from datasets import load_dataset

# Load the training split of the Nepali news dataset from the Hugging Face Hub.
dataset = load_dataset("Someman/news_nepali", split="train")
# Bare expressions below are for interactive inspection in the notebook.
dataset

# Peek at the first row; format_prompt reads its "article" and
# "article_summary" fields.
dataset[0]


In [None]:
def format_prompt(example):
    """Build the Llama-2 chat-style training string for one dataset row.

    Args:
        example: A mapping with an ``"article"`` entry (the news text) and an
            ``"article_summary"`` entry (its reference summary).

    Returns:
        A dict with a single ``"text"`` key. The value is the instruction and
        the article wrapped in ``<s>[INST] ... [/INST]``, followed by the
        summary and a closing ``</s>`` marker.
    """
    # NOTE(review): this literal looks like mis-decoded UTF-8 Devanagari;
    # confirm the file encoding. It is kept exactly as found.
    instruction = "‡§Ø‡•Ä ‡§®‡•á‡§™‡§æ‡§≤‡•Ä ‡§∏‡§Æ‡§æ‡§ö‡§æ‡§∞‡§ï‡•ã ‡§∏‡§Ç‡§ï‡•ç‡§∑‡§ø‡§™‡•ç‡§§ ‡§∏‡§æ‡§∞ ‡§≤‡•á‡§ñ‡•ç‡§®‡•Å‡§π‡•ã‡§∏‡•ç:"

    # User turn: the instruction, a blank line, then the article body.
    user_turn = f"{instruction}\n\n{example['article']}"
    # Assistant turn: the reference summary the model should learn to emit.
    answer = example["article_summary"]

    return {"text": f"<s>[INST] {user_turn} [/INST] {answer} </s>"}



# Apply format_prompt to every row; each row gains a "text" field holding the
# full training prompt.
formatted_dataset = dataset.map(format_prompt)

# This is what the first example will look like.
print(formatted_dataset[0]['text'])


# Loading the Pre-Trained Model and Tokenizer

In [None]:
# Hugging Face Hub id of the base chat model that will be fine-tuned
model_name = "NousResearch/Llama-2-7b-chat-hf"

# Quantization configuration to load the model in 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",  # use the NF4 4-bit quantization type
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
)

def get_model():
    """Load the quantized base model and ready it for training.

    Reads the module-level ``model_name`` and ``bnb_config`` and, as a side
    effect, clears the Colab cell output once loading has finished.

    Returns:
        The causal-LM loaded with the 4-bit quantization config, with
        ``config.use_cache`` set to ``False``.
    """
    load_kwargs = {
        "quantization_config": bnb_config,
        "trust_remote_code": True,
    }
    quantized_model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    # Disable the generation cache to prepare the model for training.
    quantized_model.config.use_cache = False

    # Wipe the download/progress output from the notebook cell.
    output.clear()

    return quantized_model

# Load the tokenizer that belongs to the base model

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Set the padding token to be the same as the end-of-sequence token.
# This is a common practice for decoder-only models.
# NOTE(review): with pad == eos, a collator that masks padding positions in
# the labels would presumably also mask the real end-of-sequence token —
# confirm this is acceptable for the summarization task.
tokenizer.pad_token = tokenizer.eos_token

# Show which token is now used for padding (it is the EOS token)
print("[EOS]", tokenizer.pad_token)

In [None]:
# Batch collator for causal language modelling (mlm=False turns off
# masked-language-model style masking).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Configure LoRA
lora_config = peft.LoraConfig(
    r=8,  # Rank of the update matrices.
    lora_alpha=32,  # Scaling factor for the LoRA weights.
    lora_dropout=0.05,  # Dropout probability for LoRA layers.
    bias="none",  # Bias type (none, all, or lora_only).
    task_type="CAUSAL_LM",  # Task type (e.g., CAUSAL_LM for language generation).
    # fan_in_fan_out is deliberately left at its default (False): it is only
    # meant for layers that store weights as (fan_in, fan_out), such as GPT-2's
    # Conv1D, whereas this Llama model uses regular Linear projections.
)

# Add LoRA adapters to the model.
# The quantized base model is first passed through
# prepare_model_for_kbit_training, PEFT's documented preparation step for
# training on top of k-bit quantized weights. The call is kept inline so no
# extra global reference to the base model survives the later `del model`.
model = peft.get_peft_model(
    peft.prepare_model_for_kbit_training(get_model()),
    lora_config,
)

# Print the trainable parameters to see the effect of LoRA
model.print_trainable_parameters()

# Tokenize the dataset of newly formatted prompt strings

In [None]:
def tokenize_nepali_dataset(examples, max_length=256):
    """Tokenize the formatted instruction-response text for LLaMA/LoRA training.

    Uses the module-level ``tokenizer``.

    Args:
        examples: A batch of examples from ``formatted_dataset``; its
            ``'text'`` entry holds the formatted prompt strings.
        max_length: Fixed sequence length every example is padded or
            truncated to. Defaults to 256, the value previously hard-coded;
            adjust it to fit your GPU memory.

    Returns:
        The tokenizer output for the batch (``input_ids`` and
        ``attention_mask`` for the model).
    """
    return tokenizer(
        examples["text"],        # the formatted [INST] ... [/INST] text
        padding="max_length",    # pad all sequences to max_length
        truncation=True,         # truncate sequences longer than max_length
        max_length=max_length,   # fixed sequence length
    )

# Tokenize the entire dataset
tokenized_dataset = formatted_dataset.map(tokenize_nepali_dataset, batched=True)

# Train on at most the first 1000 examples. The size is clamped so that a
# dataset with fewer rows does not make select() fail on out-of-range indices.
subset_size = min(1000, len(tokenized_dataset))
small_dataset = tokenized_dataset.select(range(subset_size))


# Optional: inspect the first example
import pprint
pprint.pp(tokenized_dataset[0], compact=True)


#TRAINING ARGUMENTS

In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./nepali-lora-summarization",  # Save directory for your LoRA model
    num_train_epochs=1,                        # Train for 1 epoch (can adjust)
    per_device_train_batch_size=1,             # 1 example per GPU (safe for 7B in 4-bit)
    gradient_accumulation_steps=4,             # Accumulate 4 steps -> effective batch size of 4 per device
    learning_rate=2e-4,                        # LoRA-friendly learning rate
    fp16=True,                                 # Mixed precision for faster training
    logging_steps=50,                          # Log every 50 steps
    save_steps=500,                            # Save checkpoint every 500 steps
    save_total_limit=2,                        # Keep only last 2 checkpoints
    report_to="none",                          # Disable logging to W&B or other services
    remove_unused_columns=True,                # Drop dataset columns the model does not use
    # Optional scheduler settings, currently disabled:
    # lr_scheduler_type="cosine",
    # warmup_steps=50,

)

from transformers import Trainer

# Wire the model, data and arguments together.
# NOTE(review): newer transformers releases prefer `processing_class` over the
# `tokenizer` argument — confirm against the installed version.
trainer = Trainer(
    model=model,                     # LoRA-wrapped model
    args=training_args,               # Training arguments from above
    train_dataset=small_dataset,      # Subset of the tokenized Nepali dataset
    tokenizer=tokenizer,              # Needed for saving the model correctly
    data_collator=data_collator       # Prepares batches
)


#FINE TUNING

In [None]:
# Let's start fine-tuning!
print("üöÄ Starting fine-tuning‚Ä¶")
trainer.train()
print("‚úÖ Fine-tuning complete!")

# This saves the final model and tokenizer to the output directory.
# The same directory is read back later to load the LoRA adapter for inference.
# NOTE(review): since `model` is a PEFT wrapper, presumably only the adapter
# weights are written here, not the full base model — verify.
final_model_dir = "./nepali-lora-summarization-final"
trainer.save_model(final_model_dir)
print(f"Model saved to {final_model_dir}")

# Clear model and trainer to free up GPU memory before inference
del model
del trainer
import gc
# Collect unreachable Python objects first, then ask PyTorch to release its
# cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()
print("GPU memory cleared after training.")

#TESTING OUR MODEL

In [None]:
from peft import PeftModel
import torch
import gc # Import gc for garbage collection

# Original article to be summarized
original_article = "‡§∏‡§§‡•ç‡§§‡§æ‡§∞‡•Å‡§¢ ‡§®‡•á‡§ï‡§™‡§æ (‡§Æ‡§æ‡§ì‡§µ‡§æ‡§¶‡•Ä ‡§ï‡•á‡§®‡•ç‡§¶‡•ç‡§∞)‡§≤‡•á ‡§ö‡•à‡§§ ‡•ß‡•ß ‡§ó‡§§‡•á (‡§∏‡•ã‡§Æ‡§¨‡§æ‡§∞)‡§≠‡§ø‡§§‡•ç‡§∞ ‡§Æ‡§®‡•ç‡§§‡•ç‡§∞‡§ø‡§™‡§∞‡§ø‡§∑‡§¶‡•ç‡§≤‡§æ‡§à ‡§™‡•Ç‡§∞‡•ç‡§£‡§§‡§æ ‡§¶‡§ø‡§®‡•á ‡§®‡§ø‡§∞‡•ç‡§£‡§Ø ‡§ó‡§∞‡•á‡§ï‡•ã ‡§õ‡•§ ‡§∂‡§®‡§ø‡§¨‡§æ‡§∞ ‡§¶‡§ø‡§â‡§Å‡§∏‡•ã ‡§™‡•á‡§∞‡§ø‡§∏‡§°‡§æ‡§Å‡§°‡§æ‡§Æ‡§æ ‡§¨‡§∏‡•á‡§ï‡•ã ‡§™‡§æ‡§∞‡•ç‡§ü‡•Ä ‡§™‡§¶‡§æ‡§ß‡§ø‡§ï‡§æ‡§∞‡•Ä ‡§¨‡•à‡§†‡§ï‡§≤‡•á ‡§â‡§ï‡•ç‡§§ ‡§®‡§ø‡§∞‡•ç‡§£‡§Ø ‡§ó‡§∞‡•á‡§ï‡•ã ‡§π‡•ã‡•§ ‡§Æ‡§æ‡§ì‡§µ‡§æ‡§¶‡•Ä ‡§ï‡•á‡§®‡•ç‡§¶‡•ç‡§∞‡§ï‡•ã ‡§ï‡•á‡§®‡•ç‡§¶‡•ç‡§∞‡•Ä‡§Ø ‡§ï‡§æ‡§∞‡•ç‡§Ø‡§æ‡§≤‡§Ø‡§ï‡§æ ‡§Ö‡§®‡•Å‡§∏‡§æ‡§∞ ‡§™‡§æ‡§∞‡•ç‡§ü‡•Ä‡§≤‡•á ‡§∏‡•ã‡§Æ‡§¨‡§æ‡§∞‡§∏‡§Æ‡•ç‡§Æ ‡§Æ‡§®‡•ç‡§§‡•ç‡§∞‡§ø‡§™‡§∞‡§ø‡§∑‡§¶‡•ç ‡§µ‡§ø‡§∏‡•ç‡§§‡§æ‡§∞ ‡§ó‡§∞‡•ç‡§®‡•á ‡§î‡§™‡§ö‡§æ‡§∞‡§ø‡§ï ‡§®‡§ø‡§∞‡•ç‡§£‡§Ø ‡§ó‡§∞‡•á‡§ï‡•ã ‡§∏‡§ö‡§ø‡§µ ‡§¶‡•á‡§µ‡•á‡§®‡•ç‡§¶‡•ç‡§∞ ‡§™‡•å‡§°‡•á‡§≤‡§≤‡•á ‡§ú‡§æ‡§®‡§ï‡§æ‡§∞‡•Ä ‡§¶‡§ø‡§è‡•§ ‡§∏‡§ö‡§ø‡§µ ‡§™‡•å‡§°‡•á‡§≤‡§≤‡•á ‡§™‡•ç‡§∞‡§¶‡•á‡§∂ ‡§∏‡§∞‡§ï‡§æ‡§∞‡§≤‡§æ‡§à ‡§∏‡§ô‡•ç‡§ò‡•Ä‡§Ø ‡§∏‡§∞‡§ï‡§æ‡§∞‡§ï‡•ã ‡§®‡•Ä‡§§‡§ø ‡§§‡§•‡§æ ‡§ï‡§æ‡§∞‡•ç‡§Ø‡§ï‡•ç‡§∞‡§Æ‡§∏‡§Å‡§ó ‡§ú‡•ã‡§°‡•á‡§∞ ‡§ó‡§§‡§ø‡§∂‡§ø‡§≤ ‡§¨‡§®‡§æ‡§â‡§®‡•Å‡§™‡§∞‡•ç‡§®‡•á‡§Æ‡§æ ‡§®‡•á‡§§‡§æ‡§π‡§∞‡•Å‡§≤‡•á ‡§ú‡•ã‡§° ‡§¶‡§ø‡§è‡§ï‡•ã ‡§ú‡§æ‡§®‡§ï‡§æ‡§∞‡•Ä ‡§¶‡§ø‡§Å‡§¶‡•à ‡§™‡§æ‡§∞‡•ç‡§ü‡•Ä ‡§®‡•á‡§§‡§æ‡§π‡§∞‡•Å‡§≤‡•á ‡§™‡•ç‡§∞‡§ß‡§æ‡§®‡§Æ‡§®‡•ç‡§§‡•ç‡§∞‡•Ä ‡§™‡•Å‡§∑‡•ç‡§™‡§ï‡§Æ‡§≤ ‡§¶‡§æ‡§π‡§æ‡§≤‡§ï‡•ã ‡§ß‡•ç‡§Ø‡§æ‡§®‡§æ‡§ï‡§∞‡•ç‡§∑‡§£ ‡§ó‡§∞‡§æ‡§è‡§ï‡§æ ‡§õ‡§®‡•ç‡•§ ‡§™‡§æ‡§∞‡•ç‡§ü‡•Ä ‡§∞ ‡§∏‡§∞‡§ï‡§æ‡§∞‡§¨‡•Ä‡§ö‡§ï‡•ã ‡§ñ‡§æ‡§°‡§≤ ‡§ò‡§ü‡§æ‡§â‡§® ‡§™‡§æ‡§∞‡•ç‡§ü‡•Ä ‡§Ö‡§ß‡•ç‡§Ø‡§ï‡•ç‡§∑ ‡§°‡§æ.‡§≤‡•á ‡§™‡§®‡§ø ‡§™‡•ç‡§∞‡§Ø‡§æ‡§∏ ‡§ó‡§∞‡•á‡§ï‡§æ ‡§õ‡§®‡•ç‡•§"

# Define the instruction for summarization as used in training
instruction = "‡§Ø‡•Ä ‡§®‡•á‡§™‡§æ‡§≤‡•Ä ‡§∏‡§Æ‡§æ‡§ö‡§æ‡§∞‡§ï‡•ã ‡§∏‡§Ç‡§ï‡•ç‡§∑‡§ø‡§™‡•ç‡§§ ‡§∏‡§æ‡§∞ ‡§≤‡•á‡§ñ‡•ç‡§®‡•Å‡§π‡•ã‡§∏‡•ç:"

# Format the prompt for inference according to the training format.
# It mirrors format_prompt but stops right after "[/INST] " so the model
# produces the summary itself.
formatted_prompt = (
    f"<s>[INST] {instruction}\n\n"
    f"{original_article} [/INST] "
)

print("--- Testing the Original Base Model ---")

# Load base model in 4-bit for inference (same quantization config as training)
base_model_for_inference = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    device_map="auto"
)

# Wrap the already-loaded model in a text-generation pipeline.
# NOTE(review): torch_dtype / device_map / trust_remote_code are presumably
# redundant when a loaded model object is passed — confirm and simplify.
base_generator = pipeline(
    "text-generation",
    model=base_model_for_inference, # Pass the loaded 4-bit model object
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

base_out = base_generator(
    formatted_prompt, # Use the formatted prompt here
    max_new_tokens=150,   # Upper bound on the number of generated tokens
    do_sample=False,      # No sampling: deterministic output, better for summarization
)

print("Base model response:")
# Extract only the generated summary part by removing the input prompt
base_generated_text = base_out[0]["generated_text"].replace(formatted_prompt, "").strip()
# Remove the </s> token if present
base_generated_text = base_generated_text.replace("</s>", "").strip()
print(base_generated_text)

# Clear base model from memory before loading fine-tuned model
del base_generator
del base_model_for_inference
gc.collect()
torch.cuda.empty_cache()
print("Base model cleared from GPU memory.")

print("\n--- Testing Our Fine-Tuned Nepali Model ---")

# Load base model with the same quantization config used during training
base_model_for_finetune = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    device_map="auto"
)

# Load LoRA adapters saved after training on top of the base model
fine_tuned_model = PeftModel.from_pretrained(base_model_for_finetune, final_model_dir)

# Merge LoRA into base model for inference
# NOTE(review): the base weights are 4-bit quantized here; merging into
# quantized weights may introduce rounding differences — confirm with PEFT docs.
fine_tuned_model = fine_tuned_model.merge_and_unload()

# Create inference pipeline around the merged model
fine_tuned_generator = pipeline(
    "text-generation",
    model=fine_tuned_model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto"
)

# NOTE(review): the base model was run with max_new_tokens=150, this one with
# 250 — confirm the different budgets are intentional for the comparison.
ft_out = fine_tuned_generator(
    formatted_prompt, # Use the formatted prompt here
    max_new_tokens=250,
    do_sample=False,      # No sampling: deterministic output, as in the base-model run
)

print("Fine-tuned model response:")
# Extract only the generated summary part by removing the input prompt
ft_generated_text = ft_out[0]["generated_text"].replace(formatted_prompt, "").strip()
# Remove the </s> token if present
ft_generated_text = ft_generated_text.replace("</s>", "").strip()
print(ft_generated_text)