In [1]:
# ✅ Install Dependencies
# Install required libraries for quantized training, model loading, fine-tuning, evaluation, etc.
!pip install -q -U bitsandbytes transformers peft accelerate datasets scipy einops evaluate trl rouge_score

In [2]:
# ✅ Imports
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from trl import SFTTrainer
import torch, time, os
import pandas as pd
import numpy as np
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)

In [3]:
# Switch off wandb logging through its environment flag
os.environ.update({"WANDB_DISABLED": "true"})

In [4]:
# ✅ Load Dataset
# Dialogue-based summarization data; the "train", "validation" and "test"
# splits of this object are consumed further down the notebook.
dataset = load_dataset(path="neil-code/dialogsum-test")

In [5]:
# ✅ Quantization Config
# Load the model in 4-bit precision so it fits in limited memory.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # keep weights in 4-bit form
    bnb_4bit_quant_type="nf4",             # NF4 quantization type
    bnb_4bit_use_double_quant=False,       # double quantization is left off
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
)

In [6]:
# ✅ Load Base Model & Tokenizer
model_name = "microsoft/phi-2"

# Instantiate the pretrained causal LM with the quantization config defined earlier;
# device placement is delegated to device_map="auto".
# NOTE(review): trust_remote_code=True permits code shipped with the model repo to run locally —
# confirm it is still required for this checkpoint.
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)

In [7]:
# Tokenizer matching the base model: slow implementation, left-side padding,
# with BOS/EOS insertion requested.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    trust_remote_code=True,
    padding_side="left",
    add_bos_token=True,
    add_eos_token=True,
)
# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# ✅ Format Data
# Create a prompt-response format suitable for instruction fine-tuning
def create_prompt_formats(sample):
    """Build the instruction-tuning prompt for one example and store it under ``text``.

    The prompt consists of five sections separated by blank lines: a generic
    preamble, the summarization instruction, the dialogue, the reference
    summary (introduced by ``### Output:``) and a closing ``### End`` marker.

    Args:
        sample: Mapping providing ``dialogue`` and ``summary`` entries.

    Returns:
        The same ``sample`` object, updated in place with the ``text`` entry.
    """
    answer_section = f"### Output:\n{sample['summary']}"
    sections = (
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        "### Instruct: Summarize the below conversation.",
        sample['dialogue'],
        answer_section,
        "### End",
    )
    sample["text"] = "\n\n".join(sections)
    return sample

In [9]:
# ✅ Preprocess Functions
from functools import partial

# Get the model's maximum supported sequence length
def get_max_length(model):
    """Return the first truthy sequence-length setting found on ``model.config``.

    The attributes ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` are probed in that order; missing or falsy values are
    skipped.

    Args:
        model: Object exposing a ``config`` attribute.

    Returns:
        The first truthy value found, or 1024 when none of the attributes is set.
    """
    candidates = (
        getattr(model.config, attr_name, None)
        for attr_name in ("n_positions", "max_position_embeddings", "seq_length")
    )
    # Lazy evaluation keeps the original short-circuit: probing stops at the first hit.
    return next((value for value in candidates if value), 1024)

# Tokenize each example
def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the ``text`` entries of a batch, truncating to ``max_length``.

    Args:
        batch: Mapping whose ``text`` entry holds the prompts to tokenize.
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=..., truncation=True)``.
        max_length: Upper bound handed to the tokenizer.

    Returns:
        Whatever the tokenizer call returns.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, max_length=max_length)

# Full dataset preprocessing pipeline
def preprocess_dataset(tokenizer, max_length, seed, dataset):
    """Turn a raw dataset split into shuffled, tokenized training examples.

    Steps, in order: add the prompt ``text`` to every example, tokenize in
    batches while dropping the raw ``id``/``topic``/``dialogue``/``summary``
    columns, keep only examples with fewer than ``max_length`` tokens, and
    shuffle with the given seed.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_batch``.
        max_length: Truncation limit and (exclusive) length filter threshold.
        seed: Seed passed to the final shuffle.
        dataset: Dataset split offering ``map``, ``filter`` and ``shuffle``.

    Returns:
        The processed and shuffled dataset.
    """
    tokenize = partial(preprocess_batch, tokenizer=tokenizer, max_length=max_length)
    raw_columns = ['id', 'topic', 'dialogue', 'summary']
    return (
        dataset
        .map(create_prompt_formats)
        .map(tokenize, batched=True, remove_columns=raw_columns)
        # Examples that reached max_length after truncation are discarded.
        .filter(lambda sample: len(sample["input_ids"]) < max_length)
        .shuffle(seed=seed)
    )

# ✅ Tokenize & Prepare Dataset
seed = 42
max_length = get_max_length(original_model)

# Run the identical pipeline over the train split, then the validation split
train_dataset, eval_dataset = (
    preprocess_dataset(tokenizer, max_length, seed, dataset[split_name])
    for split_name in ("train", "validation")
)

In [10]:
# ✅ Prepare for QLoRA Training
original_model = prepare_model_for_kbit_training(original_model)
# Turn on gradient checkpointing before the model is wrapped with LoRA
original_model.gradient_checkpointing_enable()

# ✅ Apply LoRA Configuration
# LoRA trains a small set of added weights, which keeps fine-tuning memory-efficient
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # modules that receive LoRA adapters
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
)

# Wrap the prepared model with the LoRA configuration
peft_model = get_peft_model(original_model, config)

# ✅ Print Trainable Parameters
def print_number_of_trainable_model_parameters(model):
    """Print how many of ``model``'s parameter elements are trainable.

    A parameter counts as trainable when its ``requires_grad`` flag is truthy.
    The printed line shows the trainable count, the total count and the
    trainable share as a percentage with two decimals.

    Args:
        model: Object whose ``parameters()`` yields items with ``numel()``
            and ``requires_grad``.

    Returns:
        None. The summary is written to standard output.
    """
    trainable = 0
    total = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    print(f"Trainable parameters: {trainable} / {total} ({100 * trainable / total:.2f}%)")

# Report the share of parameters in the LoRA-wrapped model that require gradients
print_number_of_trainable_model_parameters(peft_model)

In [None]:
# ✅ Define Training Arguments
from transformers import DataCollatorForLanguageModeling

# Timestamped output folder so every run writes to its own directory
output_dir = f"/content/peft-dialogue-summary-{int(time.time())}"

training_args = TrainingArguments(
    output_dir=output_dir,
    # --- optimisation ---
    max_steps=1000,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    per_device_train_batch_size=1,   # one sequence per device ...
    gradient_accumulation_steps=4,   # ... accumulated over four steps
    gradient_checkpointing=True,
    group_by_length=True,
    # --- logging / checkpointing ---
    logging_steps=25,
    save_strategy="steps",
    save_steps=25,
    report_to="none",
)

# ✅ Train Model
# Collator configured for causal language modeling (mlm=False)
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=causal_lm_collator,
)
trainer.train()

In [None]:
# ✅ Load Trained PEFT Model
# Attach the trained LoRA adapter from the final checkpoint to a freshly loaded, quantized base model.
# The adapter is attached, not merged: no merge_and_unload() call is made here.
# The checkpoint folder name is derived from training_args.max_steps so it stays in sync with the
# training configuration instead of repeating the step count as a literal.
final_checkpoint_dir = f"{output_dir}/checkpoint-{training_args.max_steps}"

ft_model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=bnb_config),
    final_checkpoint_dir,
    torch_dtype=torch.float16,
    is_trainable=False  # inference only; adapter weights stay frozen
)

# ✅ Inference Function
def gen(model, prompt, max_new_tokens=100):
    """Generate a continuation of ``prompt`` with ``model`` and return the decoded text.

    Relies on the notebook-level ``tokenizer``. The attention mask produced by
    the tokenizer is forwarded to ``generate`` explicitly: the pad token is set
    to the EOS token in this notebook, so the mask cannot be inferred reliably
    from the input ids alone. Inputs are passed by keyword so the call does not
    depend on ``generate`` accepting positional arguments.

    Args:
        model: Model exposing ``device`` and ``generate``.
        prompt: Prompt text (or list of prompt texts) to encode.
        max_new_tokens: Maximum number of tokens to generate beyond the prompt.

    Returns:
        List of decoded strings (prompt plus continuation), one per sequence,
        with special tokens stripped.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [15]:
# ✅ Evaluate on Sample Test Set
from evaluate import load as eval_load

# First ten test examples: dialogues plus their reference summaries
test_head = dataset["test"][:10]
test_dialogues = test_head["dialogue"]
test_summaries = test_head["summary"]

# Load original (non-fine-tuned) base model
base_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

# Generate summaries for both models
# NOTE(review): this prompt differs from the "### Instruct: ..." template that create_prompt_formats
# builds for fine-tuning — confirm the mismatch is intended.
original_summaries, peft_summaries = [], []
for dialogue in test_dialogues:
    prompt = f"Instruct: Summarize the following conversation.\n{dialogue}\nOutput:\n"
    base_text = gen(base_model, prompt, 100)[0]
    tuned_text = gen(ft_model, prompt, 100)[0]
    # Keep only what follows the last "Output:" marker; for the fine-tuned model
    # also cut at the first "###" section marker.
    original_summaries.append(base_text.split("Output:\n")[-1])
    peft_summaries.append(tuned_text.split("Output:\n")[-1].split("###")[0])

# ✅ Create Comparison Table
df = pd.DataFrame({
    "Human Summary": test_summaries,
    "Original Model": original_summaries,
    "PEFT Model": peft_summaries,
})
display(df)

# ✅ Compute ROUGE Scores
rouge = eval_load("rouge")
orig_scores, peft_scores = (
    rouge.compute(predictions=candidate_summaries, references=test_summaries)
    for candidate_summaries in (original_summaries, peft_summaries)
)

# ✅ Print Results
print("Original Model:", orig_scores)
print("PEFT Model:", peft_scores)

# ✅ Print Improvement
# Differences are absolute (fine-tuned minus base), scaled by 100 for display
print("Improvement:")
for metric, base_value in orig_scores.items():
    delta = peft_scores[metric] - base_value
    print(f"{metric}: {delta*100:.2f}%")

In [16]:
!pip install huggingface_hub

In [17]:
# Write the fine-tuned PEFT model and the tokenizer into one shared folder
save_dir = "/content/fine_tuned_phi_model"

peft_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [20]:
import os

from huggingface_hub import create_repo, upload_folder, login

# Log in to Hugging Face.
# The access token is read from the HF_TOKEN environment variable rather than being written into
# the notebook, so it cannot leak when the notebook is shared. When the variable is unset,
# token=None lets login() fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# Define the repository name
repo_name = "ArchitJ6/phi-2-finetune-archit"

# Create the repository on Hugging Face (if not already created).
# exist_ok=True is what makes a re-run safe: without it create_repo raises when the repo already exists.
create_repo(repo_name, exist_ok=True)

# Push the fine-tuned model and tokenizer to Hugging Face
upload_folder(
    folder_path="/content/fine_tuned_phi_model",  # Path to the saved model and tokenizer
    repo_id=repo_name,  # Your Hugging Face model repository name
    repo_type="model"  # Specify the type as model
)
