<a href="https://colab.research.google.com/github/DhrubaAdhikary/GEN_AI_DEMO/blob/master/FineTuning_SLM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import os

from google.colab import drive

# Mount Google Drive so the output directory below lives on Drive.
drive.mount("/content/drive")

# Root directory that the rest of the notebook writes into.
BASE_DIR = "/content/drive/MyDrive/qlora-shakespeare"
os.makedirs(BASE_DIR, exist_ok=True)

print(f"Saving everything to: {BASE_DIR}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saving everything to: /content/drive/MyDrive/qlora-shakespeare


In [21]:
!pip install -qU \
    transformers \
    datasets \
    peft \
    accelerate \
    bitsandbytes \
    trl \
    sentencepiece


In [22]:
!git clone https://github.com/cobanov/shakespeare-dataset.git \
    /content/drive/MyDrive/shakespeare-dataset


Cloning into '/content/drive/MyDrive/shakespeare-dataset'...
remote: Enumerating objects: 56, done.[K
remote: Counting objects: 100% (56/56), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 56 (delta 4), reused 3 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (56/56), 2.08 MiB | 5.21 MiB/s, done.
Resolving deltas: 100% (4/4), done.
Updating files: 100% (43/43), done.


In [23]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

MODEL_NAME = "tiiuae/falcon-rw-1b"

# Tokenizer: fall back to the EOS token for padding when no pad token is defined.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the base causal LM with the quantization config, letting device_map="auto"
# decide placement.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# Training-time settings: gradient checkpointing on, KV cache off.
model.gradient_checkpointing_enable()
model.config.use_cache = False


Loading weights:   0%|          | 0/292 [00:00<?, ?it/s]



In [24]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Run PEFT's k-bit preparation helper on the quantized model before adding adapters.
model = prepare_model_for_kbit_training(model)

# LoRA hyper-parameters; adapters are attached to the "query_key_value" modules only.
lora_settings = dict(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
peft_config = LoraConfig(**lora_settings)

# Wrap the model with the adapters and report how many parameters are trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


trainable params: 1,572,864 || all params: 1,416,220,672 || trainable%: 0.1111


In [26]:
import glob
from datasets import Dataset

# Load each play separately
repo_path = "/content/drive/MyDrive/shakespeare-dataset/text"

# glob returns paths in arbitrary (filesystem-dependent) order; sort them so the
# order in which plays are later concatenated is the same on every run.
text_files = sorted(glob.glob(repo_path + "/*.txt"))

# Fail loudly here instead of building an empty dataset that breaks much later.
if not text_files:
    raise FileNotFoundError(
        f"No .txt files found under {repo_path}; check that the dataset was cloned."
    )

texts = []

for file_path in text_files:
    with open(file_path, "r", encoding="utf-8") as f:
        texts.append(f.read())

print("Loaded", len(texts), "plays")

# One row per play, with the full play text in the "text" column.
dataset = Dataset.from_dict({"text": texts})


Loaded 42 plays


In [27]:
def tokenize_function(example):
    """Tokenize the "text" column of a batch, keeping every token (no truncation)."""
    raw_texts = example["text"]
    return tokenizer(raw_texts, truncation=False)


# Tokenize in batches and drop the raw "text" column so only tokenizer outputs remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


Map:   0%|          | 0/42 [00:00<?, ? examples/s]

In [28]:
from itertools import chain

# Number of tokens in each training example.
block_size = 512


def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    Args:
        examples: Mapping with "input_ids" and "attention_mask", each a list of
            per-text token lists (the shape a batched ``map`` call passes in).

    Returns:
        Dict with "input_ids", "attention_mask" and "labels"; each is a list of
        blocks of exactly ``block_size`` items. The trailing remainder that does
        not fill a whole block is dropped. "labels" holds the same blocks as
        "input_ids".
    """
    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # growing accumulator for every text and is quadratic in total token count.
    flat_input_ids = list(chain.from_iterable(examples["input_ids"]))
    flat_attention = list(chain.from_iterable(examples["attention_mask"]))

    # Largest multiple of block_size that fits in the concatenated stream.
    total_length = (len(flat_input_ids) // block_size) * block_size
    starts = range(0, total_length, block_size)

    input_ids = [flat_input_ids[start:start + block_size] for start in starts]
    attention_mask = [flat_attention[start:start + block_size] for start in starts]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": input_ids.copy()
    }

# Regroup the tokenized plays into fixed-size blocks, batch by batch.
lm_dataset = tokenized_dataset.map(group_texts, batched=True)

print(lm_dataset)


Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3118
})


In [29]:
# Hold out 5% of the blocks for evaluation.
# - A fixed seed makes the split identical on every run.
# - The result gets its own name instead of overwriting `lm_dataset`, so this cell
#   can be re-run without first re-creating the un-split dataset.
SPLIT_SEED = 42
split_dataset = lm_dataset.train_test_split(test_size=0.05, seed=SPLIT_SEED)

train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


In [30]:


# Report the size of each split.
print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")


Train samples: 2962
Eval samples: 156


In [32]:
from transformers import TrainingArguments

# Training configuration. Checkpoints go to BASE_DIR every 500 steps, keeping at
# most 3 of them.
# NOTE(review): no evaluation strategy is set here, so the eval split presumably
# is only used if evaluation is triggered explicitly — confirm this is intended.
training_args = TrainingArguments(
    output_dir=BASE_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    max_steps=2000,
    logging_steps=50,
    save_steps=500,
    save_total_limit=3,
    fp16=True,
    optim="paged_adamw_8bit",
    # `warmup_ratio` is deprecated in favour of `warmup_steps`;
    # 60 steps is the same 3% of max_steps=2000 that warmup_ratio=0.03 gave.
    warmup_steps=60,
    lr_scheduler_type="cosine",
    report_to="none"
)


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


In [34]:
# `Trainer` is not imported by any other cell shown in this notebook, so import it
# here; without this a fresh-kernel run fails with NameError.
from transformers import Trainer

# Bind the model, the training configuration and both dataset splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

In [35]:
import os
from transformers.trainer_utils import get_last_checkpoint

# Look for the most recent checkpoint in the output directory; None means there is none.
checkpoint = get_last_checkpoint(BASE_DIR)

if checkpoint is None:
    print("Starting fresh training...")
else:
    print("Resuming from:", checkpoint)

# Passing None here is the same as calling train() with no arguments.
trainer.train(resume_from_checkpoint=checkpoint)


Starting fresh training...


  return fn(*args, **kwargs)


Step,Training Loss
50,3.301892
100,3.117101
150,3.054011
200,3.039511
250,3.016397
300,2.989052
350,2.991069
400,2.953282
450,2.938145
500,2.946184


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [36]:
# Store the trained model and its tokenizer together under BASE_DIR/final_adapter.
FINAL_DIR = os.path.join(BASE_DIR, "final_adapter")
os.makedirs(FINAL_DIR, exist_ok=True)

# NOTE(review): `model` was wrapped by get_peft_model earlier, so this presumably
# writes only the LoRA adapter files rather than the full base weights — confirm.
model.save_pretrained(FINAL_DIR)
# Kept as the cell's last expression so its return value is displayed as output.
tokenizer.save_pretrained(FINAL_DIR)

('/content/drive/MyDrive/qlora-shakespeare/final_adapter/tokenizer_config.json',
 '/content/drive/MyDrive/qlora-shakespeare/final_adapter/tokenizer.json')

In [37]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

MODEL_NAME = "tiiuae/falcon-rw-1b"
FINAL_DIR = "/content/drive/MyDrive/qlora-shakespeare/final_adapter"

# Tokenizer of the base checkpoint; pad with EOS when no pad token is defined.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Same 4-bit quantization settings that were used for training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Quantized base model first, then the saved LoRA adapter on top of it.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)
model = PeftModel.from_pretrained(base_model, FINAL_DIR)

# Switch to evaluation mode for generation.
model.eval()
print("Model loaded successfully.")


Loading weights:   0%|          | 0/292 [00:00<?, ?it/s]



Model loaded successfully.


In [38]:
def generate_text(prompt,
                  max_new_tokens=150,
                  temperature=0.8,
                  top_p=0.9,
                  lm=None,
                  tok=None):
    """Sample a continuation of ``prompt`` and return it as decoded text.

    Args:
        prompt: Text to continue.
        max_new_tokens: Maximum number of tokens to generate after the prompt.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.
        lm: Model to generate with; defaults to the notebook-level ``model``.
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns:
        The decoded first returned sequence, with special tokens skipped.
    """
    # Resolve the defaults at call time so the notebook's current globals are used.
    lm = model if lm is None else lm
    tok = tokenizer if tok is None else tok

    # Put the inputs on the model's own device rather than assuming "cuda".
    inputs = tok(prompt, return_tensors="pt").to(lm.device)

    # Gradients are not needed for generation.
    with torch.no_grad():
        outputs = lm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tok.eos_token_id
        )

    return tok.decode(outputs[0], skip_special_tokens=True)


In [39]:
# Prompt used for the fine-tuned model here and reused for the base-model comparison.
prompt = "To be, or not to be,"
result = generate_text(prompt)

# Header and generated text, each followed by a newline.
print("\n=== GENERATED TEXT ===\n", result, sep="\n")



=== GENERATED TEXT ===

To be, or not to be, a
savior of my soul, and of my neighbor's, and of my own
soul, and of my neighbor's neighbor, I will do the
will of my master, my lord, and my lady, and do the
will of God, and do the will of my neighbor.

PRINCE  Come, come.

QUEEN ELIZABETH
But, God be with you! What news?

PRINCE  You have done your part.

QUEEN ELIZABETH
Then I must go and talk with him.
[She exits.]

PRINCE  O, here's a good man!

FALSTAFF



In [40]:
# Load base model without LoRA
# NOTE(review): this puts another full copy of the base weights on the device next
# to the adapter-wrapped model; check memory headroom on small GPUs.
base_only = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)


def generate_base(prompt, max_new_tokens=150, temperature=0.8, top_p=0.9):
    """Sample a continuation of ``prompt`` from the base model (no LoRA adapter).

    Args:
        prompt: Text to continue.
        max_new_tokens: Maximum number of tokens to generate after the prompt.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.

    Returns:
        The decoded first returned sequence, with special tokens skipped.
    """
    # Put the inputs on the model's own device rather than assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(base_only.device)
    with torch.no_grad():
        outputs = base_only.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


print("=== BASE MODEL ===\n")
print(generate_base(prompt))


Loading weights:   0%|          | 0/292 [00:00<?, ?it/s]



=== BASE MODEL ===

To be, or not to be, that is the question.”
As the question is answered, the answer will change in a very literal sense. In the past, when the question has been answered, the answer has changed in a very literal sense.
“If the earth was flat, we’d all be floating on it!”
“If the earth was flat, we’d all be floating on it!”
“If the earth was flat, we’d all be floating on it!”
“If the earth was flat, we’d all be floating on it!”
If the earth was flat, we’d all be floating on it!
If the earth
