In [None]:
!pip install torch transformers datasets peft bitsandbytes ipython

In [None]:
!pip install datasets --upgrade

In [None]:
!pip install bitsandbytes --upgrade

In [None]:
!pip install peft --upgrade

In [None]:
# Step 0: Import necessary packages
from typing import Dict, List
from datasets import Dataset, load_dataset, disable_caching
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model
import torch
from torch.utils.data import Dataset
from functools import partial
import copy
from IPython.display import Markdown

# Locate PEFT's helper that readies a quantized model for training.
# Recent PEFT releases expose it as `prepare_model_for_kbit_training`; the
# older `prepare_model_for_int8_training` name is only present in legacy
# versions. The new helper is bound to the legacy name so the rest of the
# notebook can keep calling `prepare_model_for_int8_training(model)`.
try:
    from peft import prepare_model_for_kbit_training as prepare_model_for_int8_training
    USE_INT8_PREP = True
except ImportError:
    try:
        # Fallback for old PEFT versions that predate the k-bit helper.
        from peft import prepare_model_for_int8_training
        USE_INT8_PREP = True
    except ImportError:
        print("Warning: prepare_model_for_int8_training not found in peft. Proceeding with load_in_8bit=True only.")
        USE_INT8_PREP = False

# Turn off the Hugging Face datasets cache for this session.
disable_caching()

# Step 1: Data Loading
# Fetch the LaMini-instruction training split and keep only its first 200
# rows. Any failure is reported and then re-raised so the notebook stops here.
try:
    dataset = load_dataset("MBZUAI/LaMini-instruction", split='train')
    small_dataset = dataset.select(range(200))
except Exception as load_error:
    print(f"Error loading dataset: {load_error}")
    raise

# Define templates
prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request. Instruction: {instruction}\n Response:"""
answer_template = """{response}"""

# Function to add prompt, answer, and text keys to the dataset
def _add_text(rec):
    instruction = rec["instruction"]
    response = rec["response"]
    if not instruction:
        raise ValueError(f"Expected an instruction in: {rec}")
    if not response:
        raise ValueError(f"Expected a response in: {rec}")
    rec["prompt"] = prompt_template.format(instruction=instruction)
    rec["answer"] = answer_template.format(response=response)
    rec["text"] = rec["prompt"] + rec["answer"]
    return rec

# Add the prompt/answer/text columns to every record.
small_dataset = small_dataset.map(_add_text)

# Step 2: Tokenizer and Model Loading
model_id = "databricks/dolly-v2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Choose loading options by hardware: 8-bit weights with automatic device
# placement on GPU, plain float16 weights on CPU otherwise.
if torch.cuda.is_available():
    print("Using GPU for model loading")
    _load_kwargs = dict(device_map="auto", load_in_8bit=True, torch_dtype=torch.float16)
else:
    print("GPU not available, falling back to CPU (note: this may be slower and memory-intensive)")
    _load_kwargs = dict(device_map="cpu", torch_dtype=torch.float16)

model = AutoModelForCausalLM.from_pretrained(model_id, **_load_kwargs)

# Make the embedding matrix size match the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
# Step 3: Data Preparation
MAX_LENGTH = 256

def _preprocess_batch(batch: Dict[str, List]):
    """Tokenize a batch of training texts and build causal-LM labels.

    Each text gets one end-of-sequence token appended, is truncated/padded to
    MAX_LENGTH, and receives labels equal to its input ids except at padding
    positions, which are set to -100 so the loss ignores them.

    Why: the notebook uses the EOS token as the padding token. Copying
    `input_ids` verbatim into `labels` would therefore compute the loss over
    every padding position. Masking the padding removes that, and the
    explicitly appended EOS (attention mask 1) keeps a single real
    end-of-sequence target so the model still learns where to stop.

    Args:
        batch: Batched records; only the "text" column (list of strings) is read.

    Returns:
        The tokenizer output with an added "labels" entry (list of lists of
        ints, one per text).
    """
    texts = [text + tokenizer.eos_token for text in batch["text"]]
    model_inputs = tokenizer(texts, max_length=MAX_LENGTH, truncation=True, padding='max_length')
    # attention_mask is 0 exactly on padding positions; -100 is the label
    # value ignored by the loss.
    model_inputs["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

_preprocessing_function = partial(_preprocess_batch)

# Tokenize the whole dataset in batches, dropping the raw text columns that
# are no longer needed once the tokenized features exist.
encoded_small_dataset = small_dataset.map(
    _preprocessing_function,
    remove_columns=["instruction", "response", "prompt", "answer"],
    batched=True,
)


def _fits_max_length(rec):
    """Return True when the record's token sequence is at most MAX_LENGTH long."""
    return len(rec["input_ids"]) <= MAX_LENGTH


# Keep only records whose token count fits within MAX_LENGTH.
processed_dataset = encoded_small_dataset.filter(_fits_max_length)

# Hold out 14 records for evaluation; the fixed seed keeps the split stable.
split_dataset = processed_dataset.train_test_split(seed=0, test_size=14)

# Collator that pads each batch to MAX_LENGTH.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding='max_length',
    max_length=MAX_LENGTH,
    pad_to_multiple_of=8,
)

# Step 4: Configuring LoRA
# LoRA hyperparameters: rank, scaling alpha and dropout.
LORA_R, LORA_ALPHA, LORA_DROPOUT = 256, 512, 0.05

# Adapters are attached to the "query_key_value" modules only.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

# Run the quantized-training preparation helper only when it was importable
# and a GPU is present.
if USE_INT8_PREP:
    if torch.cuda.is_available():
        model = prepare_model_for_int8_training(model)

# Wrap the model with the LoRA adapters and report the trainable parameters.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Step 5: Model Training and Saving
EPOCHS, LEARNING_RATE = 3, 1e-4
MODEL_SAVE_FOLDER_NAME = "dolly-3b-lora"

# Log, evaluate and checkpoint once per epoch; mixed precision is enabled
# only when a GPU is available.
training_args = TrainingArguments(
    output_dir=MODEL_SAVE_FOLDER_NAME,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    fp16=torch.cuda.is_available(),
    logging_strategy="epoch",
    eval_strategy="epoch",  # newer name of evaluation_strategy
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset["test"],
)

# Switch off the generation cache on the model config before training.
model.config.use_cache = False
trainer.train()

# Persist the trained weights, the trainer's model output and the model
# config, all into the same folder.
trainer.model.save_pretrained(MODEL_SAVE_FOLDER_NAME)
trainer.save_model(MODEL_SAVE_FOLDER_NAME)
trainer.model.config.save_pretrained(MODEL_SAVE_FOLDER_NAME)

# Step 6: Prediction with the Fine-tuned Model
def postprocess(response):
    """Return the part of generated text that follows the "Response:" marker.

    Everything after the first "Response:" is returned unchanged, so any later
    occurrence of the marker inside the answer is preserved rather than
    dropped.

    Args:
        response: Full generated text, expected to contain the prompt template
            (which includes "Response:").

    Returns:
        The text after the first "Response:" marker (may be empty).

    Raises:
        ValueError: If the text does not contain "Response:" at all.
    """
    # str.split never returns an empty list, so the marker's presence has to be
    # checked explicitly; partition reports it through its middle element.
    _, marker, answer = response.partition("Response:")
    if not marker:
        raise ValueError("Invalid template for prompt. The template should include the term 'Response:'")
    return answer

# Instruction used to try out the fine-tuned model.
inference_prompt = "List 5 reasons why someone should learn to cook"

# Text-generation pipeline built on the trained model.
inf_pipeline = pipeline(
    'text-generation',
    model=trainer.model,
    tokenizer=tokenizer,
    max_length=256,
    trust_remote_code=True,
)

# Fill the prompt template, generate, and keep the first result's text.
_filled_prompt = prompt_template.format(instruction=inference_prompt)
response = inf_pipeline(_filled_prompt)[0]['generated_text']

# Extract the answer part and show it.
formatted_response = postprocess(response)
print(formatted_response)

In [None]:
# Log in to Hugging Face before running the push_to_hub cell further down.
from huggingface_hub import login
login()  # Run this once in your environment

In [None]:
# Write a model card into the save folder: YAML front matter (license, tags,
# library) followed by a short Markdown description.
_readme_lines = [
    "---",
    "license: apache-2.0",
    "tags:",
    "  - dolly-v2",
    "  - instruction-tuning",
    "  - peft",
    "  - lora",
    "library_name: transformers",
    "---",
    "",
    "# Fine-tuned Dolly V2 3B with LoRA",
    "",
    "This model was fine-tuned on the LaMini-instruction dataset using PEFT LoRA.",
]
with open("dolly-3b-lora/README.md", "w") as f:
    f.write("\n".join(_readme_lines) + "\n")

In [None]:
# Upload the model first, then the tokenizer, to the same Hub repository.
for _artifact in (model, tokenizer):
    _artifact.push_to_hub("avinashhm/dolly-3b-lora")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from transformers import pipeline

# Hub identifiers of the base model and of the LoRA adapter.
base_model_name, peft_model_name = "databricks/dolly-v2-3b", "avinashhm/dolly-3b-lora"

# The tokenizer comes from the base model repository.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Base model in float16 with automatic device placement.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Attach the LoRA adapter to the base model, then merge the adapter weights
# into it and drop the adapter wrapper (optional, improves speed).
model = PeftModel.from_pretrained(
    base_model,
    peft_model_name,
    torch_dtype=torch.float16,
).merge_and_unload()

# Prompt template (same wording as the one used for training).
prompt_template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request. "
    "Instruction: {instruction}\n Response:"
)

# Sampling and length settings passed to the generation pipeline.
_generation_settings = dict(
    max_new_tokens=256,
    pad_token_id=tokenizer.eos_token_id,
    truncation=True,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
)

# Text-generation pipeline around the merged model.
inf_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **_generation_settings,
)

# Instructions used to spot-check the model.
test_prompts = [
    "List 5 reasons why someone should learn to cook.",
    "Explain quantum computing in simple terms.",
    "Write a short story about a robot who dreams of flying.",
    "How can I stay productive while working from home?",
    "What are the benefits of practicing mindfulness daily?",
    "Tell me a joke about animals.",
    "Why is learning new languages important?",
]

# Generate and print one answer per prompt. A failure on one prompt is
# reported and the loop moves on to the next.
for number, prompt in enumerate(test_prompts, start=1):
    formatted_prompt = prompt_template.format(instruction=prompt)
    print(f"\n🔍 Prompt {number}: {prompt}")

    try:
        full_response = inf_pipeline(formatted_prompt)[0]['generated_text']

        # Keep the text after the last " Response:" marker; if the marker is
        # absent, strip the prompt prefix by length instead.
        _, marker, tail = full_response.rpartition(" Response:")
        if marker:
            response = tail.strip()
        else:
            response = full_response[len(formatted_prompt):].strip()

        print(f"🤖 Response: {response}")
    except Exception as e:
        print(f"❌ Error generating response: {e}")

    print("-" * 80)