In [1]:
# Install required packages
!pip install --no-deps peft accelerate bitsandbytes
!pip install py7zr

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
Collecting py7zr
  Downloading py7zr-0.22.0-py3-none-any.whl.metadata (16 kB)
Collecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.4 kB)
Collecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading pyppmd-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux20

In [2]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Install Flash Attention 2 for softcapping support
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

In [3]:

from unsloth import FastLanguageModel
import torch
max_seq_length = 512 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-9b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.10: Fast Gemma2 patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [4]:
# Apply PEFT (Parameter Efficient Fine-Tuning) to the loaded model
model = FastLanguageModel.get_peft_model(
    model,
    r=8,  # Reduced LoRA rank for lower VRAM usage
    target_modules=[
        "q_proj", "v_proj", "gate_proj",
    ],  # Minimal modules for task-specific fine-tuning
    lora_alpha=16,  # Scaling factor for LoRA; unchanged
    lora_dropout=0,  # Small dropout for better generalization
    bias="none",  # No additional bias to reduce memory
    use_gradient_checkpointing="unsloth",  # Optimized gradient checkpointing
    random_state=3407,  # Ensure reproducibility
    use_rslora=False,  # Disabling Rank Stabilized LoRA (default)
    loftq_config=None,  # Disabling LoftQ (default)
)


Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2024.11.10 patched 42 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [5]:
from datasets import load_dataset


In [9]:
# Load dataset and filter for brainstorming examples
dataset = load_dataset("databricks/databricks-dolly-15k")
brainstorm_dataset = dataset.filter(lambda x: x['category'] == 'brainstorming')
brainstorm_subset = brainstorm_dataset['train']

In [10]:
# Print dataset information
print(f"\n## Dataset Overview")
print(f"Total brainstorming examples: {len(brainstorm_subset)}")

# Print sample examples
print("\n## Sample Brainstorming Examples")
for i in range(2):
    print(f"\nExample {i+1}:")
    print("Instruction:")
    print("-" * 50)
    print(brainstorm_subset[i]['instruction'])
    print("\nResponse:")
    print("-" * 50)
    print(brainstorm_subset[i]['response'])
    print("=" * 50)


## Dataset Overview
Total brainstorming examples: 1766

## Sample Brainstorming Examples

Example 1:
Instruction:
--------------------------------------------------
Why mobile is bad for human

Response:
--------------------------------------------------
We are always engaged one phone which is not good.

Example 2:
Instruction:
--------------------------------------------------
What are some unique curtain tie backs that you can make yourself?

Response:
--------------------------------------------------
There are many items you can use to make a curtain tie back. Some ideas for this include a chain, a circle and pin, jute rope, wooden beaded rope, a necklack or bracelet, a door knob, a length of a leather belt, macrame rope, or a string of artificial flowers.


In [11]:
def format_instruction(example):
    return {
        "text": f"Generate creative ideas for this request:\n\nRequest: {example['instruction']}\n\nIdeas: {example['response']}"
    }

In [12]:
# Format dataset
formatted_dataset = brainstorm_subset.map(format_instruction)

# Print example of formatted data
print("\n## Training Format Example")
print(formatted_dataset[0]['text'])

Map:   0%|          | 0/1766 [00:00<?, ? examples/s]


## Training Format Example
Generate creative ideas for this request:

Request: Why mobile is bad for human

Ideas: We are always engaged one phone which is not good.


In [14]:

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported


trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset,
    max_seq_length=max_seq_length,
    dataset_num_proc=2,  # Use 2 processors for dataset preprocessing
    packing=False,  # Packing disabled; useful for variable-length sequences
    args=TrainingArguments(
        per_device_train_batch_size=1,  # Lower batch size to fit within 10GB
        gradient_accumulation_steps=8,  # Maintain effective batch size
        warmup_steps=5,
        max_steps=50,  # Reduced steps for faster completion
        learning_rate=2e-4,  # Learning rate; can be adjusted if needed
        fp16=not is_bfloat16_supported(),  # Enable FP16 if bfloat16 not supported
        bf16=is_bfloat16_supported(),  # Enable bfloat16 if supported
        logging_steps=5,  # Log every 5 steps
        optim="adamw_8bit",  # Optimizer for memory efficiency
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,  # For reproducibility
        output_dir="./brainstorming_model",  # Directory for model checkpoints
        report_to="none",  # Disable external reporting (e.g., WandB)
    ),
)


max_steps is given, it will override any value given in num_train_epochs


In [15]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
6.715 GB of memory reserved.


In [16]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,766 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 50
 "-____-"     Number of trainable parameters = 10,493,952


Step,Training Loss
5,2.4881
10,1.7379
15,1.852
20,1.8851
25,1.5383
30,1.879
35,1.6422
40,1.6207
45,1.7204
50,1.4102


In [17]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

856.7435 seconds used for training.
14.28 minutes used for training.
Peak reserved memory = 9.076 GB.
Peak reserved memory for training = 2.361 GB.
Peak reserved memory % of max memory = 61.541 %.
Peak reserved memory for training % of max memory = 16.009 %.


In [18]:
# Save the model
trainer.save_model("./brainstorming_assistant_final")



In [20]:
# First prepare the model for inference
model = FastLanguageModel.for_inference(model)

In [19]:
def generate_ideas(prompt):
    instruction = f"Generate creative ideas for this request:\n\nRequest: {prompt}\n\nIdeas:"

    inputs = tokenizer(
        instruction,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.8,
        do_sample=True,
        top_p=0.9,
        num_return_sequences=1
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text.split("Ideas:")[-1].strip()


In [21]:
# Test cases with diverse brainstorming scenarios
test_prompts = [
    "What are some creative ways to reduce plastic waste in a corporate office?",

    "Suggest innovative team building activities for a remote work environment.",

    "Brainstorm unique marketing ideas for a local bookstore.",

    "What are some creative ways to teach children about financial literacy?",

    "Generate ideas for transforming a small balcony into a productive garden space."
]

In [22]:
print("## Testing Brainstorming Model with Custom Prompts")
for i, prompt in enumerate(test_prompts, 1):
    print(f"\nTest Case {i}:")
    print("Prompt:")
    print("-" * 50)
    print(prompt)
    print("\nGenerated Ideas:")
    print("-" * 50)
    print(generate_ideas(prompt))
    print("=" * 70)

## Testing Brainstorming Model with Custom Prompts

Test Case 1:
Prompt:
--------------------------------------------------
What are some creative ways to reduce plastic waste in a corporate office?

Generated Ideas:
--------------------------------------------------
1. Get rid of plastic water bottles and replace with reusable glass bottles
2. Get rid of plastic water bottles and replace with reusable aluminum bottles
3. Replace plastic coffee cups with glass or ceramic
4. Replace plastic food containers with glass or ceramic
5. Replace plastic straws with stainless steel straws
6. Replace plastic shopping bags with reusable bags
7. Reduce the amount of plastic wrapping used on new products
8. Replace plastic nametags with reusable ones
9. Replace plastic badges with reusable ones
10. Replace plastic keychains with reusable ones
11. Replace plastic office supplies with reusable ones
12. Replace plastic cutlery with reusable ones
13. Replace plastic disposable cups with reusable ones
1

In [26]:
model.push_to_hub("AagamShah08/gemma2_9B_Brainstorming", token="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") # Online saving
tokenizer.push_to_hub("AagamShah08/gemma2_9B_Brainstorming", token="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") # Online saving

README.md:   0%|          | 0.00/580 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Saved model to https://huggingface.co/AagamShah08/gemma2_9B_Brainstorming


  0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]