<a href="https://colab.research.google.com/github/2002sairuthvik/Fine_Tuning/blob/main/lora_fine_tuning_of_llama_3_2_3b_on_ascii.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Colab environment setup. %%capture (which must stay on the first line of the cell)
# hides the pip output.
# Most packages are installed with --no-deps, i.e. without pulling their declared
# dependencies; only the third command lets pip resolve dependencies.
# NOTE(review): xformers is the only pinned package, so a fresh run may pick up newer
# releases than the ones recorded in the outputs below (Unsloth 2025.7.11, Triton 3.2.0).
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3  peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
from google.colab import userdata

# Checkpoint and context length used to load the base model.
BASE_MODEL_NAME = "meta-llama/Llama-3.2-3B"
MAX_SEQ_LENGTH = 2048

# Load the base model and its tokenizer through Unsloth, without 4-bit quantization
# and without requesting an explicit dtype. The Hugging Face access token is read
# from the Colab `userdata` store under the key HF_TOKEN.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=False,
    token=userdata.get("HF_TOKEN"),
)

==((====))==  Unsloth 2025.7.11: Fast Llama patching. Transformers: 4.54.1.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# Turn off the tokenizer's clean_up_tokenization_spaces option.
# NOTE(review): presumably so decoded text keeps the exact spacing the ASCII art
# relies on — confirm against the tokenizer's decode behaviour.
tokenizer.clean_up_tokenization_spaces = False

In [None]:
# Set to True to additionally adapt the output head ("lm_head").
train_embeddings = False

# Projection layers that receive LoRA adapters.
target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
    "gate_proj", "up_proj", "down_proj",      # MLP projections
]
if train_embeddings:
  target_modules.append("lm_head")

# Wrap the base model with rank-16 LoRA adapters (alpha equal to the rank, no bias terms).
# NOTE(review): this rebinds `model`, so re-running the cell passes the already-wrapped
# model back in — restart the kernel before re-running.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=target_modules,
    r=16,
    lora_alpha=16,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

Unsloth 2025.7.11 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
# Training template: the raw ASCII art framed by one leading and one trailing newline,
# with no instruction text around it.
empty_prompt = "\n{ascii_art}\n"

# End-of-sequence marker taken from the tokenizer; appended to every training sample.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func_no_prompt(examples):
  """Build the training "text" column for a batch of ASCII-art samples.

  Args:
    examples: batch mapping whose "ascii" entry is an iterable of art strings.

  Returns:
    A dict with a single "text" key holding one string per sample: the art
    rendered through `empty_prompt` and followed by `EOS_TOKEN`.
  """
  return {
      "text": [
          empty_prompt.format(ascii_art=art) + EOS_TOKEN
          for art in examples["ascii"]
      ]
  }

from datasets import load_dataset
# Fetch the train split of the ASCII-cats dataset and add the formatted "text"
# column in batched mode.
dataset = load_dataset("pookie3000/ascii-cats", split="train").map(
    formatting_prompts_func_no_prompt,
    batched=True,
)


Map:   0%|          | 0/201 [00:00<?, ? examples/s]

In [None]:
# Print the first four formatted samples as a sanity check on the template and the
# appended end-of-sequence marker.
for sample_number, row in enumerate(dataset, start=1):
  print(f"\n-----------Sample {sample_number}--------")
  print(row["text"])
  if sample_number == 4:
    break


-----------Sample 1--------

    /\_/\           ___
   = o_o =_______    \ \ 
    __^      __(  \.__) )
(@)<_____>__(_____)____/
<|end_of_text|>

-----------Sample 2--------

|\---/|
| o_o |
 \_^_/
<|end_of_text|>

-----------Sample 3--------

 |\__/,|   (`\
 |_ _  |.--.) )
 ( T   )     /
(((^_(((/(((_/
<|end_of_text|>

-----------Sample 4--------

   |\---/|
   | ,_, |
    \_`_/-..----.
 ___/ `   ' ,""+ \  
(__...'   __\    |`.___.';
  (_,...'(_,.`__)/'.....+
<|end_of_text|>


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bf16_supported

# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bf16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    report_to="none",
    seed=3407,
    # Batching: 2 samples per device with 4 accumulation steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    # Optimizer and learning-rate schedule.
    optim="adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    # Precision flags are mutually exclusive.
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

# Supervised fine-tuning over the "text" column produced by the formatting step.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=2,
    args=training_args,
)

In [None]:
# Run the fine-tuning loop; whatever train() returns is kept in `trainer_stats`.
trainer_stats=trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 201 | Num Epochs = 2 | Total steps = 52
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss
1,1.6617
2,1.7106
3,1.8403
4,1.5978
5,2.2004
6,2.2023
7,2.3281
8,2.3346
9,1.8035
10,2.2131


In [None]:
from transformers import TextStreamer

def generate_ascii_art(model, max_new_tokens=100):
  """Stream one unprompted ASCII-art sample from `model` to stdout.

  The prompt is the empty string; in the recorded runs the streamed text starts
  with `<|begin_of_text|>`. The module-level `tokenizer` is used both to build
  the inputs and to decode the streamed text.

  Args:
    model: model to sample from; it is switched to Unsloth inference mode first.
    max_new_tokens: upper bound on the number of generated tokens (defaults to
      100, the value that used to be hard-coded).

  Returns:
    None. The sample is printed by the streamer while it is being generated.
  """
  FastLanguageModel.for_inference(model)
  inputs = tokenizer("", return_tensors="pt").to("cuda")
  text_streamer = TextStreamer(tokenizer)

  # The streamer already prints the decoded text, so the returned id tensor is
  # deliberately not printed (it used to be dumped as raw token ids after each sample).
  model.generate(**inputs, streamer=text_streamer, max_new_tokens=max_new_tokens)

In [None]:
# Stream three samples from the fine-tuned model as a quick qualitative check.
for _ in range(3):
  generate_ascii_art(model)

<|begin_of_text|>
   |\__/,|   (`\
   |_ _  |__ _  ) )
   ( T   )  `  / /
  (_>   )  _\  |
   `"""')  `"  )
    (___,'  (_/
<|end_of_text|>
tensor([128000,    198,    256,  64696,    565,  35645,     91,    256,  29754,
          5779,    256,  71986,    721,    220,    765,    565,    721,    220,
           883,   1763,    256,    320,    350,    256,    883,    220,   1595,
           220,    611,  40081,    220,   5570,     29,    256,    883,    220,
           721,     59,    220,   9432,    256,   1595,   3089,      1,    873,
           220,   1595,      1,    220,   1763,    262,    320,   6101,   2965,
           220,   5570,   6018, 128001], device='cuda:0')
<|begin_of_text|>
    /\_/\    _
   /     \  / )
   |  --  ( /
   \  --  // 
    `-----`
<|end_of_text|>
tensor([128000,    198,    262,  24445,     62,  35419,    262,  23843,    256,
           611,    257,   1144,    220,    611,   1763,    256,    765,    220,
          1198,    220,    320,  40081,    256,   1144,  

In [None]:
from google.colab import userdata
# Upload of the LoRA adapter repo (currently disabled).
# NOTE(review): the reload cell further down loads
# "ruthvi29/Llama-3.2-3B-ascii-cats-lora", so that repo must already exist on the Hub;
# with this call commented out, a fresh Restart & Run All does not create it.
# model.push_to_hub(
#     "ruthvi29/Llama-3.2-3B-ascii-cats-lora",
#     tokenizer,
#     token = userdata.get('HF_TOKEN')
# )
# Export the model as a q4_0-quantized GGUF file and upload it to the Hub repo named
# below, authenticating with the HF_TOKEN value from the Colab userdata store.
model.push_to_hub_gguf(
    "ruthvi29/Llama-3.2-3B-ascii-cats-lora-q4_0-GGUF",
    tokenizer,
    quantization_method="q4_0",
    token = userdata.get('HF_TOKEN')
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 35.61 out of 52.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:00<00:00, 42.75it/s]


Unsloth: Saving tokenizer... Done.
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at ruthvi29/Llama-3.2-3B-ascii-cats-lora-q4_0-GGUF into bf16 GGUF format.
The output location will be /content/ruthvi29/Llama-3.2-3B-ascii-cats-lora-q4_0-GGUF/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: Llama-3.2-3B-ascii-cats-lora-q4_0-GGUF
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ts-lora-q4_0-GGUF/unsloth.Q4_0.gguf:   2%|1         | 33.5MB / 1.92GB            

Saved GGUF to https://huggingface.co/ruthvi29/Llama-3.2-3B-ascii-cats-lora-q4_0-GGUF


In [None]:
from transformers import TextStreamer

# Reload the model and tokenizer from the published LoRA adapter repo. This rebinds
# the module-level `model` and `tokenizer`, replacing the objects used for training.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="ruthvi29/Llama-3.2-3B-ascii-cats-lora",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = False,
    token=userdata.get('HF_TOKEN')
)

# generate_ascii_art is defined once, in the earlier inference cell, and is reused here
# instead of being redefined. It looks up the module-level `tokenizer` at call time, so
# it picks up the tokenizer reloaded above.
# Generation options reference:
# https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/text_generation#transformers.GenerationMixin
# https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/text_generation#transformers.GenerationConfig
for _ in range(3):
  generate_ascii_art(model)


==((====))==  Unsloth 2025.7.11: Fast Llama patching. Transformers: 4.54.1.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

<|begin_of_text|>
  ((      /\_/\  
   \\.._.'  - -\  
   /\ | '.__ o /  
  (_.   /  --"  
   ) _)._  _ /  
  '.\ \|( / (  
    '' ''\\_\\  

<|end_of_text|>
tensor([128000,    198,    220,   1819,    415,  24445,     62,  35419,   2355,
           256,  26033,    497,     62,   3238,    220,    482,    482,     59,
          2355,    256,  24445,    765,    364,   4952,    297,    611,   2355,
           220,   5570,    662,    256,    611,    220,   1198,      1,   2355,
           256,    883,    721,  67756,    220,    721,    611,   2355,    220,
          6389,     59,   1144,  61116,    611,    320,   2355,    262,   3436,
          3436,   3505,     62,   3505,  19124, 128001], device='cuda:0')
<|begin_of_text|>
  |\__/,|   (`\
  |_ _  |.-'  ) )
  ( T   )  _  /
 (((^_(((/(((_>
<|end_of_text|>
tensor([128000,    198,    220,  64696,    565,  35645,     91,    256,  29754,
          5779,    220,  71986,    721,    220,    765,  12898,      6,    220,
           883,   1763,    2