In [1]:
from google.colab import userdata
from huggingface_hub import login
token = userdata.get('adithyasean')
login(token = token, add_to_git_credential = True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [5]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = False

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "adithyasean/Llama-3.1-Singlish-1.3-8B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = True
)

==((====))==  Unsloth 2024.9.post1: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:  79%|#######8  | 923M/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/5.54G [00:00<?, ?B/s]

Unsloth 2024.9.post1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

We also add `embed_tokens` and `lm_head` to allow the model to learn out of distribution data.

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 128,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",

                      "embed_tokens", "lm_head",], # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,
    loftq_config = None,
)

Unsloth: Already have LoRA adapters! We shall skip this step.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


<a name="Data"></a>
### Data Prep
We now use the Sinhala subset of the [Wikipedia dataset](https://huggingface.co/datasets/wikimedia/wikipedia) to first continually pretrain the model.


We Use oour custorm script to transliterate from Sinhala to Singlish!

In [None]:
from datasets import load_dataset

dataset = load_dataset("adithyasean/singlish_30m", split = "train[:50000]", token=True)

In [None]:
from datasets import load_dataset

dataset = load_dataset("adithyasean/singlish_59m", split = "train[:50000]", token=True)

In [None]:
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_text_function(examples):
    return { "text" : [example + EOS_TOKEN for example in examples["text"]] }

dataset = dataset.map(formatting_text_function, batched = True,)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Wikipedia provides a title and an article text.
wikipedia_prompt = """Wikipedia Article
### Title: {}

### Article:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    titles = examples["title"]
    texts  = examples["text"]
    outputs = []
    for title, text in zip(titles, texts):
        # Must add EOS_TOKEN, otherwise the generation will go on forever!
        text = wikipedia_prompt.format(title, text) + EOS_TOKEN
        outputs.append(text)
    return { "text" : outputs, }
pass

In [None]:
# Translation Dataset
translation = """Translate English to Singlish
### English:
{}

### Singlish:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    English = examples["English"]
    Singlish  = examples["Singlish"]
    outputs = []
    for English, Singlish in zip(English, Singlish):
        # Must add EOS_TOKEN, otherwise the generation will go on forever!
        text = translation.format(English, Singlish) + EOS_TOKEN
        outputs.append(text)
    return { "text" : outputs, }
pass

In [None]:
from datasets import load_dataset

dataset = load_dataset("adithyasean/English-Sinhala", split = "train", token=True)
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [None]:
for row in dataset[:1]["text"]:
    print(row)

Translate English to Singlish
### English:
They're used interchangeably a lot. You'll get different answers from different resources, but the general consensus seems to be that woods are smaller than forests.

 >  A wood is an area covered in trees, larger than a grove or a copse. A forest is also an area covered in trees, but it is larger than a wood

 >  The U.S. National Vegetation Classification system differentiates them according to their densities: 25 to 60 percent of a a wood is covered by tree canopies, while 60 to 100 percent of a forest is canopied.

### Singlish:
eewa ekinekata huwamaru lesa boho wita bhawitha we. obata wiwidha sampath walin wiwidha pilithuru labenu atha, namuth samanya ekangathawaya nam wanantharawalata wada kalaya kuda bawayi. &gt; kalayak yanu waththakata ho wadulakata wada wishala gas walin wasi athi pradeshayaki. wanantharayak yanu gas walin wasi athi pradeshayaki, namuth eya kalayak wada wishalaya &gt; eksath janapadaye jathika wrukshalatha wargikaran

<a name="Train"></a>
### Continued Pretraining
Using Unsloth's `UnslothTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer).

In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use warmup_ratio and num_train_epochs for longer runs!
        # max_steps = 120,
        # warmup_steps = 10,
        warmup_ratio = 0.1,
        num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
20.658 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 625
 "-____-"     Number of trainable parameters = 1,386,217,472


Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for lm_head.


Step,Training Loss
1,2.0805
2,1.9873
3,1.9093
4,2.1029
5,2.1467
6,2.1262
7,1.9942
8,1.997
9,2.0377
10,2.056


### Instruction Finetuning

Using the singlish transliteration of the sinhala alpaca dataset

In [None]:
from datasets import load_dataset
alpaca_dataset = load_dataset("adithyasean/alpaca-singlish", split = "train", token=True)

README.md:   0%|          | 0.00/357 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/41816 [00:00<?, ? examples/s]

In [None]:
from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

We print 1 example:

In [None]:
print(dataset[0])

{'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.', 'input': '', 'instruction': 'Give three tips for staying healthy.'}


In [6]:
alpaca_prompt = """
### Instruction:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(conversations):
    texts = []
    instructions = conversations["instruction"]
    inputs = conversations["prompt"]
    outputs = conversations["response"]
    for instruction, prompt, response in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, response) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

In [None]:
alpaca_dataset = alpaca_dataset.map(formatting_prompts_func, batched = True,)

In [None]:
llama31_prompt="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{}<|eot_id|><|start_header_id|>user<|end_header_id|>

{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}<|eot_id|>"""

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = llama31_prompt.format(instruction, input, output)
        texts.append(text)
    return { "text" : texts, }
pass
dataset = dataset.map(formatting_prompts_func, batched = True,)
print(dataset[22])

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

{'output': 'She will play the piano beautifully for hours and then stop as it will be midnight.', 'input': 'She played the piano beautifully for hours and then stopped as it was midnight.', 'instruction': 'Based on the information provided, rewrite the sentence by changing its tense from past to future.', 'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nBased on the information provided, rewrite the sentence by changing its tense from past to future.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nShe played the piano beautifully for hours and then stopped as it was midnight.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nShe will play the piano beautifully for hours and then stop as it will be midnight.<|eot_id|>'}


Using `UnslothTrainer` and do instruction finetuning!

In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use num_train_epochs and warmup_ratio for longer runs!
        max_steps = 120,
        warmup_steps = 10,
        # warmup_ratio = 0.1,
        # num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=8):   0%|          | 0/51760 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
trainer_stats = trainer.train()

Counting untrained tokens:   0%|          | 0/51760 [00:00<?, ? examples/s]

Unsloth: Setting embed_tokens & lm_head untrained tokens to mean(trained) to counteract NaNs during training.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 120
 "-____-"     Number of trainable parameters = 1,386,217,472


Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for lm_head.


Step,Training Loss
1,3.6858
2,2.847
3,3.0639
4,2.5134
5,2.9346
6,2.7038
7,2.019
8,2.0159
9,1.8879
10,1.7464


In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
Let's run the model!

Using `TextStreamer` for continuous inference

*   List item
*   List item

the generation token by token, instead of waiting the whole time!

In [None]:
alpaca_prompt = """
### Instruction:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(conversations):
    texts = []
    instructions = conversations["instruction"]
    inputs = conversations["prompt"]
    outputs = conversations["response"]
    for instruction, prompt, response in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, response) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

In [57]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "explain",
        "kalu kuharaya",
        "",
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>
### Instruction:
explain

### Response:
kalu kuharaya yanu wisheshitha aakaraye widuli bubuluwak wana athara eya samanyayen bhawitha karanu labanne wiwidha karyayan sandaha wisheshitha upangayak saha upanga samanga sambandha kirimata ho sansandanaya kirimata, wisheshayen pariganaka paddhathiya saha amathara upanga. kalu kuharaya wetha sambandha kala haki upanga athara pariganaka monitara, musikayin, jangama dur


<a name="Save"></a>
### Saving, loading finetuned models

In [None]:
model.save_pretrained("/content/drive/MyDrive/Llama-3.1-Singlish-1.3-8B-Instruct") # Local saving
tokenizer.save_pretrained("/content/drive/MyDrive/Llama-3.1-Singlish-1.3-8B-Instruct")
model.push_to_hub("adithyasean/Llama-3.1-Singlish-1.3-8B-Instruct", token = True, private = True) # Online saving
tokenizer.push_to_hub("adithyasean/Llama-3.1-Singlish-1.3-8B-Instruct", token = True, private = True) # Online saving

README.md:   0%|          | 0.00/574 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/5.54G [00:00<?, ?B/s]

Saved model to https://huggingface.co/adithyasean/Llama-3.1-Singlish-1.3-8B-Instruct


### Saving to float16 for VLLM

In [41]:
# Merge to 16bit
if True: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if True: model.push_to_hub_merged("adithyasean/Llama-Singlish-1.0-8B-16bit", tokenizer, save_method = "merged_16bit", token = True, private=True)

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("adithyasean/Llama-Singlish-1.0-8B-4bit", tokenizer, save_method = "merged_4bit", token = True, private=True)

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 16.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 54.92 out of 83.48 RAM for saving.


 66%|██████▌   | 21/32 [00:00<00:00, 53.91it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:11<00:00,  2.86it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: You are pushing to hub, but you passed your HF username = adithyasean.
We shall truncate adithyasean/Llama-Singlish-1.0-8B-16bit to Llama-Singlish-1.0-8B-16bit


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 54.73 out of 83.48 RAM for saving.


100%|██████████| 32/32 [00:07<00:00,  4.55it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...


README.md:   0%|          | 0.00/574 [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

Done.
Saved merged model to https://huggingface.co/adithyasean/Llama-Singlish-1.0-8B-16bit


### GGUF / llama.cpp Conversion
We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [46]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("adithyasean/Llama-Singlish-1.0-8B-Q8-0", tokenizer, token = True, private=True)

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if True: model.push_to_hub_gguf("adithyasean/Llama-Singlish-1.0-8B-f16", tokenizer, quantization_method = "f16", token = True, private=True)

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q5_k_m", token = "")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 62.13 out of 83.48 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 59.79it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['f16'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at adithyasean/Llama-Singlish-1.0-8B-f16 into f16 GGUF format.
The output location will be ./adithyasean/Llama-Singlish-1.0-8B-f16/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: Llama-Singlish-1.0-8B-f16
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model par

unsloth.F16.gguf:   0%|          | 0.00/16.1G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/adithyasean/Llama-Singlish-1.0-8B-f16
