<a href="https://colab.research.google.com/github/CSC-312/cogno-1b-0925/blob/main/cogno_1_1b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
# @title
# Download the Google Sheets workbook as an .xlsx file; `-L` follows the
# redirect and `-o` writes the result to export.xlsx in the working directory.
!curl -L "https://docs.google.com/spreadsheets/u/3/d/16S31KEZHFsUdkQCZst8qANNDquwCilZgw-kZ2LvnfOc/export?format=xlsx&id=16S31KEZHFsUdkQCZst8qANNDquwCilZgw-kZ2LvnfOc" -o export.xlsx
# openpyxl is the engine passed to pandas when the workbook is opened later on.
!pip install -q openpyxl


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   465    0   465    0     0    318      0 --:--:--  0:00:01 --:--:--   318
100 6564k    0 6564k    0     0   816k      0 --:--:--  0:00:08 --:--:-- 1950k


## Installation

In [33]:
%%capture
# Install Unsloth and its dependencies. `%%capture` (which must stay on the
# first line of the cell) hides the pip output.
import os, re
# Colab is detected by looking for "COLAB_" in the names of the environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # `v` is the leading digits-and-dots part of the torch version string
    # (for example "2.8.0" out of "2.8.0+cu126").
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    # Choose the xformers release by torch version: one for 2.8.0, another for everything else.
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # `--no-deps` installs exactly these packages without pulling in their own dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Runs in both branches: pin transformers to one exact version.
!pip install transformers==4.55.4

## Unsloth
FastModel supports most vision and text models.

In [34]:
import ast
import pandas as pd

import torch
from datasets import Dataset
from transformers import TextStreamer

from unsloth import FastLanguageModel, FastModel
from unsloth.chat_templates import get_chat_template, train_on_responses_only

from trl import SFTTrainer, SFTConfig

# Sequence length passed to the loader below.
max_seq_length = 2048

# Load the base checkpoint together with its tokenizer. All three loading
# modes (4-bit, 8-bit, full fine-tuning) are switched off here; the recorded
# output shows Unsloth falling back to 16-bit LoRA in that case.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-1b-it",
    max_seq_length = max_seq_length,
    full_finetuning = False,  # True would train all weights instead of adapters
    load_in_8bit = False,     # True would load the weights in 8-bit
    load_in_4bit = False,     # True would load the weights in 4-bit
    # token = "hf_...",       # only needed for gated models
)

==((====))==  Unsloth 2025.8.10: Fast Gemma3 patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Add LoRA adapters to finetune a small subset of parameters.

In [35]:
# Names of the projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model so that only the adapter weights are trained.
model = FastModel.get_peft_model(
    model,
    target_modules = lora_target_modules,
    r = 32,              # adapter rank; any value > 0 (8, 16, 32, 64, 128 are suggested)
    lora_alpha = 128,
    lora_dropout = 0,    # any value is supported, 0 is the optimized path
    bias = "none",       # any value is supported, "none" is the optimized path
    use_rslora = False,  # rank-stabilized LoRA is available but not used
    loftq_config = None, # LoftQ is available but not used
    # True or "unsloth"; "unsloth" uses 30% less VRAM and fits 2x larger batches.
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model` require gradients


## Data
Data preparation and preprocessing.

In [36]:

# Switch the tokenizer to the "gemma3" chat template before any prompts are rendered.
tokenizer = get_chat_template(tokenizer, chat_template = "gemma3")

In [43]:
# List the worksheet names of the downloaded workbook.
# The path is relative because the download cell writes `export.xlsx` into the
# working directory, and the context manager closes the file handle as soon as
# the names have been read.
with pd.ExcelFile("export.xlsx", engine="openpyxl") as workbook:
    sheet_names = workbook.sheet_names

In [45]:
# Show every worksheet name next to its position so a sheet can be picked by index.
for position, sheet_name in enumerate(sheet_names):
    print(f"{position}: {sheet_name}")

0: uwc.ac.za with pdfs
1: subdomains
2: youtube
3: all pdfs
4: faq
5: question-answer pairs
6: modules
7: degrees


In [52]:
# Build the training Dataset from the question/answer worksheet.
try:
    # The sheet is written to CSV and read back, exactly as before, so the
    # values pandas parses are unchanged. `to_csv` returns None when given a
    # path, so its result is deliberately not assigned to anything.
    # The workbook path is relative: the download cell writes `export.xlsx`
    # into the working directory.
    pd.read_excel('export.xlsx', sheet_name=sheet_names[5]).to_csv('export.csv', index=False)
    df = pd.read_csv('export.csv')

    # Assuming the first column is 'question' and the second is 'answer'
    df = df.rename(columns={df.columns[0]: 'question', df.columns[1]: 'answer'})

    dataset = Dataset.from_pandas(df)

    def filter_nans(example):
        """Keep a row only when both its question and its answer are present."""
        return not (pd.isna(example['question']) or pd.isna(example['answer']))

    dataset = dataset.filter(filter_nans)

    print(f"Number of rows after removing bad lines and NaNs: {len(dataset)}")

    print("First 5 rows of the dataset after processing:")
    for i in range(min(5, len(dataset))):
        print(dataset[i])

except Exception as e:
    print(f"An error occurred during pandas loading or conversion: {e}")
    # Drop any stale dataset from an earlier run, then stop here: continuing
    # with `dataset = None` only makes the following cells fail with an
    # unrelated "'NoneType' object is not subscriptable" error.
    dataset = None
    raise

Filter:   0%|          | 0/49633 [00:00<?, ? examples/s]

Number of rows after removing bad lines and NaNs: 49628
First 5 rows of the dataset after processing:
{'question': 'Which UWC programs are prescribed to take the non-legal elective modules mentioned in the document?', 'answer': 'The non-legal elective modules are prescribed for LLB 4 (7162), LLB 5 (7172), and BCOM LAW (7211) programs at UWC.'}
{'question': 'How does HISTORY 154 (HIS154) build upon HISTORY153 (HIS153) at UWC?', 'answer': 'HISTORY 154 (HIS154) follows from HISTORY153 (HIS153) by continuing to emphasize understanding history as narratives constructed through debates and arguments. While HIS153 focuses on developing coherent arguments, HIS154 shifts to backing up arguments with evidence, particularly archaeological and documentary evidence.'}
{'question': 'What are the dates for the first academic term at UWC in 2025?', 'answer': 'The first academic term at UWC in 2025 runs from 10 February to 28 March.'}
{'question': 'What are the key dates for assisted registration for f

In [54]:
# Spot-check one record (index 100) of the cleaned dataset.
dataset[100]

{'question': 'Who was inaugurated as the new Rector and Vice-Chancellor of UWC in 2015?',
 'answer': 'Professor Tyrone Pretorius was inaugurated as the new Rector and Vice-Chancellor of UWC in 2015.'}

Apply the Gemma3 chat template and save output to `text`.

In [55]:
def formatting_prompts_func(examples):
    """Render a batch of question/answer pairs with the tokenizer's chat template.

    Args:
        examples: a batch mapping with parallel "question" and "answer" sequences.

    Returns:
        A dict with one key, "text", holding one rendered conversation per pair.
        A leading '<bos>' is stripped from each rendered string.
    """
    def render(question, answer):
        # One user turn followed by one model turn.
        conversation = [
            {"role": "user", "content": question},
            {"role": "model", "content": answer},
        ]
        rendered = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        return rendered.removeprefix('<bos>')

    pairs = zip(examples["question"], examples["answer"])
    return {"text": [render(question, answer) for question, answer in pairs]}

# Add the rendered `text` column to every row, one batch at a time.
dataset = dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/49628 [00:00<?, ? examples/s]

Preview formatted examples.

In [56]:
# Show the chat-formatted string produced for the record at index 100.
dataset[100]['text']

'<start_of_turn>user\nWho was inaugurated as the new Rector and Vice-Chancellor of UWC in 2015?<end_of_turn>\n<start_of_turn>model\nProfessor Tyrone Pretorius was inaugurated as the new Rector and Vice-Chancellor of UWC in 2015.<end_of_turn>\n'

## Training
Training length is controlled by `num_train_epochs` (full passes over the dataset) or `max_steps` (a fixed number of optimizer steps, e.g. 100 for a quick test run).

In [60]:
# Build the supervised fine-tuning trainer over the chat-formatted `text` column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 1, # Use GA to mimic batch size!
        warmup_steps = 5,
        # Training length is stated explicitly so it does not silently fall
        # back to the library default. Two epochs matches the recorded run
        # (Num Epochs = 2, Total steps = 12,408).
        num_train_epochs = 2,
        # max_steps = 100, # Uncomment for a short test run; a positive max_steps overrides num_train_epochs.
        learning_rate = 5e-5, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir="outputs",
        # "all" reports to every installed tracker; in the recorded run this
        # prompted for a Weights & Biases API key when training started.
        report_to = "all", # Use this for WandB etc
    )
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/49628 [00:00<?, ? examples/s]

Use Unsloth's `train_on_responses_only` to mask input loss and focus on assistant outputs.

In [61]:

# Restrict the loss to the model's replies. The two markers are the turn
# headers that appear in the formatted `text` column.
trainer = train_on_responses_only(
    trainer,
    response_part = "<start_of_turn>model\n",
    instruction_part = "<start_of_turn>user\n",
)

Map (num_proc=2):   0%|          | 0/49628 [00:00<?, ? examples/s]

Inspect the tokenized example at index 100: decoding its `input_ids` should show the full prompt and response.

In [62]:
# Decode the token ids of training example 100 back into text.
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<bos><start_of_turn>user\nWho was inaugurated as the new Rector and Vice-Chancellor of UWC in 2015?<end_of_turn>\n<start_of_turn>model\nProfessor Tyrone Pretorius was inaugurated as the new Rector and Vice-Chancellor of UWC in 2015.<end_of_turn>\n'

Masked example — only the answer should appear.

In [63]:
# Labels equal to -100 are swapped for the pad token id so the sequence can be
# decoded; the decoded pad tokens are then shown as single spaces.
example_labels = trainer.train_dataset[100]["labels"]
decodable_ids = [
    tokenizer.pad_token_id if label == -100 else label
    for label in example_labels
]
tokenizer.decode(decodable_ids).replace(tokenizer.pad_token, " ")

'                              Professor Tyrone Pretorius was inaugurated as the new Rector and Vice-Chancellor of UWC in 2015.<end_of_turn>\n'

In [64]:
# @title Show current memory stats
# Baseline taken before training; `start_gpu_memory` and `max_memory` are
# reused by the final-stats cell. Byte counts are converted to GB (1024 ** 3
# bytes) and rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
4.439 GB of memory reserved.


In [None]:
# Run fine-tuning. The returned object is kept because the stats cell reads
# `trainer_stats.metrics['train_runtime']` from it.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 49,628 | Num Epochs = 2 | Total steps = 12,408
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 26,091,520 of 1,025,977,472 (2.54% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkevinnkansah[0m ([33mkevinnkansah-university-of-the-western-cape[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,entropy
1,5.5967,0
2,5.0802,No Log
3,4.4241,No Log
4,3.296,No Log
5,3.6836,No Log
6,2.4181,No Log
7,2.4712,No Log
8,2.1479,No Log
9,1.9949,No Log
10,1.9641,No Log


Unsloth: Will smartly offload gradients to save VRAM!


In [None]:
# @title Show final memory and time stats
# Peak reserved GPU memory in GB (1024 ** 3 bytes), in absolute terms and
# relative to the baseline captured before training.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Wall-clock training time as reported by the trainer.
train_seconds = trainer_stats.metrics['train_runtime']
print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds/60, 2)} minutes used for training.")

print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## Inference
Recommended: temperature=1.0, top_p=0.95, top_k=64.

In [None]:
# Pick one training question as the demo prompt. It is read straight from the
# row's `question` column: indexing the row first avoids loading the whole
# `text` column just to fetch one entry, and no chat markup has to be stripped
# back out of the formatted string.
sample_index = 44646
messages = [
    {"role" : 'user', 'content' : dataset[sample_index]['question']}
]

# Print the original question
print("Question:", messages[0]['content'])

# Render the prompt with the chat template. '<bos>' is stripped from the
# rendered string, the same way it is when the training text is built.
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
).removeprefix('<bos>')

# Stream the generated tokens to the cell output as they are produced; the
# return value is discarded because the streamer already prints the reply.
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 125,
    temperature = 1, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

## Saving
Save LoRA adapters with `save_pretrained` or `push_to_hub` (saves adapters only).

In [None]:
# Write the model and the tokenizer to the same local directory.
# NOTE(review): the directory name says "270m" although the checkpoint loaded
# at the top of the notebook is gemma-3-1b-it; confirm the intended name.
model.save_pretrained("cogno-1-270m")  # Local saving
tokenizer.save_pretrained("cogno-1-270m")
# Hub upload (disabled). Replace "your_name" and supply a token to enable it.
# model.push_to_hub("your_name/cogno-1-270m", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/cogno-1-270m", token = "...") # Online saving

Load saved LoRA adapters for inference by calling `from_pretrained`.

In [None]:
if True:
    # Reload the model and tokenizer from the local directory written by the
    # save cell. The `model_name` argument must end with a comma; the trailing
    # period that was there made the whole cell a SyntaxError.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "cogno-1-270m",
        max_seq_length = 2048,
        load_in_4bit = False,
    )

## Save float16 (VLLM)
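Use `merged_16bit` (float16) or `merged_4bit` (int4). Use `push_to_hub_merged` to upload. Note: the cell below contains only the LoRA-adapter save and Hub push calls, both disabled with `if False:`.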

In [None]:

# Just LoRA adapters
# Both branches are disabled; flip the relevant `False` to `True` to run one.
if False:
    # Local save of the model and tokenizer into one directory.
    model.save_pretrained("cogno-1-270m")
    tokenizer.save_pretrained("cogno-1-270m")
if False: # Pushing to HF Hub
    # The token is an empty placeholder; supply a real one at run time rather
    # than committing it to the notebook.
    model.push_to_hub("hf/cogno-1-270m", token = "")
    tokenizer.push_to_hub("hf/cogno-1-270m", token = "")


## GGUF / llama.cpp conversion

In [None]:
if True: # GGUF export is enabled; set to False to skip it
    # Export the model and tokenizer in GGUF format under "cogno-1-270m".
    model.save_pretrained_gguf(
        "cogno-1-270m",
        tokenizer,
        quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
    )

To upload GGUF to Hugging Face, set the upload flag and provide your token.

In [None]:
if False: # Change to True to upload GGUF
    # Upload the GGUF export to the Hugging Face Hub. `repo_id` and `token`
    # are placeholders and must be replaced before enabling this branch;
    # do not commit a real token to the notebook.
    model.push_to_hub_gguf(
        "cogno-1-270m",
        tokenizer,
        quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
        repo_id = "HF_ACCOUNT/cogno-1-270m-gguf",
        token = "hf_...",
    )



<div class="align-center">
  <a href="https://unsloth.ai"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
</div>
