<a href="https://colab.research.google.com/github/CSC-312/cogno-1b-0925/blob/main/cogno_1_1b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installation

In [None]:
# @title
# Download the Google Sheets workbook as an .xlsx file named export.xlsx (-L follows redirects).
!curl -L "https://docs.google.com/spreadsheets/u/3/d/16S31KEZHFsUdkQCZst8qANNDquwCilZgw-kZ2LvnfOc/export?format=xlsx&id=16S31KEZHFsUdkQCZst8qANNDquwCilZgw-kZ2LvnfOc" -o export.xlsx
# openpyxl is the engine used to open the workbook with pandas later in the notebook.
!pip install -q openpyxl


In [None]:
import wandb
from google.colab import userdata

# Authenticate with Weights & Biases.
# The key is read from the Colab secrets manager entry named 'WANDB_API_KEY';
# any failure is reported and then re-raised so the notebook stops here.
try:
    wandb_api_key = userdata.get("WANDB_API_KEY")
    wandb.login(key=wandb_api_key)
except Exception as e:
    print(f"Could not log in to Weights & Biases. Please check your API key in Colab secrets. Error: {e}")
    raise
else:
    print("Weights & Biases login successful.")

In [None]:
%%capture
# Install Unsloth and its dependencies; %%capture suppresses the installer output.
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    # No COLAB_* environment variable present: use the plain install.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # v = leading numeric part of the installed torch version string.
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    # Choose the xformers pin according to the detected torch version.
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # --no-deps: install these packages without pulling in their dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Pinned transformers version, installed in both branches.
!pip install transformers==4.55.4

## Unsloth
FastModel supports most vision and text models.

In [None]:
import ast
import pandas as pd

import torch
from datasets import Dataset
from transformers import TextStreamer

from unsloth import FastLanguageModel, FastModel
from unsloth.chat_templates import get_chat_template, train_on_responses_only

from trl import SFTTrainer, SFTConfig

# Sequence length handed to the model loader.
max_seq_length = 2048

# Load the base checkpoint and its tokenizer.
# Both quantisation flags and full fine-tuning are switched off here.
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/gemma-3-270m-it",
    max_seq_length=max_seq_length,
    load_in_4bit=False,
    load_in_8bit=False,
    full_finetuning=False,
    # token="hf_...",  # only needed for gated models
)

Add LoRA adapters to finetune a small subset of parameters.

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters (rank 32, alpha 4, no dropout, no bias terms).
model = FastModel.get_peft_model(
    model,
    r=32,
    target_modules=lora_target_modules,
    lora_alpha=4,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

## Data
Data preparation and preprocessing.

In [None]:
# Attach the "gemma3" chat template to the tokenizer.
tokenizer = get_chat_template(tokenizer, chat_template="gemma3")

In [None]:
# Read the worksheet names from the downloaded workbook.
# The context manager closes the underlying file handle once the names are read,
# instead of leaving an open ExcelFile object behind.
with pd.ExcelFile('/content/export.xlsx', engine="openpyxl") as workbook:
    sheet_names = workbook.sheet_names

In [None]:
# Show every worksheet alongside its positional index.
for position, sheet_name in enumerate(sheet_names):
    print(f"{position}: {sheet_name}")

In [None]:
# Build the question/answer Dataset from the worksheet at index 5 of the workbook.
try:
    # Round-trip the sheet through CSV before loading it.
    # NOTE(review): presumably this lets pandas re-infer column dtypes from text — confirm.
    # DataFrame.to_csv returns None when given a path, so its result is not kept.
    pd.read_excel(
        '/content/export.xlsx', sheet_name=sheet_names[5], engine="openpyxl"
    ).to_csv('export.csv', index=False)
    df = pd.read_csv('export.csv')

    # Assuming the first column is 'question' and the second is 'answer'
    df = df.rename(columns={df.columns[0]: 'question', df.columns[1]: 'answer'})

    dataset = Dataset.from_pandas(df)

    def filter_nans(example):
        """Return True only when both the question and the answer are present."""
        return not (pd.isna(example['question']) or pd.isna(example['answer']))

    # Both fields are rendered into the chat template later, so a missing value
    # in either one makes the row unusable.
    dataset = dataset.filter(filter_nans)

    print(f"Number of rows after removing bad lines and NaNs: {len(dataset)}")

    print("First 5 rows of the dataset after processing:")
    for i in range(min(5, len(dataset))):
        print(dataset[i])

except Exception as e:
    print(f"An error occurred during pandas loading or conversion: {e}")
    # Clear any dataset left over from an earlier run, then stop the notebook here:
    # every later cell needs a real dataset, and continuing with None only produces
    # a less informative error further down.
    dataset = None
    raise

In [None]:
# Display the processed row at index 100 as a spot check.
dataset[100]

Apply the Gemma3 chat template and save output to `text`.

In [None]:
def formatting_prompts_func(examples):
    """Render a batch of question/answer pairs with the tokenizer's chat template.

    `examples` is a batch with parallel "question" and "answer" sequences.
    Returns a dict with one key, "text", holding one rendered string per pair;
    a leading '<bos>' is removed from each rendered string.
    """
    def render(question, answer):
        # One user turn followed by one model turn.
        conversation = [
            {"role": "user", "content": question},
            {"role": "model", "content": answer},
        ]
        rendered = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )
        return rendered.removeprefix('<bos>')

    pairs = zip(examples["question"], examples["answer"])
    return {"text": [render(question, answer) for question, answer in pairs]}

dataset = dataset.map(formatting_prompts_func, batched=True)

Preview formatted examples.

In [None]:
# Display the chat-formatted `text` field of the row at index 100.
dataset[100]['text']

## Training
Quick example: 100 steps (adjust `num_train_epochs` / `max_steps` for full runs).

In [None]:
# Hyper-parameters for a short run capped at 100 optimiser steps.
training_args = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=100,
    learning_rate=2e-5,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="all",
)

# Supervised fine-tuning trainer over the formatted dataset; no evaluation set is used.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=None,
    args=training_args,
)

Use Unsloth's `train_on_responses_only` to mask input loss and focus on assistant outputs.

In [None]:

# Turn markers that separate the user prompt from the model reply in the rendered text.
turn_markers = {
    "instruction_part": "<start_of_turn>user\n",
    "response_part": "<start_of_turn>model\n",
}

trainer = train_on_responses_only(trainer, **turn_markers)

Verify masking by inspecting the example at index 100.

In [None]:
# Decode the token ids of the training example at index 100 back into text.
sample_input_ids = trainer.train_dataset[100]["input_ids"]
tokenizer.decode(sample_input_ids)

Masked example — only the answer should appear.

In [None]:
# Decode the labels of the example at index 100: every -100 label is swapped for
# the pad token id, and the decoded pad tokens are then replaced by spaces.
sample_labels = trainer.train_dataset[100]["labels"]
visible_label_ids = [
    tokenizer.pad_token_id if label_id == -100 else label_id
    for label_id in sample_labels
]
tokenizer.decode(visible_label_ids).replace(tokenizer.pad_token, " ")

In [None]:
# @title Show current memory stats
# Record the total memory of GPU 0 and the peak reserved memory so far, both in GB.
BYTES_PER_GB = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run training; the returned object's `metrics` are read by the stats cell that follows.
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
# Peak reserved GPU memory in GB, and the part of it added since the pre-training snapshot.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# The same two figures as a percentage of the GPU's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_runtime = trainer_stats.metrics['train_runtime']
print(f"{train_runtime} seconds used for training.")
print(f"{round(train_runtime/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## Inference
Recommended: temperature=1.0, top_p=0.95, top_k=64.

In [None]:
# Choose one training row to sanity-check generation.
# The preferred index is clamped to the last row so the cell still runs when the
# dataset has fewer rows than that.
sample_index = min(44646, len(dataset) - 1)

# Read the question straight from its column. This avoids materialising the whole
# `text` column and stripping chat-template markers back out of the rendered string.
messages = [
    {"role": "user", "content": dataset[sample_index]["question"].strip()}
]

print("Question:", messages[0]['content'])

# Render the prompt with an open model turn so generation continues as the model.
# The leading '<bos>' is removed from the rendered string before it is tokenized.
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True,
).removeprefix('<bos>')

model_inputs = tokenizer(text, return_tensors = "pt").to("cuda")

# Stream the generated tokens to the cell output as they are produced.
_ = model.generate(
    **model_inputs,
    max_new_tokens = 125,
    temperature = 1, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

## Saving
Save LoRA adapters with `save_pretrained` or `push_to_hub` (saves adapters only).

In [None]:
# Write the model and tokenizer files into the same local folder.
adapter_dir = "cogno-1-270m"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)

In [None]:
# Reload the model and tokenizer from the local "cogno-1-270m" folder.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="cogno-1-270m",
    max_seq_length=2048,
    load_in_4bit=False,
)

## Save and Upload to HF

In [None]:
# Save a merged 16-bit copy of the model together with the tokenizer.
# A dedicated folder is used: "cogno-1-270m" already receives the adapter files in
# the earlier save cell, and putting an adapter config and a full-model config in
# one folder makes it ambiguous which of the two a later load should pick up.
model.save_pretrained_merged(
    "cogno-1-270m-merged",
    tokenizer,
    save_method = "merged_16bit", # Can be "merged_4bit"
)


To upload the merged model and the LoRA adapters to Hugging Face, provide a Hugging Face access token with write permission.

In [None]:
  # Read the Hugging Face write token from Colab secrets (entry 'HF_TOKEN') rather
  # than hard-coding it; an empty string is not a usable token.
  hf_token = userdata.get("HF_TOKEN")
  hub_repo = "kevinnkansah/cogno-1-270m"
  # Upload the merged 16-bit model. The tokenizer is passed to the model's
  # push_to_hub_merged call, following Unsloth's documented usage, instead of
  # calling push_to_hub_merged on the tokenizer object.
  model.push_to_hub_merged(hub_repo, tokenizer, save_method = "merged_16bit", token = hf_token)
  # Upload the LoRA adapters and tokenizer files.
  # NOTE(review): this targets the same repo as the merged weights; consider a
  # separate adapter repo so the two sets of files do not mix.
  model.push_to_hub(hub_repo, token = hf_token)
  tokenizer.push_to_hub(hub_repo, token = hf_token)



<div class="align-center">
  <a href="https://unsloth.ai"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
</div>
