In [None]:
# Install axolotl directly from its GitHub repository (no branch/tag/commit is pinned).
# NOTE(review): the next cell also installs axolotl, from PyPI -- confirm whether
# both installs are intended or only one of them should be run.
!pip install --no-build-isolation git+https://github.com/OpenAccess-AI-Collective/axolotl.git

In [2]:
!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1

In [None]:
# Install cut-cross-entropy (with its `transformers` extra) from the
# axolotl-ai-cloud/ml-cross-entropy repository, pinned to commit 318b7e2.
# NOTE(review): the config cell below passes `plugins=[]`, so this package does
# not appear to be enabled anywhere in the visible cells -- confirm it is needed.
!pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2"

In [17]:
import os

# Process-wide environment tweaks, applied before any axolotl module is imported:
#   AXOLOTL_DO_NOT_TRACK    -> "1" (presumably opts out of axolotl telemetry; confirm)
#   PYTORCH_CUDA_ALLOC_CONF -> asks PyTorch's CUDA allocator for expandable segments
os.environ.update(
    {
        "AXOLOTL_DO_NOT_TRACK": "1",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

In [18]:
from axolotl.cli.config import load_cfg
from axolotl.utils.dict import DictDefault

In [19]:
from axolotl.utils import set_pytorch_cuda_alloc_conf
# Ask axolotl to apply its PyTorch CUDA allocator configuration.
# NOTE(review): PYTORCH_CUDA_ALLOC_CONF is already exported by hand in an
# earlier cell; presumably this helper manages the same variable -- confirm
# which of the two is meant to be authoritative.
set_pytorch_cuda_alloc_conf()

In [20]:
# `dataset` is the Hugging Face dataset id that the config cell plugs into its
# `datasets` entry; `uploaded` starts out as an empty dict (no visible cell reads it).
dataset, uploaded = "winglian/pirate-ultrachat-10k", {}

In [21]:
# Raw axolotl training configuration for a QLoRA fine-tune. The settings are
# written as one dict literal (grouped by topic) and unpacked into DictDefault,
# so the keys, their order and their values are exactly those of a keyword call.
config = DictDefault(
    **{
        # --- base model and 4-bit QLoRA adapter ---
        "base_model": "Qwen/Qwen2.5-3B-Instruct",
        "load_in_4bit": True,
        "adapter": "qlora",
        "lora_r": 32,
        "lora_alpha": 64,
        # Attention projections plus the MLP projections receive LoRA adapters.
        "lora_target_modules": [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "down_proj",
            "up_proj",
        ],
        # All three optional LoRA kernels are switched off.
        "lora_qkv_kernel": False,
        "lora_o_kernel": False,
        "lora_mlp_kernel": False,
        # --- runtime / memory options ---
        "embeddings_skip_upcast": True,
        "xformers_attention": True,
        "plugins": [],
        "sample_packing": False,
        # --- optimisation schedule ---
        "learning_rate": 0.00019,
        "sequence_len": 1024,
        "micro_batch_size": 1,
        "gradient_accumulation_steps": 8,
        "gradient_checkpointing": True,
        "gradient_checkpointing_kwargs": {"use_reentrant": False},
        "optimizer": "paged_adamw_8bit",
        "lr_scheduler": "cosine",
        "warmup_steps": 5,
        # Half precision via fp16; bf16 is explicitly disabled.
        "fp16": True,
        "bf16": False,
        "max_grad_norm": 0.1,
        "num_epochs": 1,
        "saves_per_epoch": 2,
        "logging_steps": 1,
        # --- outputs and data ---
        "output_dir": "./outputs/qwen-sft-pirate-rrr",
        # NOTE(review): the "qwen3" chat template is paired with a Qwen2.5 base
        # model here -- confirm this combination is intentional.
        "chat_template": "qwen3",
        "datasets": [
            {
                # `dataset` must still be the dataset-id string at this point.
                "path": dataset,
                "type": "chat_template",
                "split": "train",
                "eot_tokens": ["<|im_end|>"],
            }
        ],
        "dataloader_prefetch_factor": 2,
        "dataloader_num_workers": 2,
        "dataloader_pin_memory": True,
    }
)

In [22]:
# Hand the raw settings to axolotl's config loader; `cfg` is the object the
# dataset-loading and training cells consume.
cfg = load_cfg(config)

In [23]:
from axolotl.common.datasets import load_datasets

In [24]:
# Load and prepare the training data described by `cfg`.
# NOTE: this rebinds `dataset` -- until now the dataset-id string used by the
# config cell -- to the loader's return value, so re-running the config cell
# after this one would no longer see the original string.
dataset = load_datasets(cfg=cfg)

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 0 files: 0it [00:00, ?it/s]

Dropping Long Sequences (>1024) (num_proc=2):   0%|          | 0/9985 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/8840 [00:00<?, ? examples/s]

In [25]:
# Display the loaded dataset metadata (train/eval datasets and total step count).
dataset

TrainDatasetMeta(train_dataset=Dataset({
    features: ['input_ids', 'labels', 'attention_mask'],
    num_rows: 8840
}), eval_dataset=None, total_num_steps=1105)

In [26]:
from axolotl.train import train
# Override `max_steps` on the already-loaded config; presumably this caps the
# demo run at 25 training steps instead of a full epoch -- confirm.
cfg.max_steps = 25

In [27]:
# Run fine-tuning with the loaded config and dataset metadata; the returned
# model and tokenizer are reused by the generation cells below.
model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset)

Loading weights:   0%|          | 0/434 [00:00<?, ?it/s]

Step,Training Loss
1,2.134035
2,1.949361
3,1.967079
4,1.661767
5,1.624731
6,1.769594
7,1.537181
8,1.598089
9,1.563828
10,1.699928


In [28]:
from transformers import TextStreamer

# A one-turn conversation (a single user message) used to probe the tuned model.
messages = [
    dict(role="user", content="Explain the Pythagorean theorem to me."),
]

In [29]:
# Render the conversation through the tokenizer's chat template into a prompt string.
prompt = tokenizer.apply_chat_template(
    messages,
    # Append the assistant header so the model continues with its reply.
    add_generation_prompt=True,
    # Return the rendered text rather than token ids; it is tokenized in the next cell.
    tokenize=False,
    # Forwarded to the template; the decoded output further down shows an
    # empty <think></think> block in the assistant turn.
    enable_thinking=False,
)

In [30]:
# Sample a reply from the fine-tuned model, streaming the text as it is produced.
# `outputs` holds the generated token ids (prompt tokens included).
outputs = model.generate(
    # Place the encoded prompt on whatever device the model lives on instead of
    # assuming a device literally named "cuda".
    **tokenizer(prompt, return_tensors="pt").to(model.device),
    max_new_tokens=192,
    # temperature / top_p / top_k only take effect when sampling is enabled, so
    # request it explicitly rather than relying on the checkpoint's bundled
    # generation config happening to turn it on.
    do_sample=True,
    temperature=1.0,
    top_p=0.8,
    top_k=32,
    # Print tokens as they arrive, skipping the echoed prompt.
    streamer=TextStreamer(tokenizer, skip_prompt=True),
)

In [32]:
# Display the raw generated token ids (the full tensor, prompt tokens included).
outputs

tensor([[151644,    872,    198,    840,  20772,    279,   5355,  95362,  45195,
          57817,    311,    752,     13, 151645,    198, 151644,  77091,    198,
          13708,    766,   1339,    522,  26865,   1339,  24765,   2253,   1052,
             11,  29176,     88,      0,    576,   5355,  95362,  45195,  57817,
            374,    264,  23702,  14806,    429,  10742,    601,    279,   3084,
            315,    279,   9751,  65628,    810,    304,    264,   1290,  21495,
             13,   1084,    594,   1075,    264,  31626,   2415,    369,  68838,
             11,  50246,   1105,   1526,    279,  42857,    315,  17047,   2219,
          10061,    752,   1438,    432,   1495,    369,  19478,   1447,    334,
            785,  13173,  66963,    758,    264,   1290,  21495,    320,     64,
          21495,    448,    825,   9210,   6144,    311,    220,     24,     15,
          12348,    701,    279,   9334,    315,    279,   3084,    315,    279,
           9751,  65628,    

In [34]:
# Decode the first (only) generated sequence back to text, dropping special
# tokens. The sequence still contains the prompt turn, so the decoded string
# starts with the user message rather than with the model's reply.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [37]:
# Display the decoded text (prompt plus the model's reply).
generated_text

"user\nExplain the Pythagorean theorem to me.\nassistant\n<think>\n\n</think>\n\nAhoy there, matey! The Pythagorean theorem is a magical formula that tells us the length of the hypotenuse in a right triangle. It's like a treasure map for sailors, guiding them through the depths of geometry!\n\nLet me break it down for ye:\n\n**Theorem:** In a right triangle (a triangle with one angle equal to 90 degrees), the square of the length of the hypotenuse (the longest side) is equal to the sum of the squares of the other two sides.\n\nIn math terms, it looks like this: \n\n    \\( c^2 = a^2 + b^2 \\)\n\nWhere:\n- **c** is the length of the hypotenuse,\n- **a** and **b** are the lengths of the other two sides (the legs).\n\n**Imagine:** You've got a treasure chest, and you want to know how long the rope needed to reach it from the shore"

In [31]:
# List the contents of the training output directory; the path repeats the
# `output_dir` value from the config cell and must be kept in sync with it.
!ls -lh "./outputs/qwen-sft-pirate-rrr"