In [2]:
# Upgrade wheel, ninja and datasets before the main installs in the following cells.
!pip install -U wheel ninja datasets



In [3]:
# Remove the existing torch / torchaudio / torchvision installs so the next cell
# can reinstall pinned versions from the PyTorch cu124 wheel index.
!pip uninstall --yes torch torchaudio torchvision

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124


In [4]:
# Install pinned torch / torchvision / torchaudio builds from the PyTorch cu124 wheel index.
!pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting torch==2.6.0
  Downloading https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp310-cp310-linux_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.21.0
  Downloading https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp310-cp310-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio==2.6.0
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp310-cp310-linux_x86_64.whl.metadata (6.6 kB)
Downloading https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp310-cp310-linux_x86_64.whl (768.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m768.4/768.4 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp310-cp310-linux_x86_64.whl (7.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[

In [5]:
# The exports are chained with && inside one `!` command so that CUDA_HOME, PATH and
# LD_LIBRARY_PATH are set for the pip process that follows them.
# --no-build-isolation makes pip build against the packages already installed in this
# environment (such as the torch version pinned above) rather than an isolated build env.
!export CUDA_HOME=/usr/local/cuda && \
 export PATH=$CUDA_HOME/bin:$PATH && \
 export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH && \
 pip install --no-build-isolation 'axolotl[flash-attn,deepspeed]'

Collecting datasets==3.5.1 (from axolotl[deepspeed,flash-attn])
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.6.0
    Uninstalling datasets-3.6.0:
      Successfully uninstalled datasets-3.6.0
Successfully installed datasets-3.5.1


In [6]:
import json
from huggingface_hub import notebook_login
from datasets import load_dataset

In [7]:
# Show the Hugging Face login widget so this session is authenticated against the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
# Load the travel customer-support tool-call dataset; its "train" and "test" splits
# are exported to JSONL files further down in this notebook.
dataset = load_dataset("capstone-research/travel-customer-support-chatbot-tool-call")

In [9]:
def convert_messages_to_conversations(messages):
    """Reshape chat messages into the ``{"conversations": [...]}`` layout.

    Each input message is read through its ``"role"`` and ``"content"`` keys
    and emitted as ``{"from": <role>, "value": <content>}``. The roles
    ``"user"`` and ``"assistant"`` pass through unchanged; every other role
    is written as ``"system"``.

    Args:
        messages: Iterable of mappings that each provide ``"role"`` and
            ``"content"``.

    Returns:
        A dict with a single ``"conversations"`` key holding the converted
        turns in their original order.
    """

    def _normalise_role(raw_role):
        # Anything that is not an explicit user/assistant turn falls back to "system".
        if raw_role == "user":
            return "user"
        if raw_role == "assistant":
            return "assistant"
        return "system"

    turns = [
        {"from": _normalise_role(message["role"]), "value": message["content"]}
        for message in messages
    ]
    return {"conversations": turns}

In [10]:
# Create dataset/ (target of the JSONL export below) and output/ (the output_dir
# named in the training config).
!mkdir -p dataset output

In [11]:
def convert_and_save_dataset(datafile):
    """Write one split of the loaded dataset to ``dataset/<datafile>.jsonl``.

    Every example in ``dataset[datafile]`` that has a non-empty ``"messages"``
    entry is converted with ``convert_messages_to_conversations`` and written
    as one JSON object per line. Examples without messages are skipped.

    Args:
        datafile: Split name; used both as the key into the module-level
            ``dataset`` and as the stem of the output file name.
    """
    target_path = f"dataset/{datafile}.jsonl"
    with open(target_path, "w", encoding="utf-8") as sink:
        for record in dataset[datafile]:
            turns = record.get("messages", [])
            if not turns:
                # Nothing to convert for this example.
                continue
            json.dump(convert_messages_to_conversations(turns), sink, ensure_ascii=False)
            sink.write("\n")

In [12]:
# Export both splits to dataset/train.jsonl and dataset/test.jsonl, the paths the
# training config refers to.
convert_and_save_dataset("train")
convert_and_save_dataset("test")

In [13]:
# Log in to Weights & Biases; the training config below sets wandb_project and wandb_entity.
!wandb login

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /teamspace/studios/this_studio/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
import yaml

# Axolotl training configuration, kept as YAML text so it can be parsed and written to
# hermes_llama_lora.yaml below. It fine-tunes NousResearch/Hermes-3-Llama-3.2-3B with a
# LoRA adapter (base model loaded in 8-bit), training on dataset/train.jsonl and
# evaluating on dataset/test.jsonl.
yaml_string = """
base_model: NousResearch/Hermes-3-Llama-3.2-3B
tokenizer_type: AutoTokenizer

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: dataset/train.jsonl
    type: chat_template
    chat_template: chatml
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
    roles_to_train: ["assistant"]
    train_on_eos: turn
    train_on_eot: turn
    message_field_training: train

evals:
  - path: dataset/test.jsonl
    type: chat_template
    chat_template: chatml
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

sequence_len: 4096
sample_packing: true
eval_sample_packing: false
remove_unused_columns: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_modules_to_save:
  - embed_tokens
  - lm_head

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 3
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

wandb_project: travel-chat-assistant
wandb_entity: harshit-sk-org
wandb_watch: all
wandb_name: hermes-llama-travel-customer-support-v1
wandb_log_model: checkpoint

warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

special_tokens:
  bos_token: "<|im_start|>"
  eos_token: "<|im_end|>"

tokens:
  - "<|user|>"
  - "<|assistant|>"
  - "<|im_start|>"
  - "<|im_end|>"
  - "<tool_call>"
  - "</tool_call>"

output_dir: output
"""


# Parse the YAML text into a plain Python dictionary
yaml_dict = yaml.safe_load(yaml_string)

# Destination of the generated Axolotl config
file_path = 'hermes_llama_lora.yaml'

# Serialise the dictionary back to YAML text and store it at file_path
with open(file_path, 'w') as file:
    file.write(yaml.dump(yaml_dict))

Above we have a configuration file that specifies the base LLM and the datasets, among many other things. Axolotl can automatically detect whether the specified datasets are on the Hugging Face Hub or on the local machine.

The Axolotl configuration options encompass model and dataset selection, data pre-processing, and training. Let's go through them line by line:

*   "base_model": String value, specifies the underlying pre-trained LLM that will be used for finetuning

Next we have options for model weights quantization. Quantization allows for reduction in occupied memory on GPUs.

*   "load_in_8bit": Boolean value, whether to quantize the model weights into 8-bit integer.

*   "load_in_4bit": Boolean value, whether to quantize the model weights into 4-bit integer.

*   "strict": Boolean value. If false, it allows for overriding established configuration options in the yaml file when executing in command-line interface.

*   "datasets": a list of dicts that contain path and type of data sets as well as other optional configurations where datasets are concerned. Supports multiple datasets.

*   "val_set_size": Either a float value less than one or an integer less than the total size of dataset. Sets the size of validation set from the whole dataset. If float, sets the proportion of the dataset assigned for validation. If integer, sets the direct size of validation set.

*   "output_dir": String value. Path of trained model.

For data preprocessing:

*   "sequence_len": Integer. Specifies the maximum sequence length of the input. Typically 2048 or less (the configuration above uses 4096).

*   "pad_to_sequence_len": Boolean. Padding input to maximum sequence length.

*   "sample_packing": Boolean. Specifies whether to use multi-packing with block diagonal attention.

*   "special_tokens": Python dict, optional. Allows users to override the tokenizer's special tokens (for example "bos_token" and "eos_token", as in the configuration above).

For LoRA configuration and its hyperparameters:

*   "adapter": String. Either "lora" or "qlora", depending on user's choice.

*   "lora_model_dir": String, Optional. Path to directory that contains LoRA model, if there is already a trained LoRA model the user would like to use.

*   "lora_r": Integer. Refers to the rank of LoRA decomposition matrices. Higher value will reduce LoRA efficiency. Recommended to be set to 8.

*   "lora_alpha": Integer. Scale the weight matrices by $\frac{\text{lora_alpha}}{\text{lora_r}}$. Recommended to be fixed at 16.

*   "lora_dropout": Float that is 1 or less. The dropout probability of a lora layer.

*   "lora_target_linear": Boolean. If true, lora will target all linear modules in the transformers architecture.

*   "lora_modules_to_save": If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.

See [LoRA](https://arxiv.org/abs/2106.09685) for detailed explanation of LoRA implementation.

For the training configurations:

*   "gradient_accumulation_steps": Integer. The number of steps over which to accumulate gradient for batch training. E.g. if 2, backprop is performed every two steps.

*   "micro_batch_size": Integer. The number of samples sent to each GPU per step. The effective batch size per GPU is micro_batch_size × gradient_accumulation_steps.

*   "num_epochs": Integer. Number of epochs. One epoch is when training has looped over every batch in the whole data set once.

*   "optimizer": The optimizer to use for the training.

*   "learning_rate": The learning rate.

*   "lr_scheduler": The learning rate scheduler to use for adjusting learning rate during training.

*   "train_on_inputs": Boolean. If false, the user's prompt is masked out of the training labels; if true, it is included.

*   "group_by_length": Boolean. Whether to group similarly sized data to minimize padding.

*   "bf16": Either "auto", "true", or "false". Whether to use CUDA bf16 floating point format. If set to "auto", bf16 is applied automatically when the GPU supports it.

*   "fp16": Optional. Specifies whether to use CUDA fp16. Automatically set to true if "bf16" is set to true. Otherwise false.

*   "tf32": Boolean. Whether to use CUDA tf32. Will override bf16.

*   "gradient_checkpointing": Boolean. Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing

*   "gradient_checkpointing_kwargs": Python Dict. Fed into the trainer.

*   "logging_steps": Integer. Log training information over every specified number of steps.

*   "flash_attention": Boolean. Whether to use the [flash attention](https://github.com/Dao-AILab/flash-attention) mechanism.

*   "sdp_attention": Boolean. Whether to use the Scaled Dot Product attention mechanism (the attention mechanism in the [original implementation](https://arxiv.org/abs/1706.03762) of transformers.)

*   "warmup_steps": Integer. The number of initial training steps during which the learning rate is gradually increased from a very low value up to the configured learning rate.

*   "evals_per_epoch": Integer. Number of evaluations to be performed within one training epoch.

*   "saves_per_epoch": Integer. Number of times the model is saved in one training epoch.

*   "weight_decay": Positive Float. Sets the "strength" of weight decay (i.e. setting the coefficient of L2 regularization)

The above is but a snippet aiming to get users familiarized with the types of streamlined configuration options axolotl provides. For a full list of configuration options, see [here](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html)

In [15]:
# Launch Axolotl training through accelerate, using the config file written above.
!accelerate launch -m axolotl.cli.train hermes_llama_lora.yaml

The following values were not passed to `accelerate launch` and had defaults used instead:
	`--num_processes` was set to a value of `1`
	`--num_machines` was set to a value of `1`
	`--mixed_precision` was set to a value of `'no'`
	`--dynamo_backend` was set to a value of `'no'`
[2025-06-14 15:12:16,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-06-14 15:12:16,105] [INFO] [root.spawn:77] [PID:5276] gcc -pthread -B /home/zeus/miniconda3/envs/cloudspace/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /home/zeus/miniconda3/envs/cloudspace/include -fPIC -O2 -isystem /home/zeus/miniconda3/envs/cloudspace/include -fPIC -c /tmp/tmpe0omobw2/test.c -o /tmp/tmpe0omobw2/test.o
[2025-06-14 15:12:16,424] [INFO] [root.spawn:77] [PID:5276] gcc -pthread -B /home/zeus/miniconda3/envs/cloudspace/compiler_compat /tmp/tmpe0omobw2/test.o -laio -o /tmp/tmpe0omobw2/a.out
[2025-06-14 15:12:16,483] [INFO] [

In [16]:
# Merge the LoRA adapter stored in output/ into the base model; the following cells
# read the merged model from output/merged.
!axolotl merge-lora hermes_llama_lora.yaml --lora-model-dir="output/"

[2025-06-14 15:39:34,762] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-06-14 15:39:34,840] [INFO] [root.spawn:77] [PID:26478] gcc -pthread -B /home/zeus/miniconda3/envs/cloudspace/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /home/zeus/miniconda3/envs/cloudspace/include -fPIC -O2 -isystem /home/zeus/miniconda3/envs/cloudspace/include -fPIC -c /tmp/tmprdi6ektb/test.c -o /tmp/tmprdi6ektb/test.o
[2025-06-14 15:39:34,867] [INFO] [root.spawn:77] [PID:26478] gcc -pthread -B /home/zeus/miniconda3/envs/cloudspace/compiler_compat /tmp/tmprdi6ektb/test.o -laio -o /tmp/tmprdi6ektb/a.out
[2025-06-14 15:39:34,883] [INFO] [root.spawn:77] [PID:26478] gcc -pthread -B /home/zeus/miniconda3/envs/cloudspace/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /home/zeus/miniconda3/envs/cloudspace/include -fPIC -O2 -isystem /home/zeus/miniconda3/envs/cloudspa

In [17]:
from huggingface_hub import HfApi

# Do not embed an access token in the notebook. With no explicit token, HfApi
# falls back to the locally saved credentials (stored by notebook_login() earlier
# in this notebook) or the HF_TOKEN environment variable.
api = HfApi()
# Push the contents of output/merged (the merged model directory that the
# inference cell also uses) to the model repository on the Hub.
api.upload_folder(
    folder_path="output/merged",
    repo_id="capstone-research/Hermes-3-Llama-3.2-3B-Travel-ChatBot",
    repo_type="model",
)

Uploading...:   0%|          | 0.00/7.23G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/capstone-research/Hermes-3-Llama-3.2-3B-Travel-ChatBot/commit/2a40cc0f1d5a570a22cd33d5f3fd7aa2da565442', commit_message='Upload folder using huggingface_hub', commit_description='', oid='2a40cc0f1d5a570a22cd33d5f3fd7aa2da565442', pr_url=None, repo_url=RepoUrl('https://huggingface.co/capstone-research/Hermes-3-Llama-3.2-3B-Travel-ChatBot', endpoint='https://huggingface.co', repo_type='model', repo_id='capstone-research/Hermes-3-Llama-3.2-3B-Travel-ChatBot'), pr_revision=None, pr_num=None)

In [None]:
# Start a Gradio inference session with the merged model in output/merged as the base model.
!axolotl inference hermes_llama_lora.yaml --base-model "output/merged" --gradio

[2025-06-14 15:41:21,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-06-14 15:41:21,990] [INFO] [root.spawn:77] [PID:28188] gcc -pthread -B /home/zeus/miniconda3/envs/cloudspace/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /home/zeus/miniconda3/envs/cloudspace/include -fPIC -O2 -isystem /home/zeus/miniconda3/envs/cloudspace/include -fPIC -c /tmp/tmpnueymiay/test.c -o /tmp/tmpnueymiay/test.o
[2025-06-14 15:41:22,008] [INFO] [root.spawn:77] [PID:28188] gcc -pthread -B /home/zeus/miniconda3/envs/cloudspace/compiler_compat /tmp/tmpnueymiay/test.o -laio -o /tmp/tmpnueymiay/a.out
[2025-06-14 15:41:22,021] [INFO] [root.spawn:77] [PID:28188] gcc -pthread -B /home/zeus/miniconda3/envs/cloudspace/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /home/zeus/miniconda3/envs/cloudspace/include -fPIC -O2 -isystem /home/zeus/miniconda3/envs/cloudspa