Upload this notebook to Google Drive to run it in Colab. You need to prepare the following yourself:
1. Buy Colab GPU compute units (around 0.5 GBP per training cycle).
2. Register a Hugging Face account, then download the desired raw model to your Google Drive.
3. Set the paths in the notebook so that they match your Google Drive folder paths.

The current code supports training BERT and Llama 3.
The current BERT version is base_uncased (`google-bert/bert-base-uncased`, as set in the configuration cell).
The current Llama version is 3.2 1B (base).

Note: BERT is an encoder-only model, so only minimal logic training is supported (i.e. it does not read the instructive prompt).

In [1]:
# Mount Google Drive at /content/drive; the Drive paths used later in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Upgrade huggingface_hub, then open the interactive Hugging Face login widget.
# NOTE(review): presumably required to download the Llama weights from the Hub — confirm.
!pip install --upgrade huggingface_hub
from huggingface_hub import notebook_login
notebook_login()

Collecting huggingface_hub
  Downloading huggingface_hub-0.32.0-py3-none-any.whl.metadata (14 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface_hub)
  Downloading hf_xet-1.1.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading huggingface_hub-0.32.0-py3-none-any.whl (509 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.3/509.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hf_xet-1.1.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet, huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.31.2
    Uninstalling huggingface-hub-0.31.2:
      Successfully uninstalled huggingface-hub-0.31.2
Successfully installed hf-xet-1.1.2 huggingface_hub-0.32.0


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
# Runtime sanity check: print the installed PyTorch version and whether CUDA is visible.
import torch
print(torch.__version__)
print(torch.cuda.is_available())  # Should return True
# Optional extra diagnostics (uncomment when needed):
# print(torch.cuda.device_count())  # Should be > 0 if a GPU is available
# print(torch.cuda.get_device_name(0))  # Prints the GPU name if available

2.6.0+cu124
True


In [36]:
# Install tensorboardX.
# NOTE(review): presumably backs the report_to="tensorboard" logging configured in the
# TrainingArguments further down — confirm it is still required.
!pip install tensorboardX



In [37]:
# Install/upgrade the libraries imported by this notebook (transformers, datasets, peft, trl)
# together with accelerate, bitsandbytes and wandb.
# NOTE(review): versions are not pinned, so a re-run may pull different releases;
# wandb is installed although the TrainingArguments below report to tensorboard.
!pip install -U transformers datasets accelerate peft trl bitsandbytes wandb



In [38]:
# NOTE(review): datasets and trl are already covered by the `pip install -U ...` cell
# directly above, so this cell looks redundant — confirm before removing.
!pip install datasets
!pip install trl



In [39]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, PeftModel, TaskType
from trl import SFTTrainer, SFTConfig

In [59]:
# --- Run configuration ---------------------------------------------------
# modeltype selects the model family ('llama' or 'bert').
# argtype selects the TrainingArguments preset ('new' or 'old') used further down.
modeltype = 'llama'
argtype = 'new'

# Hugging Face Hub ID of the base model for the selected family.
if modeltype == 'llama':
    base_model = "meta-llama/Llama-3.2-1B"
elif modeltype == 'bert':
    base_model = "google-bert/bert-base-uncased"
else:
    # Fail here with a clear message instead of a NameError on `base_model` below.
    raise ValueError(f"Unknown modeltype {modeltype!r}; expected 'llama' or 'bert'")

# Google Drive directory where the trained adapter is saved by the training cell.
new_model = "/content/drive/MyDrive/Colab Notebooks/Llama_3/llama32_1B_adaptor/dataset_vs2/2305_instructive_example_logic_newarg"
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Language-modeling collator with masked-LM disabled (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [60]:
# Fine-tuning is also for things that cannot be done with prompting alone,
# e.g. logic prompts that would be too long.

In [61]:
# Load every split of the instruction dataset from the Hugging Face Hub.
dataset_name = "Bigbigboss02/instructive_examples_logic_132x10"
max_samples = 995  # upper bound on the number of examples kept for training
dataset = load_dataset(dataset_name, split="all")
# Shuffle with a fixed seed for reproducibility, then keep at most `max_samples` rows.
# min() keeps select() from raising IndexError if the dataset has fewer rows than the cap.
dataset = dataset.shuffle(seed=42).select(range(min(max_samples, len(dataset))))

In [62]:
# Peek at the first two raw examples (the recorded output shows 'input' and 'response' columns).
dataset[:2]

{'input': ["Determine if the 2 words are semantically similar. Provide 'True' or 'False'. \n[/INST]\n Examples: \nWord1: color, Word2: colour  [/INST]\n True\nWord1: red, Word2: Red  [/INST]\n True\nWord1: happy, Word2: sad  [/INST]\n False\nWord1: dog, Word2: cat  [/INST]\n False\nWord1: Jump Word2: leap",
  "Determine if the 2 words are semantically similar. Provide 'True' or 'False'. \n[/INST]\n Examples: \nWord1: color, Word2: colour  [/INST]\n True\nWord1: red, Word2: Red  [/INST]\n True\nWord1: happy, Word2: sad  [/INST]\n False\nWord1: dog, Word2: cat  [/INST]\n False\nWord1: Big Word2: melody"],
 'response': [True, False]}

In [63]:
# if modeltype == 'llama':
#     def prep_data(samples):
#         insts = samples['input']
#         outputs = samples['response']
#         rephrased_text = []

#         for i in range(len(samples)-1):
#             rephrased_text.append(f"<s>[INST] {insts} [/INST] {outputs} </s>")

#         return {"text": " ".join(rephrased_text)}
# elif modeltype == 'bert':
#     def prep_data(samples):
#         insts = samples['input']
#         outputs = samples['response']

#         processed_text = []
#         labels = []

#         for inst, output in zip(insts, outputs):
#             # Extract the last "Word1: ... Word2: ..." line from the input
#             lines = inst.strip().split('\n')
#             last_pair_line = ''
#             for line in reversed(lines):
#                 if 'Word1:' in line and 'Word2:' in line:
#                     last_pair_line = line.strip()
#                     break

#             processed_text.append(last_pair_line)
#             label = 1 if str(output).strip().lower() in ['true', '1'] else 0
#             labels.append(label)

#         return {'text': processed_text, 'label': labels}


def prep_data(samples, modeltype):
    """Convert a batch of raw examples into model-ready columns.

    Args:
        samples: Mapping with parallel sequences under 'input' (prompt strings)
            and 'response' (answers).
        modeltype: 'llama' or 'bert'.

    Returns:
        For 'llama': {'text': [...]}, each prompt/answer pair wrapped as
        "<s>[INST] prompt [/INST] answer </s>".
        For 'bert': {'text': [...], 'label': [...]}, where text is the last
        prompt line containing both 'Word1:' and 'Word2:' (empty string when no
        such line exists) and label is 1 when the answer reads 'true' or '1'
        (case-insensitive, surrounding whitespace ignored), else 0.

    Raises:
        ValueError: If modeltype is neither 'llama' nor 'bert'.
    """
    prompts = samples['input']
    answers = samples['response']

    if modeltype == 'llama':
        return {
            'text': [
                f"<s>[INST] {prompt} [/INST] {answer} </s>"
                for prompt, answer in zip(prompts, answers)
            ]
        }

    if modeltype != 'bert':
        raise ValueError("Unknown modeltype")

    def _last_word_pair(prompt):
        # Scan from the bottom so the query pair wins over the few-shot examples.
        for candidate in reversed(prompt.strip().split('\n')):
            if 'Word1:' in candidate and 'Word2:' in candidate:
                return candidate.strip()
        return ''

    texts = []
    targets = []
    for prompt, answer in zip(prompts, answers):
        texts.append(_last_word_pair(prompt))
        is_positive = str(answer).strip().lower() in ['true', '1']
        targets.append(1 if is_positive else 0)
    return {'text': texts, 'label': targets}

def prep_data_llama(samples):
    """Format one dataset row as a single instruction-style training string.

    Used with a non-batched ``Dataset.map``, so ``samples`` is one row: a
    mapping whose 'input' is the prompt and whose 'response' is the answer.

    The previous version looped ``len(samples) - 1`` times, i.e. the number of
    columns minus one. That yielded exactly one copy only because rows happen
    to have two columns; any extra column duplicated the text. The text is now
    built exactly once, independent of the number of columns.

    NOTE(review): '<s>' and '</s>' are inserted as plain text — confirm the
    tokenizer in use treats them as intended.

    Args:
        samples: One row with 'input' and 'response' keys.

    Returns:
        {'text': "<s>[INST] <input> [/INST] <response> </s>"}
    """
    prompt = samples['input']
    answer = samples['response']
    return {"text": f"<s>[INST] {prompt} [/INST] {answer} </s>"}



# Build the model-ready columns for the selected model family.
if modeltype == 'llama':
    # Non-batched map: prep_data_llama is called with one row at a time.
    transformed_dataset = dataset.map(prep_data_llama)
elif modeltype == 'bert':
    # Batched map: prep_data receives each column as a list.
    transformed_dataset = dataset.map(lambda x: prep_data(x, modeltype), batched=True)

# Show the dataset summary, then one transformed row as a spot check.
print(transformed_dataset)
transformed_dataset[1]

Dataset({
    features: ['input', 'response', 'text'],
    num_rows: 995
})


{'input': "Determine if the 2 words are semantically similar. Provide 'True' or 'False'. \n[/INST]\n Examples: \nWord1: color, Word2: colour  [/INST]\n True\nWord1: red, Word2: Red  [/INST]\n True\nWord1: happy, Word2: sad  [/INST]\n False\nWord1: dog, Word2: cat  [/INST]\n False\nWord1: Big Word2: melody",
 'response': False,
 'text': "<s>[INST] Determine if the 2 words are semantically similar. Provide 'True' or 'False'. \n[/INST]\n Examples: \nWord1: color, Word2: colour  [/INST]\n True\nWord1: red, Word2: Red  [/INST]\n True\nWord1: happy, Word2: sad  [/INST]\n False\nWord1: dog, Word2: cat  [/INST]\n False\nWord1: Big Word2: melody [/INST] False </s>"}

In [64]:
# Drop the raw columns; only the columns produced by the prep step remain.
dataset = transformed_dataset.remove_columns(['input','response'])
print(f"Dataset length: {len(dataset)}")

# Preview only the first few rows: printing every row floods the notebook output
# and bloats the saved file without adding information.
preview_rows = 3
for i in range(min(preview_rows, len(dataset))):
    print(f"{i}: {dataset[i]}")

Dataset length: 995
0: {'text': "<s>[INST] Determine if the 2 words are semantically similar. Provide 'True' or 'False'. \n[/INST]\n Examples: \nWord1: color, Word2: colour  [/INST]\n True\nWord1: red, Word2: Red  [/INST]\n True\nWord1: happy, Word2: sad  [/INST]\n False\nWord1: dog, Word2: cat  [/INST]\n False\nWord1: Jump Word2: leap [/INST] True </s>"}
1: {'text': "<s>[INST] Determine if the 2 words are semantically similar. Provide 'True' or 'False'. \n[/INST]\n Examples: \nWord1: color, Word2: colour  [/INST]\n True\nWord1: red, Word2: Red  [/INST]\n True\nWord1: happy, Word2: sad  [/INST]\n False\nWord1: dog, Word2: cat  [/INST]\n False\nWord1: Big Word2: melody [/INST] False </s>"}
2: {'text': "<s>[INST] Determine if the 2 words are semantically similar. Provide 'True' or 'False'. \n[/INST]\n Examples: \nWord1: color, Word2: colour  [/INST]\n True\nWord1: red, Word2: Red  [/INST]\n True\nWord1: happy, Word2: sad  [/INST]\n False\nWord1: dog, Word2: cat  [/INST]\n False\nWord1: H

In [65]:
# Quantization settings passed to from_pretrained: 4-bit NF4 weights with double
# quantization and float16 compute.
# NOTE(review): llm_int8_enable_fp32_cpu_offload is an int8-named option set on a 4-bit
# config — confirm it is needed here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# LoRA config
if modeltype == 'llama':
    # Causal-LM adapter on the attention (q/k/v/o) and MLP (gate/up/down) projections;
    # these names match the Linear4bit modules shown when the loaded model is displayed.
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.08,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
elif modeltype == 'bert':
    # Sequence-classification adapter with a smaller rank; lora_dropout is left at
    # the library default.
    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["query", "key", "value", "dense"],
        task_type=TaskType.SEQ_CLS,
        bias="none"
    )

In [66]:
# Explicit device map: every weight named below is pinned to GPU 0.
#
# num_mapped_layers is an upper bound on the decoder-layer indexes covered by
# the map. NOTE(review): the Llama-3.2-1B model displayed after loading has 16
# layers, so the higher indexes presumably target larger checkpoints — confirm
# whether 80 is still needed.
num_mapped_layers = 80
per_layer_weight_names = (
    "input_layernorm.weight",
    "mlp.down_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.up_proj.weight",
    "post_attention_layernorm.weight",
    "self_attn.k_proj.weight",
    "self_attn.o_proj.weight",
    "self_attn.q_proj.weight",
    "self_attn.v_proj.weight",
)

device_map = {
    f"model.layers.{layer_num}.{weight_name}": 0  # Move to GPU 0
    for layer_num in range(num_mapped_layers)
    for weight_name in per_layer_weight_names
}

# These entries do not depend on the layer index, so they are set once here
# instead of being re-assigned on every loop iteration.
device_map["score.weight"] = 0  # Assign to GPU 0
device_map["score.bias"] = 0

# Move other layers to GPU
device_map["model.embed_tokens.weight"] = 0
device_map["lm_head.weight"] = 0
device_map["model.norm.weight"] = 0

In [67]:
# Load the quantized base model for the selected family.
# NOTE(review): the recorded output warns that 'output_hidden_states' is not a valid
# generation flag and may be ignored — confirm this argument is still wanted.
if modeltype == 'llama':
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        device_map = device_map,
        # Eager attention for now: check the CUDA compute capability version and
        # switch to flash attention if it is above 8.
        attn_implementation = "eager",
        resume_download = True,
        output_hidden_states = True
    )
elif modeltype == 'bert':
    # BERT uses automatic device placement rather than the explicit device_map.
    model = AutoModelForSequenceClassification.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        device_map="auto",
        attn_implementation = "eager",
        resume_download=True,
        output_hidden_states = True
    )
device = "cuda" if torch.cuda.is_available() else "cpu"

# As the last expression of the cell, this also displays the model structure.
# NOTE(review): device placement was already requested via device_map above — confirm
# this extra .to(device) is needed for a quantized model.
model.to(device)

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), 

In [68]:
# Run Python garbage collection, then release PyTorch's cached CUDA memory.
import gc
gc.collect()
torch.cuda.empty_cache()

In [69]:
# Tokenize the prepared dataset; the two model families need different columns.
if modeltype == 'llama':
    # If the tokenizer has no pad token, reuse the EOS token for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Check tokenizer vocab size & special-token IDs
    print(f"Tokenizer vocab size: {tokenizer.vocab_size}")
    print(f"EOS Token ID: {tokenizer.eos_token_id}")
    print(f"PAD Token ID: {tokenizer.pad_token_id}")
    print(f"UNK Token ID: {tokenizer.unk_token_id}")
    # Disabled alternatives, kept for reference:
    # tokenizer.pad_token = tokenizer.eos_token
    # tokenizer.padding_side = "right"

    def tokenize_function(examples):
        """Tokenize a batch of 'text' strings, padded to the longest one in that batch."""
        return tokenizer(
            examples["text"],
            padding="longest",  # Avoid forcing max_length
            truncation=True,
            return_tensors="pt"
        )

    # The 'text' column is dropped, leaving only the tokenizer outputs.
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    print(tokenized_dataset[0])

    # Verify the PAD token and round-trip the first example back to text
    print(f"New PAD Token ID: {tokenizer.pad_token_id}")
    print(f"Decoded Text: {tokenizer.decode(tokenized_dataset[0]['input_ids'])}")
elif modeltype == 'bert':
    # Earlier variant (dynamic padding), kept for reference:
    # def tokenize_function(examples):
    #     tokenized = tokenizer(
    #         examples["text"],
    #         #padding="max_length",
    #         padding="longest",
    #         truncation=True
    #     )
    #     tokenized["label"] = examples["label"]  # keep label
    #     return tokenized
    def tokenize_function(examples):
        """Tokenize a batch to a fixed length of 128 and carry the 'label' column through."""
        # An absent or empty 'text' batch yields no new columns.
        if "text" not in examples or not examples["text"]:
            return {}

        tokenized = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=128
        )
        tokenized["label"] = examples["label"]
        return tokenized

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Cleanup step: keep only rows that actually received input_ids.
    tokenized_dataset = tokenized_dataset.filter(lambda x: "input_ids" in x and x["input_ids"] is not None)

    # Expose only these three columns, as torch tensors.
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])



    print(f"PAD Token ID: {tokenizer.pad_token_id}")
    print(f"Decoded: {tokenizer.decode(tokenized_dataset[0]['input_ids'])}")

Tokenizer vocab size: 128000
EOS Token ID: 128001
PAD Token ID: 128001
UNK Token ID: None
{'input_ids': [128000, 45147, 31868, 65562, 60, 31001, 422, 279, 220, 17, 4339, 527, 5347, 82049, 4528, 13, 40665, 364, 2575, 6, 477, 364, 4139, 4527, 720, 25130, 65562, 933, 26379, 25, 720, 11116, 16, 25, 1933, 11, 9506, 17, 25, 12745, 220, 66028, 65562, 933, 3082, 198, 11116, 16, 25, 2579, 11, 9506, 17, 25, 3816, 220, 66028, 65562, 933, 3082, 198, 11116, 16, 25, 6380, 11, 9506, 17, 25, 12703, 220, 66028, 65562, 933, 3641, 198, 11116, 16, 25, 5679, 11, 9506, 17, 25, 8415, 220, 66028, 65562, 933, 3641, 198, 11116, 16, 25, 29888, 9506, 17, 25, 32571, 66028, 65562, 60, 3082, 694, 82, 29, 128001, 128001, 128001], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [70]:
# Inspect the first tokenized row. Expected keys: input_ids and attention_mask, plus
# label on the bert path only (the llama path has no label column).
print(tokenized_dataset[0].keys())
print(tokenized_dataset[0]["input_ids"][:10])  # check the first 10 tokens


dict_keys(['input_ids', 'attention_mask'])
[128000, 45147, 31868, 65562, 60, 31001, 422, 279, 220, 17]


In [71]:
import time
# Each run writes to its own timestamped directory under a per-model-family root on Drive.
if modeltype == 'llama':
    runs_base_path = '/content/drive/MyDrive/Colab Notebooks/Llama_3/all_runs_of_LoRa'
elif modeltype == 'bert':
    runs_base_path = '/content/drive/MyDrive/Colab Notebooks/BeRT/all_runs_of_LoRa'
output_dir = os.path.join(runs_base_path, time.strftime("%Y-%m-%d_%H-%M-%S"))

# Two TrainingArguments presets, selected by `argtype` from the configuration cell.
# The 'new' preset uses GPT-suggested training args.
# NOTE(review): any other argtype value leaves `training_arguments` undefined.
# NOTE(review): label_names=["text"] names a column that the llama tokenization step
# removes (remove_columns=["text"]) — confirm this is intended.

if argtype == 'new':
    # More epochs, larger batch, lower learning rate with linear decay and warmup.
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=10,  # Increased to allow more learning opportunities
        per_device_train_batch_size=8,  # Increased to utilize more data per iteration
        gradient_accumulation_steps=1,
        optim="paged_adamw_32bit",  # Standard optimizer suitable for small datasets
        save_steps=50,  # Adjusted to reduce frequency of saving
        logging_steps=10,  # More frequent logging for closer monitoring
        learning_rate=2e-5,  # Lowered to prevent overshooting during optimization
        weight_decay=0.01,  # Standard value to prevent overfitting
        fp16=False,
        bf16=False,
        max_grad_norm=1.0,  # Standard value to stabilize training
        max_steps=-1,
        warmup_ratio=0.1,  # Increased to allow the model to adjust gradually
        group_by_length=True,
        lr_scheduler_type="linear",  # Linear scheduler is commonly used
        report_to="tensorboard",
        label_names=["text"]
    )

elif argtype == 'old':
    # Original settings: fewer epochs, small batch, higher constant learning rate.
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        optim="paged_adamw_32bit",
        save_steps=20,
        logging_steps=20,
        learning_rate=5e-4, # TODO: try hyperparameter tuning here and record why this value was chosen
        weight_decay=0.001,
        fp16=False,
        bf16=False,
        max_grad_norm=0.25,
        max_steps=-1,
        warmup_ratio=0.03,
        group_by_length=True,
        lr_scheduler_type="constant",
        report_to="tensorboard",
        label_names=["text"]
    )


In [72]:
# from torch.utils.data import DataLoader

# dataloader = DataLoader(tokenized_dataset, batch_size=4, collate_fn=data_collator)
# batch = next(iter(dataloader))

# print(batch["input_ids"].shape)  # e.g., torch.Size([4, 128])
# print(batch["label"].shape)      # should be torch.Size([4])

In [73]:
# BERT-only smoke test: collate one small batch and list the keys it contains.
# NOTE(review): at this point `data_collator` is still the DataCollatorForLanguageModeling
# from the configuration cell; DataCollatorWithPadding is only created later, inside the
# training cell — confirm this check exercises the collator you intend to train with.
if modeltype == 'bert':
    from torch.utils.data import DataLoader

    dl = DataLoader(tokenized_dataset, batch_size=2, collate_fn=data_collator)
    batch = next(iter(dl))

    print("Batch keys:", batch.keys())  # expected (TODO confirm): input_ids, attention_mask, label


In [74]:

# Switch: True runs the random hyperparameter sweep below; False runs a single
# training job with the `training_arguments` built earlier.
hyper_parameter_tuning = False


if hyper_parameter_tuning:
    import random
    import os

    # Base training arguments: the centre point that each sweep run perturbs.
    # The values mirror the 'new' TrainingArguments preset.
    base_training_arguments = {
        "num_train_epochs": 10,
        "per_device_train_batch_size": 8,
        "gradient_accumulation_steps": 1,
        "save_steps": 50,
        "logging_steps": 10,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "max_grad_norm": 1.0,
        "warmup_ratio": 0.1
    }

    # Function to generate a new hyperparameter set by modifying numeric values by -90% to +90%
    def perturb_hyperparameters(base_hyperparams):
        """Return a copy of ``base_hyperparams`` with each tunable value randomly rescaled.

        Every numeric entry except ``gradient_accumulation_steps`` is multiplied
        by an independent factor drawn uniformly from [0.1, 1.9].

        Integer-valued settings (epochs, batch size, step counts) are rounded and
        clamped to at least 1. Previously they came back as floats, and the
        caller's ``int()`` truncation could turn e.g. 8 * 0.1 = 0.8 into a batch
        size of 0. Float-valued settings are clamped to at least 1e-7.

        Args:
            base_hyperparams: Mapping of hyperparameter name to base value. It
                is not modified.

        Returns:
            A new dict with the same keys and the perturbed values.
        """
        new_hyperparams = base_hyperparams.copy()
        for key, value in base_hyperparams.items():
            # bool is a subclass of int; flags must never be rescaled.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                continue
            if key in ["gradient_accumulation_steps"]:  # Exclude non-tunable params
                continue
            factor = random.uniform(0.1, 1.9)  # -90% to +90%
            scaled = value * factor
            if isinstance(value, int):
                # Count-like settings stay whole numbers and never drop below 1.
                new_hyperparams[key] = max(1, int(round(scaled)))
            else:
                new_hyperparams[key] = max(1e-7, scaled)  # Ensure positive values
        return new_hyperparams
    def generate_model_name(hyperparams):
        """Build a run name that encodes the main hyperparameters.

        Epochs and batch size are truncated to integers; learning rate and weight
        decay use scientific notation; the warmup ratio uses two decimals.

        Args:
            hyperparams: Mapping containing 'num_train_epochs',
                'per_device_train_batch_size', 'learning_rate', 'weight_decay'
                and 'warmup_ratio'.

        Returns:
            A string of the form "trained_model_epochs_.._batch_.._lr_.._wd_.._warmup_..".
        """
        epochs_part = f"epochs_{int(hyperparams['num_train_epochs'])}"
        batch_part = f"batch_{int(hyperparams['per_device_train_batch_size'])}"
        lr_part = f"lr_{hyperparams['learning_rate']:.1e}"
        wd_part = f"wd_{hyperparams['weight_decay']:.2e}"
        warmup_part = f"warmup_{hyperparams['warmup_ratio']:.2f}"
        suffix = "_".join((epochs_part, batch_part, lr_part, wd_part, warmup_part))
        return "trained_model_" + suffix

    # Loop to train 20 models with different hyperparameters.
    # NOTE(review): the same `model` object is handed to SFTTrainer (with peft_config)
    # on every iteration and is never reloaded inside the loop — confirm whether the
    # runs are meant to start from the same base weights.
    # NOTE(review): the sweep always uses SFTTrainer, regardless of `modeltype`.
    for i in range(20):
        # Generate perturbed hyperparameters
        modified_hyperparams = perturb_hyperparameters(base_training_arguments)

        # Create TrainingArguments instance; int() ensures the count-like arguments
        # are integers. Each run writes to its own model_<i> subdirectory.
        # NOTE(review): unlike the presets defined earlier, `optim` is not set here.
        training_arguments = TrainingArguments(
            output_dir=f"{output_dir}/model_{i}",
            num_train_epochs=int(modified_hyperparams["num_train_epochs"]),
            per_device_train_batch_size=int(modified_hyperparams["per_device_train_batch_size"]),
            gradient_accumulation_steps=modified_hyperparams["gradient_accumulation_steps"],
            save_steps=int(modified_hyperparams["save_steps"]),
            logging_steps=int(modified_hyperparams["logging_steps"]),
            learning_rate=modified_hyperparams["learning_rate"],
            weight_decay=modified_hyperparams["weight_decay"],
            fp16=False,
            bf16=False,
            max_grad_norm=modified_hyperparams["max_grad_norm"],
            max_steps=-1,
            warmup_ratio=modified_hyperparams["warmup_ratio"],
            group_by_length=True,
            lr_scheduler_type="linear",
            report_to="tensorboard",
            label_names=["text"]
        )

        # Initialize and train the model
        trainer = SFTTrainer(
            model=model,
            train_dataset=tokenized_dataset,
            peft_config=peft_config,
            processing_class=tokenizer,
            args=training_arguments
        )

        trainer.train()
        # Save the trained model under new_model/<name encoding the hyperparameters>
        new_model_path = os.path.join(new_model,generate_model_name(modified_hyperparams))
        # os.makedirs(new_model_path, exist_ok=True)
        trainer.model.save_pretrained(new_model_path)

        print(f"Model {i} trained and saved at {new_model_path}")


else:
    if modeltype == 'bert':
        from transformers import Trainer
        from transformers import DataCollatorWithPadding
        from peft import get_peft_model

        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        # Apply LoRA (only once)
        model = get_peft_model(model, peft_config)
        from types import MethodType

        # Patch model.forward to ignore unknown kwargs like num_items_in_batch
        def patched_forward(self, *args, **kwargs):
            kwargs.pop("num_items_in_batch", None)  # 👈 this is the key line
            return self.__class__.forward(self, *args, **kwargs)

        # Inject patch
        model.forward = MethodType(patched_forward, model)

        # Define the trainer correctly
        trainer = Trainer(
            model=model,
            train_dataset=tokenized_dataset,
            tokenizer=tokenizer,
            args=training_arguments,
            data_collator=data_collator,
        )
        trainer.train()
        trainer.model.save_pretrained(new_model)
    elif modeltype == 'llama':
        trainer = SFTTrainer(
        model=model,
        train_dataset=tokenized_dataset,
        peft_config=peft_config,
        processing_class=tokenizer,  # Use processing_class instead
        args=training_arguments
        )

        trainer.train()

        trainer.model.save_pretrained(new_model)

Step,Training Loss
10,2.7354
20,2.6596
30,2.5423
40,2.3661
50,2.0788
60,1.7281
70,1.3571
80,0.893
90,0.5103
100,0.3094


Step,Training Loss
10,2.7354
20,2.6596
30,2.5423
40,2.3661
50,2.0788
60,1.7281
70,1.3571
80,0.893
90,0.5103
100,0.3094


## Below are the original single-run training and TensorBoard cells; use them when needed

In [75]:


# # trainer = SFTTrainer(
# #     model=model,
# #     train_dataset=dataset,
# #     peft_config=peft_config,
# #     dataset_text_field="text",
# #     max_seq_length=None,
# #
# #     args=training_arguments,

# # )

# trainer = SFTTrainer(
#     model=model,
#     train_dataset=tokenized_dataset,
#     peft_config=peft_config,
#     processing_class=tokenizer,  # Use processing_class instead
#     args=training_arguments
# )

# trainer.train()
# trainer.model.save_pretrained(new_model)

In [76]:
# #visualise training metrics
# %load_ext tensorboard
# %tensorboard --logdir results/runs

In [77]:
# base_model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map= "auto",
# )
# model = PeftModel.from_pretrained(base_model, new_model)
# model = model.merge_and_unload()

# tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# # tokenizer.padding_side = "right"