# Overview

SemScore offers a way to **monitor the semantic similarity between predictions and references throughout training**, providing insight into the model's true progress beyond what traditional loss metrics can offer. In this notebook, we integrate SemScore into a typical Hugging Face `Trainer` training run.

In [None]:
%%capture
!pip install transformers==4.38.2
!pip install accelerate==0.27.2
!pip install datasets==2.18.0
!pip install peft==0.9.0
!pip install bitsandbytes==0.42.0
!pip install trl==0.7.11

In [None]:
import os
import torch
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Credentials come from Kaggle's secret store, so no token is hard-coded here.
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

# Run configuration is exported through environment variables so that both the
# Python cells (os.getenv) and the shell cells (${...}) can read it.
os.environ.update(
    {
        # Weights & Biases API key, project and run name.
        "WANDB_API_KEY": user_secrets.get_secret("WANDB_API_KEY"),
        "WANDB_PROJECT": "Fine-tuning tinyllama1-1b-chat",
        "WANDB_NAME": "ft-tinyllama1-1b-chat-on-oasst2-top4k",
        # Identifiers of the base model and the fine-tuning dataset.
        "MODEL_NAME": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "DATASET": "g-ronimo/oasst2_top4k_en",
    }
)

# Switch off the flash and memory-efficient scaled-dot-product attention kernels.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

In [None]:
# Ask accelerate to estimate the memory needed for the model named by the
# MODEL_NAME environment variable, treating it as a transformers model.
!accelerate estimate-memory ${MODEL_NAME} --library_name transformers

# Loading Dataset

In [None]:
from datasets import load_dataset

# Take at most the first 5,000 training rows. The slice belongs inside the split
# string ("train[:5000]"); a list of splits would return a list of datasets
# instead of a single Dataset.
dataset = load_dataset(os.getenv("DATASET"), split="train[:5000]")
# Hold out 10% for evaluation; the fixed seed keeps the split identical across re-runs.
dataset = dataset.train_test_split(test_size=0.1, seed=2024)
dataset

# Loading Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

torch.manual_seed(2024)

# The model id is stored in the MODEL_NAME environment variable, so the key has
# to be passed to os.getenv as a string.
tokenizer = AutoTokenizer.from_pretrained(os.getenv("MODEL_NAME"), use_fast=False)
# AutoModelForCausalLM loads the checkpoint together with its language-modelling
# head, which supervised fine-tuning requires.
model = AutoModelForCausalLM.from_pretrained(
    os.getenv("MODEL_NAME"),
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.device

# Training

We define a custom evaluation function and attach it to the trainer through a callback.

In [None]:
from transformers import TrainerCallback




In [None]:
from transformers import TrainingArguments, BitsAndBytesConfig, set_seed
from peft import LoraConfig
from trl import SFTTrainer, setup_chat_format, DataCollatorForCompletionOnlyLM

# Apply TRL's chat format to the model and tokenizer; the collator templates
# below ("<|im_start|>user" / "<|im_start|>assistant") rely on its markers.
model, tokenizer = setup_chat_format(model, tokenizer)
# Use the unknown token for padding when no pad token exists or when it is the
# same as the end-of-sequence token.
if tokenizer.pad_token in [None, tokenizer.eos_token]:
    tokenizer.pad_token = tokenizer.unk_token

args = TrainingArguments(
    output_dir=os.getenv("WANDB_NAME"),  # checkpoints are written under the run name
    evaluation_strategy="steps",
    label_names=["labels"],
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,  # 16 x 4 = 64 samples per optimizer step per device
    save_steps=250,
    eval_steps=250,
    logging_steps=1,
    learning_rate=1e-5,
    num_train_epochs=2,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",
    # NOTE(review): the model is loaded with torch_dtype=torch.bfloat16 while
    # training requests fp16 mixed precision - confirm this combination is intended.
    fp16=True,
    gradient_checkpointing=True,
    group_by_length=True,
)

# The completion-only collator receives both templates so it can tell user
# turns from assistant turns.
completion_collator = DataCollatorForCompletionOnlyLM(
    instruction_template="<|im_start|>user",
    response_template="<|im_start|>assistant",
    tokenizer=tokenizer,
    mlm=False,
)

sf_trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=completion_collator,
    max_seq_length=512,
    args=args,
    callbacks=[],  # no callbacks are registered here yet
)
