# Load the model

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub id of the base checkpoint that gets fine-tuned below.
checkpoint = "HuggingFaceTB/SmolLM-135M"

# Tokenizer for the checkpoint; the end-of-sequence token doubles as padding.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token

# Causal-LM weights; device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
)

# Report which device the model was placed on.
print(f"Model is hosted on: {model.device}")

Model is hosted on: cuda:0


# Load the data

In [3]:
import pickle as pkl

In [5]:
# NOTE: pickle can execute arbitrary code while loading; only unpickle
# dataset files that come from a trusted source.
# Context managers guarantee each file handle is closed as soon as its
# contents have been read.
with open("Dataset/ChatTrain.pkl", "rb") as train_file:
    train_data = pkl.load(train_file)
with open("Dataset/ChatTest.pkl", "rb") as val_file:
    val_data = pkl.load(val_file)

# Show both splits as the cell output.
train_data, val_data

(Dataset({
     features: ['text'],
     num_rows: 8625
 }),
 Dataset({
     features: ['text'],
     num_rows: 455
 }))

# Set up the training infrastructure

In [6]:
from trl import SFTTrainer, SFTConfig

In [7]:
# Hyper-parameters for the supervised fine-tuning run.
sft_config = SFTConfig(
    # Optimisation schedule: 8 samples x 16 accumulation steps per device
    # for every optimizer update.
    max_steps=200,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,
    # Evaluation and logging cadence (in optimizer steps).
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    # Checkpointing: where to write, how often, and how many to keep.
    output_dir="./ChatTraining_Checkpoints",
    save_steps=60,
    save_total_limit=5,
    # No external experiment tracker.
    report_to="none",
)

In [8]:
# Build the SFT trainer from the model, both dataset splits and the config.
# The tokenizer prepared earlier (pad token set to EOS) is passed explicitly
# as `processing_class`; otherwise the trainer loads its own tokenizer from
# the model name and the notebook's tokenizer setup is never used.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_data,
    eval_dataset=val_data,
    processing_class=tokenizer,
)

Adding EOS to train dataset: 100%|███████████████████████████████████████| 8625/8625 [00:00<00:00, 16452.68 examples/s]
Tokenizing train dataset: 100%|███████████████████████████████████████████| 8625/8625 [00:04<00:00, 1997.97 examples/s]
Truncating train dataset: 100%|█████████████████████████████████████████| 8625/8625 [00:00<00:00, 479188.70 examples/s]
Adding EOS to eval dataset: 100%|██████████████████████████████████████████| 455/455 [00:00<00:00, 12774.50 examples/s]
Tokenizing eval dataset: 100%|██████████████████████████████████████████████| 455/455 [00:00<00:00, 2039.89 examples/s]
Truncating eval dataset: 100%|█████████████████████████████████████████████| 455/455 [00:00<00:00, 90816.04 examples/s]


# Train the model

In [9]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
# To resume an interrupted run instead, call:
#     trainer.train(resume_from_checkpoint=True)
train_output = trainer.train()
train_output

Step,Training Loss,Validation Loss
50,1.8747,1.243692
100,1.1435,1.165151
150,1.0661,1.144392
200,1.03,1.139591


TrainOutput(global_step=200, training_loss=1.2785726928710937, metrics={'train_runtime': 1667.8538, 'train_samples_per_second': 15.349, 'train_steps_per_second': 0.12, 'total_flos': 1260704267373312.0, 'train_loss': 1.2785726928710937})

In [10]:
import gc

import torch

# Collect unreachable Python objects first so any CUDA tensors they still
# reference are freed; empty_cache() can only hand back cached blocks that
# no live tensor occupies.
gc.collect()
torch.cuda.empty_cache()

# Save the model

In [11]:
# Output path handed to save_model for the fine-tuned weights.
final_model_dir = "SmolLM-Our-Instruct-vxx"
trainer.save_model(final_model_dir)