In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the base model and tokenizer
model_id = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load the training dataset
dataset = load_dataset("csv", data_files="sarcasm_data/sarcasm.csv", split="train")
print(dataset)

Dataset({
    features: ['question', 'answer'],
    num_rows: 200
})


In [3]:
dataset[11]

{'question': 'What year is it?',
 'answer': "Oh, we’re still in 2024, believe it or not. Time flies when you're... asking questions like this."}

In [4]:
# Define a function to apply the chat template
def apply_chat_template(example):
    messages = [
        {"role": "user", "content": example['question']},
        {"role": "assistant", "content": example['answer']}
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return {"prompt": prompt}

# Apply the function to the dataset
new_dataset = dataset.map(apply_chat_template)

In [5]:
new_dataset

Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 200
})

In [6]:
new_dataset[11]

{'question': 'What year is it?',
 'answer': "Oh, we’re still in 2024, believe it or not. Time flies when you're... asking questions like this.",
 'prompt': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 11 Oct 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat year is it?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nOh, we’re still in 2024, believe it or not. Time flies when you're... asking questions like this.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"}

In [7]:
new_dataset = new_dataset.train_test_split(0.05)

In [8]:
new_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'prompt'],
        num_rows: 190
    })
    test: Dataset({
        features: ['question', 'answer', 'prompt'],
        num_rows: 10
    })
})

In [9]:
# Tokenize the data
def tokenize_function(example):
    tokens = tokenizer(example['prompt'], padding="max_length", truncation=True, max_length=128)
    # Set padding token labels to -100 to ignore them in loss calculation
    tokens['labels'] = [
        -100 if token == tokenizer.pad_token_id else token for token in tokens['input_ids']
    ]
    return tokens

tokenized_dataset = new_dataset.map(tokenize_function)
print(tokenized_dataset)

Map: 100%|██████████████████████████████████████████████████████████| 190/190 [00:00<00:00, 5854.31 examples/s]
Map: 100%|████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 2490.83 examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 190
    })
    test: Dataset({
        features: ['question', 'answer', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
})





In [10]:
tokenized_dataset = tokenized_dataset.remove_columns(['question', 'answer', 'prompt'])
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 190
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
})


In [11]:
# Define training arguments
model.train()
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # to evaluate during training
    eval_steps=40,
    logging_steps=40,
    save_steps=150,
    per_device_train_batch_size=2,  # Adjust based on your hardware
    per_device_eval_batch_size=2,
    num_train_epochs=2,  # Modify based on your dataset size
    fp16=False,  # Enable mixed precision if available
    save_total_limit=2,  # Only save the last two checkpoints
    report_to="tensorboard",
    log_level="info",
    learning_rate=1e-5,
    use_mps_device=True,
    max_grad_norm=2
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer)

# Train the model
trainer.train()

# Save the model and tokenizer
trainer.save_model("./fine-tuned-model_1b")
tokenizer.save_pretrained("./fine-tuned-model_1b")

***** Running training *****
  Num examples = 190
  Num Epochs = 2
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 190
  Number of trainable parameters = 1,235,814,400


Step,Training Loss,Validation Loss
40,1.5064,1.376111
80,1.1202,0.990449
120,0.5752,1.016172
160,0.3758,0.979048



***** Running Evaluation *****
  Num examples = 10
  Batch size = 2

***** Running Evaluation *****
  Num examples = 10
  Batch size = 2

***** Running Evaluation *****
  Num examples = 10
  Batch size = 2
Saving model checkpoint to ./results/checkpoint-150
Configuration saved in ./results/checkpoint-150/config.json
Configuration saved in ./results/checkpoint-150/generation_config.json
Model weights saved in ./results/checkpoint-150/model.safetensors
tokenizer config file saved in ./results/checkpoint-150/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-150/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 10
  Batch size = 2
Saving model checkpoint to ./results/checkpoint-190
Configuration saved in ./results/checkpoint-190/config.json
Configuration saved in ./results/checkpoint-190/generation_config.json
Model weights saved in ./results/checkpoint-190/model.safetensors
tokenizer config file saved in ./results/checkpoint-190/tokenizer_conf

('./fine-tuned-model/tokenizer_config.json',
 './fine-tuned-model/special_tokens_map.json',
 './fine-tuned-model/tokenizer.json')