### Importing the required modules

In [None]:
import sys
import torch
from pathlib import Path
from typing import Optional
from trl import SFTConfig, SFTTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Add the directory two levels above the current working directory to the
# import path so the `src` package imported below can be resolved
# (presumably the project root — confirm against the repository layout)
sys.path.append(str(Path().resolve().parent.parent))

# Import local dependencies
from src.utils import get_device, set_seed
from src.data_processing import generate_response
from src.hf import hf_login, load_hf_dataset, dataset_to_pandas

### Setting up the environment

In [None]:
# Log in to Hugging Face through the project helper from `src.hf`
# (how credentials are obtained is defined in that helper, not in this notebook)
hf_login()

In [None]:
# Resolve the compute device through the project helper
device = get_device()

# CUDA-only options further down are enabled only when the resolved device is
# a CUDA device and PyTorch reports CUDA as available
use_cuda = "cuda" in str(device).lower() and torch.cuda.is_available()

# Report which device was selected
print(f"Detected device: {device}")

### Constants, hyperparameters and model configurations

In [None]:
# Seed used for reproducibility
seed = 42

# Fraction of the dataset held out by the train-test split
test_size = 0.2

# Base model ID and the dataset name on the Hugging Face Hub
model_id = "Qwen/Qwen2.5-0.5B"
dataset_name = "banghua/DL-SFT-Dataset"

# Path to save the trained model to: a folder under `saved_models`, two levels
# above the working directory, named after the last segment of the model ID
model_path = (
    Path().resolve().parent.parent
    / "saved_models"
    / (model_id.rsplit("/", 1)[-1] + "_instruct")
)

In [None]:
# Apply the configured seed through the project helper from `src.utils`
# (which random number generators it seeds is defined in that helper)
set_seed(seed)

### Data loading

In [None]:
# Load the "train" split of the dataset named in `dataset_name` through the
# project helper from `src.hf`
dataset = load_hf_dataset(dataset_name, split="train")

In [None]:
# Convert the dataset to a pandas DataFrame for easier inspection
# (the conversion itself is done by the project helper from `src.hf`)
dataset_df = dataset_to_pandas(dataset)

# Show the first few rows; `display` is provided by the notebook environment
display(dataset_df.head())

### Preprocess data

In [None]:
# Hold out `test_size` of the data for evaluation, using the shared seed so the
# split is reproducible
split_datasets = dataset.train_test_split(test_size=test_size, seed=seed)

# Pick the two partitions by name rather than by position
train_dataset = split_datasets["train"]
test_dataset = split_datasets["test"]

### Tokenizer

In [None]:
# Load the tokenizer that belongs to the base model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Install a minimal fallback chat template only when the tokenizer has none.
# - Written as adjacent string literals so the rendered text does not depend on
#   source indentation or on Jinja whitespace trimming.
# - System and user turns end with a blank line; assistant turns end with the
#   tokenizer's own EOS token (the `eos_token` template variable) and a newline,
#   instead of a hard-coded "<|endoftext|>" that may not be this tokenizer's EOS.
# - Honors `add_generation_prompt` by opening an assistant turn at the end of
#   the prompt.
if not tokenizer.chat_template:
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{% if message['role'] == 'system' %}"
        "System: {{ message['content'] }}\n\n"
        "{% elif message['role'] == 'user' %}"
        "User: {{ message['content'] }}\n\n"
        "{% elif message['role'] == 'assistant' %}"
        "Assistant: {{ message['content'] }} {{ eos_token }}\n"
        "{% endif %}"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )

### Building the model

In [None]:
# Load the pretrained causal language model for `model_id`, letting the loader
# choose device placement ("auto") and using the low-CPU-memory loading path
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    low_cpu_mem_usage=True,
)

### Training the model

In [None]:
# Mixed precision settings: pinned host memory and bf16 are enabled only on CUDA,
# and bf16 additionally requires that the GPU reports support for it
use_pin_memory = bool(use_cuda)
bf16 = bool(use_cuda and torch.cuda.is_bf16_supported())

# SFTTrainer config
sft_config = SFTConfig(
    output_dir = str(model_path),  # write training artifacts under the declared save path
    seed = seed,  # keep the trainer's seed in sync with the notebook-level seed
    learning_rate = 5e-5,
    num_train_epochs = 5,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    gradient_accumulation_steps = 16,  # 2 x 16 = 32 samples per optimizer step per device
    logging_steps = 10,
    eval_strategy = "steps",  # no eval_steps given, so evaluation follows logging_steps
    dataloader_pin_memory = use_pin_memory,
    bf16 = bf16,
    weight_decay = 0.01,
    lr_scheduler_type = "linear"
)

In [None]:
# Initialize the SFTTrainer with the model, config, both data partitions and
# the tokenizer used to process the samples
sft_trainer = SFTTrainer(
    model = model,
    args = sft_config,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    processing_class = tokenizer
)

# Start the training process
sft_trainer.train()

# Persist the fine-tuned model to the path declared for it in the constants
# cell; without this the training result exists only in memory
sft_trainer.save_model(str(model_path))

### Inference

In [None]:
# Set the model to evaluation mode before generating responses; the trailing
# semicolon keeps the notebook from echoing the call's return value
model.eval();

In [None]:
def test_model_with_questions(model, tokenizer, questions: list[str], system_message: Optional[str] = None):
    # Iterate through each question and generate a response
    for i, question in enumerate(questions, 1):
        # Generate the response
        response = generate_response(
            model = model, 
            tokenizer = tokenizer, 
            user_message = question, 
            system_message = system_message, 
            max_new_tokens = 100
        )
        
        # Print the input question and the model's response
        print(f"\nModel Input {i}:\n{question}\nModel Output {i}:\n{response}\n")

In [None]:
# Define the prompts used as a quick qualitative check of the model:
# a short definition, a small arithmetic task and a conceptual comparison
questions = [
    "Give me an 1-sentence introduction of LLM.",
    "Calculate 1+1-1",
    "What's the difference between thread and process?"
]

# Run the model on each prompt and print its responses
# (no system message is passed, so the function's default of None is used)
test_model_with_questions(model, tokenizer, questions)