### Importing the required modules

In [None]:
import re
import sys
import torch
import random
import pandas as pd
from pathlib import Path
from datasets import Dataset
from typing import Dict, Any, cast
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import PreTrainedModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer

# Add the parent directory to the system path
sys.path.append(str(Path().resolve().parent.parent))

# Import local dependencies
from src.hf import hf_login
from src.utils import get_device, set_seed

### Setting up the environment

In [None]:
# Authenticate with Hugging Face through the project helper imported from src.hf
# (how the token is obtained is defined in that helper — verify there)
hf_login()

In [None]:
# Resolve the compute device through the project helper and report it
device = get_device()
print(f"Detected device: {device}")

# Flag used further down to gate the CUDA-only settings
# (4-bit quantization, pinned dataloader memory, bf16)
use_cuda = torch.cuda.is_available() and ("cuda" in str(device).lower())

### Constants, hyperparameters and model configurations

In [None]:
# Reproducibility and data-split settings
seed = 42        # Random seed passed to set_seed and to the train/test split
test_size = 0.2  # Fraction of the samples held out for evaluation
max_length = 64  # Token length every sequence is padded/truncated to

# Base checkpoint to fine-tune
model_id = "Qwen/Qwen3-0.6B"

# Locations resolved two levels above the current working directory
project_root = Path().resolve().parent.parent
dataset_path = project_root / "datasets" / "emoji_translation_dataset.csv"  # Input CSV
adapter_path = project_root / "saved_models" / "emoji_translation_adapter"  # Output folder for the trained adapter

In [None]:
# Seed the run through the project helper so results are repeatable
# (which libraries get seeded is defined in src.utils.set_seed — verify there)
set_seed(seed)

### Data loading

In [None]:
# Load the dataset into a pandas DataFrame
dataset = pd.read_csv(
    dataset_path,
    delimiter = ",",  # Use ',' as the delimiter
    quoting = 3,  # 3 == csv.QUOTE_NONE: quote characters are NOT interpreted and stay in the text as-is
    on_bad_lines = "skip"  # Silently drop lines that have too many fields
)
# NOTE(review): with QUOTE_NONE a quoted field that contains a comma is split into
# extra fields, so such a row can end up skipped — confirm this is intended for this CSV.

# Keep only the relevant columns
dataset = dataset[[
    "text", # Feature
    "emoji" # Label
]]

In [None]:
# Lift the column-width limit so long texts are displayed in full
pd.set_option("display.max_colwidth", None)

# Preview the first rows of the DataFrame
dataset.head()

### Tokenizer

In [None]:
# Load the tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Padding to a fixed length requires a pad token; reuse the EOS token when none is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

### Preprocess data

In [None]:
def build_chat(user_text: str, answer_text: str) -> tuple[list[int], list[int], list[int]]:
    """Tokenize one (text, emoji) pair as a chat and mask everything but the answer.

    The system/user/assistant conversation is rendered with the tokenizer's chat
    template, padded or truncated to ``max_length`` tokens, and labelled so that
    only the assistant reply contributes to the loss. Uses the module-level
    ``tokenizer`` and ``max_length``.

    Args:
        user_text: Source text to translate.
        answer_text: Target emoji sequence, used as the assistant reply.

    Returns:
        ``(input_ids, attention_mask, labels)`` as three lists of equal length.
        ``labels`` holds ``-100`` (the ignore index) on prompt and padding
        positions and the token id on the attended positions that follow the prompt.
    """
    # Build the chat conversation
    system_prompt = {"role": "system", "content": "You are a helpful assistant that translates text into emojis."}
    user_prompt = {"role": "user", "content": f"Translate the following text into emojis: {user_text}"}
    assistant_response = {"role": "assistant", "content": answer_text}

    # Render the full conversation (prompt + answer)
    full_text = tokenizer.apply_chat_template(
        [system_prompt, user_prompt, assistant_response],
        add_generation_prompt=False,
        tokenize=False,
        enable_thinking=False,
    )

    # Render the prompt alone, ending where the assistant reply would start
    prompt_text = tokenizer.apply_chat_template(
        [system_prompt, user_prompt],
        add_generation_prompt=True,
        tokenize=False,
        enable_thinking=False,
    )

    # The chat template has already written every special token, so both
    # tokenizer calls disable add_special_tokens. This keeps the prompt length
    # measured below aligned with the positions of the full sequence.
    full = tokenizer(
        full_text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        add_special_tokens=False,
    )
    input_ids = full["input_ids"]
    attn = full["attention_mask"]

    # Index of the first answer token; assumes the prompt starts at position 0
    # (i.e. padding is appended on the right)
    start = len(tokenizer(prompt_text, add_special_tokens=False)["input_ids"])

    # Supervise only attended positions at or after `start`; when the prompt alone
    # fills the whole sequence, every label stays -100
    labels = [
        token_id if position >= start and mask == 1 else -100
        for position, (token_id, mask) in enumerate(zip(input_ids, attn))
    ]

    # Return the input ids, attention mask, and labels
    return input_ids, attn, labels

def preprocess(examples: Dict[str, Any]) -> Dict[str, Any]:
    """Turn a batch of raw rows into model-ready features.

    Every ``text``/``emoji`` pair of the batch is passed through ``build_chat``
    and the per-sample results are collected column-wise.

    Args:
        examples: Batch with parallel ``"text"`` and ``"emoji"`` sequences.

    Returns:
        Dictionary with the keys ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"``, each holding one list per input pair.
    """
    # Column-wise accumulator for the processed batch
    batch: Dict[str, Any] = {"input_ids": [], "attention_mask": [], "labels": []}

    # Tokenize each pair and file the three results under their column
    for source_text, target_emoji in zip(examples["text"], examples["emoji"]):
        token_ids, attention, label_ids = build_chat(source_text, target_emoji)
        batch["input_ids"].append(token_ids)
        batch["attention_mask"].append(attention)
        batch["labels"].append(label_ids)

    return batch

In [None]:
# Wrap the DataFrame in a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)

# Tokenize in batches and drop the raw columns so only the model features remain
dataset = hf_dataset.map(
    preprocess,
    batched=True,
    remove_columns=hf_dataset.column_names,
)

# Hold out a seeded evaluation split
splits = dataset.train_test_split(test_size=test_size, seed=seed)
train_dataset = splits["train"]
test_dataset = splits["test"]

In [None]:
# Draw one training example at random as a sanity check
random_sample = random.choice(train_dataset)

# Keep only the supervised positions (everything else is masked with -100)
label_ids = [token_id for token_id in random_sample["labels"] if token_id != -100]

# Show the whole tokenized sequence
print("FULL SEQUENCE:")
print("-" * 20)
print(tokenizer.decode(random_sample["input_ids"]))

# Show the part that carries a training label
print("\nLABEL:")
print("-" * 20)
print(tokenizer.decode(label_ids))

### Building the model

In [None]:
# 4-bit NF4 quantization is configured on CUDA only; otherwise no quantization config is used
if use_cuda:
    # Compute in bfloat16 when the GPU supports it, float16 otherwise
    compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=compute_dtype,
    )
else:
    quantization_config = None

In [None]:
# Load the base causal language model; quantization_config is None off CUDA
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)

In [None]:
# LoRA (low-rank adaptation) configuration
lora_config = LoraConfig(
    task_type = "CAUSAL_LM",       # Declare the task so PEFT wraps the model with its causal-LM class
    r = 16,                        # Rank of the LoRA matrices
    lora_alpha = 32,               # Alpha parameter for scaling
    use_rslora = True,             # Use rank-stabilized LoRA scaling
    lora_dropout = 0.1,            # Dropout probability
    target_modules = [             # Target modules to apply LoRA
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ]
)

In [None]:
# Wrap the base model with the LoRA adapters described by lora_config
model = get_peft_model(model, lora_config)

In [None]:
# Report the trainable parameters of the wrapped model
model.print_trainable_parameters()

In [None]:
# Display the model structure (a bare expression is rendered as the cell output)
model

### Training the model

In [None]:
# Hardware-dependent switches: both are enabled on CUDA only
use_pin_memory = bool(use_cuda)
bf16 = bool(use_cuda and torch.cuda.is_bf16_supported())

# Training arguments for the SFT trainer
sft_config = SFTConfig(
    # Optimization
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    num_train_epochs=50,
    # Batching
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    # Logging and evaluation
    logging_steps=10,
    eval_strategy="steps",
    # Hardware
    dataloader_pin_memory=use_pin_memory,
    bf16=bf16,
)

In [None]:
# Build the trainer from the LoRA-wrapped model and the tokenized splits;
# the cast only satisfies the type checker, the object itself is unchanged
trainer = SFTTrainer(
    model=cast(PreTrainedModel, model),
    args=sft_config,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run the fine-tuning loop
trainer_output = trainer.train()

# Print the object returned by the training run
print(trainer_output)

### Save the model

In [None]:
# Save the PEFT-wrapped model to adapter_path (converted to str for save_pretrained)
model.save_pretrained(str(adapter_path))

### Load the fine-tuned model

In [None]:
# Release cached GPU memory before the model is loaded again
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Reload the base checkpoint with the same loading options used before training
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    device_map="auto",
)

# Attach the saved LoRA adapter on top of the base model
model = PeftModel.from_pretrained(model, adapter_path)

In [None]:
# Switch the model to evaluation mode; the trailing semicolon keeps the notebook
# from echoing the return value of eval()
model.eval();

### Inference

In [None]:
# Compose the chat prompt with the same system/user messages used for training.
# No assistant message is included: add_generation_prompt=True already opens the
# assistant turn that the model has to complete.
prompt = [
    {"role": "system", "content": "You are a helpful assistant that translates text into emojis."},
    {"role": "user", "content": "Translate the following text into emojis: I love programming and coffee."}
]

# Render the prompt with the chat template, using the same thinking setting as the training data
messages = tokenizer.apply_chat_template(
    prompt,
    add_generation_prompt = True,
    tokenize = False,
    enable_thinking = False
)

# Tokenize the formatted prompt; the template already contains the special tokens,
# so the tokenizer must not add them a second time
inputs = tokenizer(
    messages,
    return_tensors = "pt",
    add_special_tokens = False
).to(device)

In [None]:
# Stream the decoded tokens while they are generated, leaving out the prompt
streamer = TextStreamer(tokenizer, skip_prompt=True)

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=16, streamer=streamer)

In [None]:
# Keep only the newly generated tokens (everything after the prompt) and decode them
gen_ids = outputs[0, inputs["input_ids"].shape[-1]:]
generated_text = tokenizer.decode(gen_ids, skip_special_tokens=True)

# The reply is the emoji translation itself, so only surrounding whitespace is removed
translation = generated_text.strip()

# Print the translation
print(translation)