### Installing and importing the required modules

In [None]:
import os
import sys
import torch
import random
import dotenv
import pandas as pd
from pathlib import Path
from datasets import Dataset
from typing import Dict, Any
from huggingface_hub import login
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer

# Add the parent directory to the system path
sys.path.append(str(Path().resolve().parent))

# Import local dependencies
from src.utils import get_device, set_seed

In [None]:
# Load environment variables from the ".env" file in the working directory;
# override=True lets values from the file replace variables that are already set
dotenv.load_dotenv(dotenv_path=".env", override=True)

In [None]:
# Read the Hugging Face access token from the environment; surrounding
# whitespace is dropped so a blank value is treated the same as a missing one
HF_TOKEN = (os.getenv('HF_TOKEN') or "").strip()

# Fail fast with a specific exception type when no usable token is available
if not HF_TOKEN:
    raise RuntimeError("Token is not set. Please save the token first.")

# Authenticate with Hugging Face
login(token=HF_TOKEN)

# Reaching this line means login() did not raise
print("Successfully logged in to Hugging Face!")

### Constants, hyperparameters and model configurations

In [None]:
# Seed used wherever reproducibility is required
seed = 42

# Fraction of the samples held out for the test split
test_size = 0.2

# Maximum length of the sequences
max_length = 128

# Hugging Face Hub ID of the base model
model_id = "Qwen/Qwen3-0.6B"

# Dataset location, two levels above the resolved working directory
dataset_path = Path().resolve().parent.parent.joinpath("datasets", "arxiv_dataset.csv")

# Destination of the trained adapter, two levels above the resolved working directory
adapter_path = Path().resolve().parent.parent.joinpath("saved_models", "papers_category_classifier_adapter")

In [None]:
# Set the seed for reproducibility through the project helper
# NOTE(review): which libraries set_seed covers is defined in src.utils — confirm
set_seed(seed)

In [None]:
# Ask the project helper which device to use
device = get_device()

# CUDA-specific features are enabled only when the selected device is a CUDA
# device and PyTorch reports that CUDA is usable
use_cuda = "cuda" in str(device).lower() and torch.cuda.is_available()

# Report the selected device
print(f"Detected device: {device}")

### Data loading

In [None]:
# Read the pipe-delimited CSV into a pandas DataFrame and keep only the
# feature column ("summary") and the label column ("category_description")
dataset = pd.read_csv(
    dataset_path,
    sep = "|",
    quoting = 3,  # 3 is csv.QUOTE_NONE: quote characters are treated as ordinary text
    on_bad_lines = "skip"  # Malformed rows are dropped instead of raising
)[["summary", "category_description"]]

In [None]:
# Show the first rows of the DataFrame (rendered by the notebook as the cell's last expression)
dataset.head()

### Tokenizer

In [None]:
# Load the tokenizer that belongs to the base model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pad on the left so that generated tokens directly follow the prompt
tokenizer.padding_side = "left"

# Fall back to the EOS token for padding only when the tokenizer has no pad
# token of its own. Overwriting an existing pad token would give padding and
# end-of-sequence the same ID, so any logic that masks padding by ID would
# also mask the genuine end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

### Preprocess data

In [None]:
# Wrap the pandas DataFrame in a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)

# Split with a fixed seed and pick the two partitions by name
dataset_splits = hf_dataset.train_test_split(test_size=test_size, seed=seed)
train_dataset = dataset_splits["train"]
test_dataset = dataset_splits["test"]

In [None]:
def _encode_chat(summary: Any, category: Any) -> tuple:
    """Render one (summary, category) pair with the chat template and tokenize it.

    Args:
        summary: Text placed in the user turn.
        category: Category placed in the assistant turn.

    Returns:
        A ``(input_ids, prompt_len)`` tuple. ``input_ids`` is the tokenized
        conversation without padding or truncation; ``prompt_len`` is the number
        of leading tokens that belong to the prompt.
    """
    user_turn = [{"role": "user", "content": summary}]
    assistant_turn = {
        "role": "assistant",
        "content": f"Category: {category}{tokenizer.eos_token}"
    }

    # The prompt is the user turn followed by the template's generation header
    prompt_text = tokenizer.apply_chat_template(
        user_turn, add_generation_prompt=True, tokenize=False
    )
    full_text = tokenizer.apply_chat_template(
        user_turn + [assistant_turn], add_generation_prompt=False, tokenize=False
    )

    prompt_ids = tokenizer(prompt_text)["input_ids"]
    input_ids = tokenizer(full_text)["input_ids"]

    # NOTE(review): assumes the rendered prompt is a token-level prefix of the
    # rendered conversation, so its length marks where the response starts —
    # confirm for chat templates other than the one used here
    return input_ids, min(len(prompt_ids), len(input_ids))


def preprocess(examples: Dict[str, Any], max_length: int = 128) -> Dict[str, Any]:
    """Tokenize a batch of (summary, category) pairs for causal-LM fine-tuning.

    Every example is rendered as a user/assistant conversation, fitted to
    ``max_length`` tokens and padded on the tokenizer's padding side. Labels
    are aligned position by position with ``input_ids``: prompt and padding
    positions are set to -100 and only the assistant response keeps its token
    IDs, including the end-of-sequence token.

    Args:
        examples: Batch with the "summary" and "category_description" columns.
        max_length: Exact length of every returned sequence.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels", each a list
        holding one list of ``max_length`` integers per example.
    """
    # pad_token_id is None (it does not raise) when no pad token is configured
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    pad_left = tokenizer.padding_side == "left"

    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    for summary, category in zip(examples["summary"], examples["category_description"]):
        ids, prompt_len = _encode_chat(summary, category)

        overflow = len(ids) - max_length
        if overflow > 0:
            # Shorten the summary rather than the whole sequence, so the
            # template structure and the response survive the truncation
            summary_ids = tokenizer(summary, add_special_tokens=False)["input_ids"]
            kept_ids = summary_ids[:max(len(summary_ids) - overflow, 0)]
            ids, prompt_len = _encode_chat(tokenizer.decode(kept_ids), category)

            # Decoding and re-encoding can shift the token count slightly;
            # drop any remaining excess from the prompt side
            excess = len(ids) - max_length
            if excess > 0:
                ids = ids[excess:]
                prompt_len = max(prompt_len - excess, 0)

        # Mask the prompt by position; the response keeps its own token IDs
        labels = [-100] * prompt_len + ids[prompt_len:]
        attention_mask = [1] * len(ids)

        # Pad up to max_length; padded positions are ignored by the attention
        # mask and by the loss
        pad_len = max_length - len(ids)
        if pad_left:
            ids = [pad_id] * pad_len + ids
            attention_mask = [0] * pad_len + attention_mask
            labels = [-100] * pad_len + labels
        else:
            ids = ids + [pad_id] * pad_len
            attention_mask = attention_mask + [0] * pad_len
            labels = labels + [-100] * pad_len

        batch["input_ids"].append(ids)
        batch["attention_mask"].append(attention_mask)
        batch["labels"].append(labels)

    # Return the processed inputs
    return batch

In [None]:
# Raw text columns that are no longer needed once the examples are tokenized
text_columns = ["summary", "category_description"]

# Preprocess both splits. The max_length hyperparameter is forwarded explicitly
# so that changing the constant actually changes the sequence length, and the
# raw text columns are removed in the same pass.
tokenized_train_dataset = train_dataset.map(
    preprocess,
    batched = True,
    fn_kwargs = {"max_length": max_length},
    remove_columns = text_columns
)
tokenized_test_dataset = test_dataset.map(
    preprocess,
    batched = True,
    fn_kwargs = {"max_length": max_length},
    remove_columns = text_columns
)

In [None]:
# Draw one tokenized training example at random
random_sample = random.choice(tokenized_train_dataset)

# Positions labelled -100 are not valid token IDs, so they are replaced with
# the pad token before decoding
printable_labels = [
    tokenizer.pad_token_id if token == -100 else token
    for token in random_sample["labels"]
]

# Show the decoded model input
print("INPUT SEQUENCE")
print("-" * 15)
print(tokenizer.decode(random_sample["input_ids"]))

# Show the decoded training target
print("\nOUTPUT SEQUENCE")
print("-" * 15)
print(tokenizer.decode(printable_labels))

### Building the model

In [None]:
# 4-bit quantization is configured only for CUDA devices; on any other device
# the model is loaded without a quantization config
if use_cuda:
    # Compute in bfloat16 when the GPU supports it, otherwise in float16
    compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "nf4",
        bnb_4bit_use_double_quant = True,
        bnb_4bit_compute_dtype = compute_dtype
    )
else:
    quantization_config = None

In [None]:
# Load the base causal language model identified by model_id
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage = True,
    quantization_config = quantization_config,  # None when no quantization was configured
    device_map = "auto"  # Device placement is left to the library
)

In [None]:
# LoRA (low-rank adaptation) settings
lora_config = LoraConfig(
    # Rank of the LoRA matrices
    r = 16,
    # Alpha parameter for scaling
    lora_alpha = 32,
    # Use rank-stabilized LoRA (RSLoRA)
    use_rslora = True,
    # Dropout probability
    lora_dropout = 0.1,
    # Names of the modules that receive LoRA adapters
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

In [None]:
# Apply LoRA (Low-rank adaptation) to the model using the settings in lora_config;
# from here on `model` refers to the wrapped model
model = get_peft_model(model, lora_config)

In [None]:
# Print how many parameters are trainable now that LoRA has been applied
model.print_trainable_parameters()

In [None]:
# Display the model (rendered by the notebook as the cell's last expression)
model

### Training the model

In [None]:
# Mixed precision settings
use_pin_memory = bool(use_cuda)
bf16 = bool(use_cuda and torch.cuda.is_bf16_supported())

# Define the training arguments
training_args = TrainingArguments(
    output_dir = "./checkpoints/papers_category_classifier",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 3e-5,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    num_train_epochs = 10,
    weight_decay = 0.01,
    logging_dir = "./logs",
    logging_strategy = "steps",
    logging_steps = 50,
    save_total_limit = 2,
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
    greater_is_better = False,
    # Name the label column explicitly. The Trainer otherwise infers it from
    # the model's forward() signature, which a PEFT wrapper can hide depending
    # on the transformers version; without it no eval_loss is reported and the
    # best-model selection above cannot find its metric.
    label_names = ["labels"],
    report_to = "none",
    dataloader_pin_memory = use_pin_memory,
    bf16 = bf16
)

In [None]:
# Build the trainer from the training arguments, the model and the two tokenized splits
trainer = Trainer(
    args = training_args,
    model = model,
    eval_dataset = tokenized_test_dataset,
    train_dataset = tokenized_train_dataset
)

# Run the training and keep the object returned by train()
trainer_output = trainer.train()

# Print the training results
print(trainer_output)

### Save the model

In [None]:
# Save the model to adapter_path
# NOTE(review): for a PEFT-wrapped model this presumably writes only the adapter weights and config — confirm
model.save_pretrained(adapter_path)

### Load the fine-tuned model

In [None]:
# Release the GPU memory cached by PyTorch; skipped when CUDA is unavailable
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Load the base model first, with the same model ID and quantization config
# that were used before training
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map = "auto",
    low_cpu_mem_usage = True,
    quantization_config = quantization_config
)

# Load the LoRA adapter saved at adapter_path and attach it to the base model
model = PeftModel.from_pretrained(model, adapter_path)

In [None]:
# Set the model to evaluation mode; the trailing semicolon keeps the notebook
# from rendering the value returned by eval()
model.eval();

### Inference

In [None]:
# Sample paper abstract to classify
summary = "The transportation industry is experiencing vast digitalization as a plethora of technologies are being implemented to improve efficiency, functionality, and safety. Although technological advancements bring many benefits to transportation, integrating cyberspace across transportation sectors has introduced new and deliberate cyber threats. In the past, public agencies assumed digital infrastructure was secured since its vulnerabilities were unknown to adversaries. However, with the expansion of cyberspace, this assumption has become invalid. With the rapid advancement of wireless technologies, transportation systems are increasingly interconnected with both transportation and non-transportation networks in an internet-of-things ecosystem, expanding cyberspace in transportation and increasing threats and vulnerabilities. This study investigates some prominent reasons for the increase in cyber vulnerabilities in transportation. In addition, this study presents various collaborative strategies among stakeholders that could help improve cybersecurity in the transportation industry. These strategies address programmatic and policy aspects and suggest avenues for technological research and development. The latter highlights opportunities for future research to enhance the cybersecurity of transportation systems and infrastructure by leveraging hybrid approaches and emerging technologies."

# Compose the chat-like prompt (a batch holding one conversation). The
# conversation ends with the user turn: add_generation_prompt below opens the
# assistant turn, so an empty assistant message here would render an extra,
# already-closed assistant turn in front of the one the model has to complete.
prompt = [
    [
        {"role": "user", "content": summary}
    ]
]

# Render the conversation with the tokenizer's chat template
messages = tokenizer.apply_chat_template(
    prompt,
    add_generation_prompt = True,  # Append the tokens that open the assistant's reply
    tokenize = False  # Return as plain text, not tokenized IDs yet
)

# Tokenize the formatted prompt
inputs = tokenizer(
    messages,
    truncation = True,
    padding = "longest",
    return_tensors = "pt"
).to(device)

In [None]:
# Disable gradient calculation, which is not needed for inference
with torch.no_grad():
    # Generate the responses for the tokenized prompt
    outputs = model.generate(
        **inputs,
        max_new_tokens = 16,  # Upper bound on the number of generated tokens
        eos_token_id = tokenizer.eos_token_id,  # Generation stops at this token
        pad_token_id = tokenizer.pad_token_id
    )

In [None]:
# Keep only the tokens generated after the prompt and decode them
prompt_token_count = inputs["input_ids"].shape[-1]
gen_ids = outputs[0, prompt_token_count:]
generated_text = tokenizer.decode(gen_ids, skip_special_tokens=True)

# Take the text after the first "Category:" marker; when the marker is absent,
# split() returns the whole text, so one expression covers both cases
category = generated_text.split("Category:", 1)[-1].strip()

# Print the response
print(category)