### Installing and importing the required modules

In [None]:
import re
import sys
import torch
import random
import pandas as pd
from pathlib import Path
from datasets import Dataset
from typing import Dict, Any
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer, TextStreamer

# Add the project root (two levels above the working directory) to the system path
sys.path.append(str(Path().resolve().parent.parent))

# Import local dependencies
from src.hf import hf_login
from src.utils import get_device, set_seed
from src.data_processing import build_chat, generate_response

### Setting up the environment

In [None]:
# Log in to Hugging Face through the project helper imported from src.hf
hf_login()

In [None]:
# Ask the project helper which compute device should be used
device = get_device()

# CUDA-only features are gated on this flag: the device must be a CUDA device
# and CUDA must actually be usable from torch
device_name = str(device).lower()
use_cuda = torch.cuda.is_available() and "cuda" in device_name

# Report the selected device
print(f"Detected device: {device}")

### Constants, hyperparameters and model configurations

In [None]:
# Project root: two levels above the current working directory
project_root = Path().resolve().parent.parent

# Seed for reproducibility
seed = 42

# Train-test split percentage
test_size = 0.2

# Maximum length of the sequences
max_length = 448

# The model ID
model_id = "Qwen/Qwen3-1.7B"

# Whether to save the model after training
save_trained_model = False

# Path to the dataset
dataset_path = project_root / "datasets" / "arxiv_dataset.csv"

# Path to save the trained model to
adapter_path = project_root / "saved_models" / "papers_category_classifier_adapter"

In [None]:
# Seed the run through the project helper (src.utils.set_seed) for reproducibility
set_seed(seed)

### Data loading

In [None]:
# Load the pipe-delimited dataset into a pandas DataFrame
dataset = pd.read_csv(
    dataset_path,
    delimiter = "|",
    quoting = 3,  # csv.QUOTE_NONE: quote characters are kept as literal text
    on_bad_lines = "skip"  # Skip rows that have more fields than expected
)

# Keep only the relevant columns and drop incomplete rows: a missing label would
# otherwise be formatted as the literal text "nan" inside the <category> tag, and
# a missing summary would reach build_chat as a non-string value.
# The index is reset so the filtered DataFrame keeps a plain RangeIndex and
# Dataset.from_pandas does not carry the old index along as an extra column
dataset = dataset[[
    "summary", # Feature
    "category_description" # Label
]].dropna().reset_index(drop = True)

In [None]:
# Preview the first rows of the DataFrame (rendered as the cell output)
dataset.head()

### Tokenizer

In [None]:
# Fetch the tokenizer that belongs to the chosen base model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reuse the end-of-sequence token for padding when the tokenizer defines none
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

### Preprocess data

In [None]:
# Wrap the pandas DataFrame in a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)

# Split once with a fixed seed, then pick each partition by its key
splits = hf_dataset.train_test_split(test_size = test_size, seed = seed)
train_dataset = splits["train"]
test_dataset = splits["test"]

In [None]:
def preprocess(examples: Dict[str, Any]) -> Dict[str, Any]:
    """Turn a batch of raw examples into input ids, attention masks and labels.

    Every summary is paired with its category; the category is wrapped as
    ``<category>...</category>`` and both are handed to ``build_chat`` together
    with the module-level ``tokenizer`` and ``max_length``.

    Args:
        examples: Batch holding parallel "summary" and "category_description"
            sequences.

    Returns:
        A dictionary with the keys "input_ids", "attention_mask" and "labels",
        each containing one entry per example in input order.
    """
    # One (input ids, attention mask, labels) triple per example
    encoded = [
        build_chat(
            tokenizer,
            max_length,
            user_text = summary,
            answer_text = f"<category>{category}</category>",
        )
        for summary, category in zip(examples["summary"], examples["category_description"])
    ]

    # Regroup the per-example triples into one column per field
    return {
        "input_ids": [ids for ids, _, _ in encoded],
        "attention_mask": [attn for _, attn, _ in encoded],
        "labels": [labs for _, _, labs in encoded],
    }

In [None]:
# Raw text columns that are dropped once they have been tokenized
raw_columns = ("summary", "category_description")

# Apply the same batched preprocessing to both splits
tokenized_train_dataset = train_dataset.map(
    preprocess,
    batched = True,
    remove_columns = list(raw_columns)
)
tokenized_test_dataset = test_dataset.map(
    preprocess,
    batched = True,
    remove_columns = list(raw_columns)
)

In [None]:
# Draw one tokenized training example at random
random_sample = random.choice(tokenized_train_dataset)

# Show the whole decoded sequence
separator = "-" * 20
print("FULL SEQUENCE:")
print(separator)
print(tokenizer.decode(random_sample["input_ids"]))

# Show the decoded labels, leaving out every position marked with -100
print("\nLABEL:")
print(separator)
label_token_ids = [token_id for token_id in random_sample["labels"] if token_id != -100]
print(tokenizer.decode(label_token_ids))

### Building the model

In [None]:
# 4-bit NF4 quantization with double quantization is configured only on CUDA;
# on any other device the configuration stays None.
# The compute dtype is bfloat16 when the GPU supports it, float16 otherwise
quantization_config = (
    BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "nf4",
        bnb_4bit_use_double_quant = True,
        bnb_4bit_compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    )
    if use_cuda
    else None
)

In [None]:
# Options used to instantiate the base model; quantization_config is None
# whenever CUDA is not in use
model_load_options = {
    "device_map": "auto",
    "low_cpu_mem_usage": True,
    "quantization_config": quantization_config,
}

# Load the base causal language model
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

In [None]:
# Names of the modules that receive LoRA adapters
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# LoRA (low-rank adaptation) configuration
lora_config = LoraConfig(
    r = 16,                                # Rank of the LoRA matrices
    lora_alpha = 32,                       # Alpha parameter for scaling
    lora_dropout = 0.1,                    # Dropout probability
    use_rslora = True,                     # Use RSLora
    target_modules = lora_target_modules   # Target modules to apply LoRA
)

In [None]:
# Apply LoRA (Low-rank adaptation) to the model using lora_config; the returned
# PEFT-wrapped model replaces `model`
model = get_peft_model(model, lora_config)

In [None]:
# Print the trainable parameters of the model now that LoRA has been applied
model.print_trainable_parameters()

In [None]:
# Display the model: as the last expression of the cell, its representation is
# shown as the cell output
model

### Training the model

In [None]:
# Device-dependent switches: pinned memory and bf16 are enabled only on CUDA,
# and bf16 additionally requires support from the GPU
use_pin_memory = True if use_cuda else False
bf16 = bool(torch.cuda.is_bf16_supported()) if use_cuda else False

# Define the training arguments
training_args = TrainingArguments(
    # Optimization
    num_train_epochs = 10,
    learning_rate = 3e-5,
    weight_decay = 0.01,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,

    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    output_dir = "./checkpoints/papers_category_classifier",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    save_total_limit = 2,

    # Restore the checkpoint with the lowest evaluation loss when training ends
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
    greater_is_better = False,

    # Logging
    logging_dir = "./logs",
    logging_strategy = "steps",
    logging_steps = 10,
    report_to = "none",

    # Hardware-dependent settings
    bf16 = bf16,
    dataloader_pin_memory = use_pin_memory
)

In [None]:
# Build the trainer from the LoRA-wrapped model and the tokenized splits
trainer = Trainer(
    args = training_args,
    model = model,
    eval_dataset = tokenized_test_dataset,
    train_dataset = tokenized_train_dataset
)

# Run the training loop and keep the value it returns
trainer_output = trainer.train()

# Print the training results
print(trainer_output)

### Save the model

In [None]:
if save_trained_model:
    # Write the adapter to the destination path
    adapter_dir = str(adapter_path)
    model.save_pretrained(adapter_dir)

    # Reload the base model with the same loading options used before training
    base_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config = quantization_config,
        low_cpu_mem_usage = True,
        device_map = "auto"
    )

    # Attach the saved LoRA adapter to the freshly loaded base model
    model = PeftModel.from_pretrained(base_model, adapter_path)

### Inference

In [None]:
# Clear the GPU cache; this is skipped when CUDA is unavailable
cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.empty_cache()

In [None]:
# Set the model to evaluation mode (the trailing semicolon keeps the notebook
# from echoing the returned object)
model.eval();

In [None]:

# Sample paper summary used as the user message for inference.
# The text is assembled from adjacent string literals, so every fragment except
# the last must end with a space to keep the sentences separated
summary = (
    "The transportation industry is experiencing vast digitalization as a plethora of technologies are being implemented to improve efficiency, functionality, and safety. "
    "Although technological advancements bring many benefits to transportation, integrating cyberspace across transportation sectors has introduced new and deliberate cyber threats. "
    "In the past, public agencies assumed digital infrastructure was secured since its vulnerabilities were unknown to adversaries. "
    "However, with the expansion of cyberspace, this assumption has become invalid. With the rapid advancement of wireless technologies, transportation systems are increasingly interconnected with both transportation and non-transportation networks in an internet-of-things ecosystem, expanding cyberspace in transportation and increasing threats and vulnerabilities. "
    "This study investigates some prominent reasons for the increase in cyber vulnerabilities in transportation. In addition, this study presents various collaborative strategies among stakeholders that could help improve cybersecurity in the transportation industry. "
    "These strategies address programmatic and policy aspects and suggest avenues for technological research and development. "
    "The latter highlights opportunities for future research to enhance the cybersecurity of transportation systems and infrastructure by leveraging hybrid approaches and emerging technologies."
)

# Generation settings: streaming enabled, at most 16 newly generated tokens
generation_options = {
    "stream": True,
    "max_new_tokens": 16,
}

# Generate a response for the sample summary with the project helper
response = generate_response(
    model = model,
    tokenizer = tokenizer,
    user_message = summary,
    **generation_options
)

In [None]:
# Extract the generated category from the response.
# A complete <category>...</category> pair is preferred; DOTALL lets the payload
# span line breaks. Generation is capped at a small number of new tokens, so the
# closing tag may be cut off: in that case everything after the opening tag up
# to the next "<" (or the end of the text) is taken instead of printing raw tags
match = (
    re.search(r"<category>(.*?)</category>", response, flags = re.DOTALL)
    or re.search(r"<category>([^<]*)", response)
)
category = match.group(1).strip() if match else response.strip()

# Print the extracted category
print(category)