### Installing and importing the required modules

In [1]:
import os
import sys
import torch
import random
import dotenv
import pandas as pd
from pathlib import Path
from datasets import Dataset
from typing import Dict, Any
from huggingface_hub import login
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer, TextStreamer

# Add the parent directory to the system path
sys.path.append(str(Path().resolve().parent.parent))

# Import local dependencies
from src.utils import get_device, set_seed

Skipping import of cpp extensions due to incompatible torch version 2.8.0+cu128 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info


In [2]:
# Load environment variables
dotenv.load_dotenv(dotenv_path=".env", override=True)

False

In [3]:
# Extract the hugging face token from the user data
HF_TOKEN = os.getenv('HF_TOKEN')

# Check if the HF token has been provided
if not HF_TOKEN:
  # Raise an exception if the HF token was not provided
  raise Exception("Token is not set. Please save the token first.")

# Authenticate with hugging face
login(HF_TOKEN)

# Login successful
print("Successfully logged in to Hugging Face!")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Successfully logged in to Hugging Face!


### Constants, hyperparameters and model configurations

In [4]:
seed = 42 # Seed for reproducibility
test_size = 0.2 # Train-test split percentage
max_length = 448 # Maximum length of the sequences
model_id = "Qwen/Qwen3-4B-Instruct-2507" # The model ID of the Llama model
dataset_path = Path().resolve().parent.parent / "datasets" / "arxiv_dataset.csv" # Path to the dataset
adapter_path = Path().resolve().parent.parent / "saved_models" / "papers_category_classifier_adapter" # Path to save the trained model to

In [5]:
# Set the seed for reproducibility
set_seed(seed)

In [6]:
# Get the device available on the system
device = get_device()
use_cuda = torch.cuda.is_available() and "cuda" in str(device).lower()

# Print the detected device
print(f"Detected device: {device}")

Detected device: cuda


### Data loading

In [7]:
# Load the dataset into a pandas DataFrame
dataset = pd.read_csv(
    dataset_path,
    delimiter = "|",
    quoting = 3,  # Handle quotes around text
    on_bad_lines = "skip"  # Skip problematic lines if necessary
)

# Keep only the relevant columns
dataset = dataset[[
    "summary", # Feature
    "category_description" # Label
]]

In [8]:
# Show a subset of the samples
dataset.head()

Unnamed: 0,summary,category_description
0,"We present PERSE, a method for building an ani...",Computer Vision and Pattern Recognition
1,We propose action-agnostic point-level (AAPL) ...,Computer Vision and Pattern Recognition
2,We study $\textit{sparse singular value certif...,Data Structures and Algorithms
3,Mixture-of-Agents (MoA) has recently been prop...,Information Theory
4,"We introduce self-invoking code generation, a ...",Software Engineering


### Tokenizer

In [9]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Set the padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

### Preprocess data

In [10]:
# Convert the Pandas DataFrame to a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)

# Train-test split
train_dataset, test_dataset = hf_dataset.train_test_split(test_size=test_size, seed=seed).values()

In [None]:


def build_chat(user_text: str, answer_text: str) -> tuple[list[int], list[int], list[int]]:
    # Build full conversation with target
    conversation = [
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": f"Category: {answer_text}"}
    ]

    # Apply the chat template
    full_text = tokenizer.apply_chat_template(conversation, add_generation_prompt=False, tokenize=False)

    # Build prompt only (without the answer) and apply the chat template
    prompt_only = [{"role": "user", "content": user_text}]
    prompt_text = tokenizer.apply_chat_template(prompt_only, add_generation_prompt=True, tokenize=False)

	# Tokenize both full and prompt texts
    full = tokenizer(full_text, max_length=max_length, padding="max_length", truncation=True)
    prompt = tokenizer(prompt_text, max_length=max_length, padding="max_length", truncation=True)

	# Extract input ids and attention masks
    input_ids = full["input_ids"]
    attn = full["attention_mask"]

	# Create labels, initialized to -100 (ignore index)
    labels = [-100] * len(input_ids)
    
    # Determine the starting index of the assistant's response
    start = len(tokenizer(prompt_text, add_special_tokens=False)["input_ids"])
    
    # Fill labels with input ids for the assistant portion only, ignore padding
    for i in range(start, len(input_ids)):
        if attn[i] == 1:
            labels[i] = input_ids[i]

	# Return the input ids, attention mask, and labels
    return input_ids, attn, labels

def preprocess(examples: Dict[str, Any]) -> Dict[str, Any]:
    # Preprocess the examples to build input ids, attention masks, and labels
    inputs, masks, labels = [], [], []
    
    # Iterate through each example and build the chat inputs
    for u, y in zip(examples["summary"], examples["category_description"]):
        # Build chat inputs
        ids, attn, labs = build_chat(u, y)
        
        # Append to the respective lists
        inputs.append(ids)
        masks.append(attn)
        labels.append(labs)
        
	# Return the processed inputs as a dictionary
    return {"input_ids": inputs, "attention_mask": masks, "labels": labels}

In [12]:
# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(preprocess, batched=True, remove_columns=["summary","category_description"])
tokenized_test_dataset  = test_dataset.map(preprocess,  batched=True, remove_columns=["summary","category_description"])

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [13]:
# Select a random training sample
random_sample = random.choice(tokenized_train_dataset)

# Print a random sequence
print(tokenizer.decode(random_sample["input_ids"]))

<|im_start|>user
Large Language Models (LLMs) have emerged as transformative tools in artificial intelligence, capable of processing and understanding extensive human knowledge to enhance problem-solving across various domains. This paper explores the potential of LLMs to drive the discovery of symbolic solutions within scientific and engineering disciplines, where such solutions are crucial for advancing theoretical and practical applications. We propose a novel framework that utilizes LLMs in an evolutionary search methodology, augmented by a dynamic knowledge library that integrates and refines insights in an \textit{open-ended manner}. This approach aims to tackle the dual challenges of efficiently navigating complex symbolic representation spaces and leveraging both existing and newly generated knowledge to foster open-ended innovation. By enabling LLMs to interact with and expand upon a knowledge library, we facilitate the continuous generation of novel solutions in diverse forms

### Building the model

In [14]:
# Define the quantization configurations of the model (only for CUDA devices)
quantization_config = None
if use_cuda:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "nf4",
        bnb_4bit_compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
        bnb_4bit_use_double_quant = True
    )

In [15]:
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage = True,
    quantization_config = quantization_config,
    device_map = "auto"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
# LoRA (Low-rank adaptation configurations)
lora_config = LoraConfig(
    r = 16,                        # Rank of the LoRA matrices
    lora_alpha = 32,               # Alpha parameter for scaling
    use_rslora = True,             # Use RSLora
    lora_dropout = 0.1,            # Dropout probability
    target_modules = [             # Target modules to apply LoRA
        "q_proj",
        "k_proj", 
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ]
)

In [17]:
# Apply LoRA (Low-rank adaptation) to the model
model = get_peft_model(model, lora_config)

In [18]:
# Print trainable parameters
model.print_trainable_parameters()

trainable params: 33,030,144 || all params: 4,055,498,240 || trainable%: 0.8145


In [19]:
# Print the model
model

PeftModel(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 2560)
        (layers): ModuleList(
          (0-35): 36 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit

### Trainig the model

In [20]:
# Mixed precision settings
use_pin_memory = bool(use_cuda)
bf16 = bool(use_cuda and torch.cuda.is_bf16_supported())

# Define the training arguments
training_args = TrainingArguments(
    output_dir = "./checkpoints/papers_category_classifier",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 3e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 10,
    weight_decay = 0.01,
    logging_dir = "./logs",
    logging_strategy = "steps",
    logging_steps = 50,
    save_total_limit = 2,
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
    greater_is_better = False,
    report_to = "none",
    dataloader_pin_memory = use_pin_memory,
    bf16 = bf16
)

In [None]:
# Instantiate the trainer to train the model
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_test_dataset
)

# Training the model
trainer_output = trainer.train()

# Pretty print the training results
print(trainer_output)

### Save the model

In [22]:
# Saving the adapter to the destination path
model.save_pretrained(str(adapter_path))

### Load the fine-tuned model

In [23]:
# Clear GPU cache
if torch.cuda.is_available():
	torch.cuda.empty_cache()

In [24]:
# Load the base model first
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map = "auto",
    low_cpu_mem_usage = True,
    quantization_config = quantization_config
)

# Load the LoRA adapter and attach it to the base model
model = PeftModel.from_pretrained(model, adapter_path)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [25]:
# Set the model to evaluation mode
model.eval();

### Inference

In [26]:
# Tokenize a sample input for chat-like generation
summary = "The transportation industry is experiencing vast digitalization as a plethora of technologies are being implemented to improve efficiency, functionality, and safety. Although technological advancements bring many benefits to transportation, integrating cyberspace across transportation sectors has introduced new and deliberate cyber threats. In the past, public agencies assumed digital infrastructure was secured since its vulnerabilities were unknown to adversaries. However, with the expansion of cyberspace, this assumption has become invalid. With the rapid advancement of wireless technologies, transportation systems are increasingly interconnected with both transportation and non-transportation networks in an internet-of-things ecosystem, expanding cyberspace in transportation and increasing threats and vulnerabilities. This study investigates some prominent reasons for the increase in cyber vulnerabilities in transportation. In addition, this study presents various collaborative strategies among stakeholders that could help improve cybersecurity in the transportation industry. These strategies address programmatic and policy aspects and suggest avenues for technological research and development. The latter highlights opportunities for future research to enhance the cybersecurity of transportation systems and infrastructure by leveraging hybrid approaches and emerging technologies."

# Compose the chat-like prompt
prompt = [
    [
        {"role": "user", "content": summary},
        {"role": "assistant", "content": ""}
    ]
]

# Apply chat template if supported
messages = tokenizer.apply_chat_template(
    prompt,
    add_generation_prompt = True,
    tokenize = False
)

# Tokenize the formatted prompt
inputs = tokenizer(
    messages,
    return_tensors = "pt"
).to(device)

In [31]:
# Disable gradient calculation
with torch.no_grad():
    # Generate the responses
    outputs = model.generate(
        **inputs,
        max_new_tokens = 16,
        streamer = TextStreamer(tokenizer, skip_prompt = True)
    )

Category: Systems and Control<|im_end|>


In [32]:
# Decode the model output
gen_ids = outputs[0, inputs["input_ids"].shape[-1]:]
generated_text = tokenizer.decode(gen_ids, skip_special_tokens=True)

# Extract the generated category from the response
category = generated_text.split("Category:", 1)[-1].strip() if "Category:" in generated_text else generated_text.strip()

# Print the response
print(category)

Systems and Control
