### Installing and importing the required modules

In [None]:
%%capture
!pip install evaluate bitsandbytes

In [None]:
import torch
import random
import numpy as np
import pandas as pd
from evaluate import load
from typing import Dict, Any
from datasets import Dataset
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer

### Setting up the environment

In [None]:
# Mount Google Drive at /content/drive; the dataset and adapter paths used by this notebook live under it
drive.mount('/content/drive')

In [None]:
# Read the Hugging Face access token stored in the Colab user data
HF_TOKEN = userdata.get('HF_TOKEN')

if HF_TOKEN:
    # A token is available: authenticate, then confirm
    login(HF_TOKEN)
    print("Successfully logged in to Hugging Face!")
else:
    # Missing or empty token: stop here with an explicit error
    raise Exception("Token is not set. Please save the token first.")

### Constants, hyperparameters and model configurations

In [None]:
# --- Experiment settings ---
seed = 42  # Random seed handed to the train/test split
test_size = 0.2  # Fraction of the samples held out as the test split
max_length = 512  # Token length the sequences are padded/truncated to

# --- Model and hardware ---
model_id = "meta-llama/Llama-3.2-3B-Instruct"  # Hugging Face Hub ID of the base model
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")  # Use the GPU when one is available

# --- Google Drive locations ---
dataset_path = "/content/drive/MyDrive/Colab Notebooks/FineTuningLLM/datasets/arxiv_dataset.csv"  # CSV file holding the dataset
adapter_path = "/content/drive/MyDrive/Colab Notebooks/FineTuningLLM/saved_models/papers_category_classifier_adapter"  # Directory the trained adapter is saved to

In [None]:
# Report which device was selected above
print(f"Detected device: {device}")

### Data loading

In [None]:
# Columns used downstream: the model input and its target
relevant_columns = [
    "summary", # Feature
    "category_description" # Label
]

# Read the pipe-separated file and keep only the relevant columns
dataset = pd.read_csv(
    dataset_path,
    delimiter = "|",
    quoting = 3,  # 3 is csv.QUOTE_NONE: quote characters are treated as ordinary text
    on_bad_lines = "skip"  # Rows that cannot be parsed are dropped instead of raising
)[relevant_columns]

In [None]:
# Preview the first rows of the dataset
dataset.head()

### Tokenizer

In [None]:
# Load the tokenizer that belongs to the base model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reuse the end-of-sequence token as the padding token
# (consequence: pad_token_id and eos_token_id are the same id from here on)
tokenizer.pad_token = tokenizer.eos_token

### Preprocess data

In [None]:
# Wrap the pandas DataFrame in a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)

# Split once, then pick each part by name
splits = hf_dataset.train_test_split(test_size=test_size, seed=seed)
train_dataset = splits["train"]
test_dataset = splits["test"]

In [None]:
def preprocess(examples) -> dict:
    """Turn a batch of raw samples into causal-LM training features.

    Every sample becomes ONE sequence: the chat-formatted prompt followed by
    the target answer ``Category: <label><eos>``, right-padded to
    ``max_length``. The loss is restricted to the answer tokens by setting
    the label of every prompt and padding position to -100.

    The labels are NOT shifted here: Hugging Face causal-LM heads shift the
    labels internally, so ``labels`` must be aligned position-by-position
    with ``input_ids``.

    Args:
        examples: Batch mapping with the columns ``summary`` and
            ``category_description`` (one list entry per sample).

    Returns:
        Dict with the keys ``input_ids``, ``labels`` and ``attention_mask``;
        each value is a list holding one ``max_length``-long list per sample.
    """
    # Compose the prompts
    prompts = [
        [
            {"role": "user", "content": f"Given the following summary, predict the category: {summary}"},
            {"role": "assistant", "content": ""}
        ]
        for summary in examples["summary"]
    ]

    # Render the prompts as plain text with the chat template
    chat_templates = tokenizer.apply_chat_template(
        prompts,
        add_generation_prompt = True,
        tokenize = False
    )

    # The answer the model has to learn, terminated by the end-of-sequence token
    target_responses = [
        f"Category: {response}{tokenizer.eos_token}"
        for response in examples["category_description"]
    ]

    # Tokenize prompt and answer separately so the prompt length is known.
    # NOTE(review): the rendered template may already start with a BOS token, in which
    # case the tokenizer adds a second one here — kept unchanged to match the way the
    # inference prompt is tokenized; confirm whether that is intended.
    prompt_ids_batch = tokenizer(chat_templates)["input_ids"]
    # No special tokens for the answer: it continues the prompt, it does not start a new sequence
    response_ids_batch = tokenizer(target_responses, add_special_tokens = False)["input_ids"]

    pad_id = tokenizer.pad_token_id
    input_ids, labels, attention_mask = [], [], []

    for prompt_ids, response_ids in zip(prompt_ids_batch, response_ids_batch):
        # Always keep the answer; the prompt only gets the room that is left
        response_ids = response_ids[:max_length]
        prompt_ids = prompt_ids[:max_length - len(response_ids)]

        real_length = len(prompt_ids) + len(response_ids)
        pad_length = max_length - real_length

        # Prompt + answer, right-padded
        input_ids.append(prompt_ids + response_ids + [pad_id] * pad_length)

        # Only the answer tokens (including the closing EOS) contribute to the loss
        labels.append([-100] * len(prompt_ids) + response_ids + [-100] * pad_length)

        # Built from the lengths rather than from `!= pad_id`: the pad token is also the
        # EOS token, so comparing ids would hide genuine end-of-turn tokens
        attention_mask.append([1] * real_length + [0] * pad_length)

    # Return the output data
    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask
    }

In [None]:
# Raw text columns that are no longer needed once the features exist
raw_columns = ["summary", "category_description"]

# Tokenize each split in batches and drop the raw columns right away
tokenized_train_dataset = train_dataset.map(preprocess, batched=True).remove_columns(raw_columns)
tokenized_test_dataset = test_dataset.map(preprocess, batched=True).remove_columns(raw_columns)

In [None]:
# Pick one training sample at random
random_sample = random.choice(tokenized_train_dataset)

# -100 is not a token id, so swap it for the pad token before decoding the labels
printable_labels = [
    tokenizer.pad_token_id if token == -100 else token
    for token in random_sample["labels"]
]

# Print the input sequence first, then the label sequence
for title, token_ids in (
    ("INPUT SEQUENCE", random_sample["input_ids"]),
    ("\nOUTPUT SEQUENCE", printable_labels),
):
    print(title)
    print("-"*15)
    print(tokenizer.decode(token_ids))

### Building the model

In [None]:
# Define the quantization configuration of the model (only for CUDA devices):
# 4-bit weights of type NF4 with double quantization, computing in float16
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True
)

In [None]:
# Load the base model with the quantization configuration defined above;
# device_map = "auto" leaves the device placement to the library
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage = True,
    quantization_config = quantization_config,
    device_map = "auto"
)

In [None]:
# LoRA (low-rank adaptation) configuration
lora_config = LoraConfig(
    r = 16,                        # Rank of the LoRA matrices
    lora_alpha = 32,               # Alpha parameter for scaling
    target_modules = ["q_proj", "k_proj", "v_proj"],  # Modules that receive LoRA adapters
    use_rslora = True,             # Enable the rank-stabilized LoRA option
    lora_dropout = 0.1             # Dropout probability
)

In [None]:
# Wrap the base model with the LoRA adapters described by lora_config
model = get_peft_model(model, lora_config)

In [None]:
# Print a summary of the trainable parameters
model.print_trainable_parameters()

In [None]:
# Display the model structure (the notebook renders the value of the last expression)
model

### Training the model

In [None]:
# Load the accuracy metric
accuracy_metric = load("accuracy")

# Define a custom function to compute the metrics
def compute_metrics(eval_pred: Any) -> Dict[str, Any]:
    """Compute the token-level accuracy on the supervised label positions.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is either the raw
            model output with one more dimension than ``labels`` (a trailing
            vocabulary axis), or already-selected token ids with the same
            shape as ``labels`` (e.g. reduced by a
            ``preprocess_logits_for_metrics`` hook). ``labels`` holds the
            target token ids, with -100 on positions to ignore.

    Returns:
        The dictionary produced by the accuracy metric.
    """
    # Extract the logits and the labels from the output of the model
    logits, labels = eval_pred

    # Some models return several outputs; the logits come first
    if isinstance(logits, tuple):
        logits = logits[0]

    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Reduce raw logits to the most likely token per position (skip if already reduced)
    if logits.ndim == labels.ndim + 1:
        predictions = np.argmax(logits, axis=-1)
    else:
        predictions = logits

    # In a causal LM the output at position t predicts the token at position t + 1
    predictions = predictions[:, :-1]
    references = labels[:, 1:]

    # Score only supervised positions; indexing with the mask also flattens to 1-D
    supervised = references != -100

    # Compute and return the accuracy
    return accuracy_metric.compute(
        predictions = predictions[supervised],
        references = references[supervised]
    )

In [None]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir = "./papers_category_classifier",  # Where checkpoints are written
    eval_strategy = "epoch",  # Evaluate once per epoch
    save_strategy = "epoch",  # Save a checkpoint once per epoch
    logging_dir = "./logs",
    logging_strategy = "epoch",  # Log once per epoch
    learning_rate = 3e-4,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    num_train_epochs = 10,
    weight_decay = 0.01,
    save_total_limit = 10,  # Upper bound on the number of checkpoints kept
    report_to = "none",  # Do not report to any experiment tracker
    fp16 = True  # Train with float16 mixed precision
)

In [None]:
# Instantiate the trainer to train the model
# NOTE(review): no `compute_metrics` is passed here, so the accuracy function
# defined in this notebook is not called during evaluation — confirm whether that is intended
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_test_dataset
)

# Training the model
trainer.train()

### Save the model

In [None]:
# Save the trained adapter to the destination path on the mounted drive
model.save_pretrained(adapter_path)

### Load the fine-tuned model

In [None]:
# Clear the GPU cache before loading the model again
# NOTE(review): if training ran in this session, `model` and `trainer` are still
# referenced at this point, so their GPU memory presumably stays allocated — confirm
# whether they should be deleted first
torch.cuda.empty_cache()

In [None]:
# Load the base model first, with the same quantization configuration used for training
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map = "auto",
    low_cpu_mem_usage = True,
    quantization_config = quantization_config
)

# Load the saved LoRA adapter from adapter_path and attach it to the base model
model = PeftModel.from_pretrained(model, adapter_path)

In [None]:
# Set the model to evaluation mode (the trailing semicolon keeps the notebook from echoing the return value)
model.eval();

### Inference

In [None]:
# Tokenize a sample input as raw text (no chat template) and move it to the device
# NOTE(review): `inputs` is assigned again by the chat-formatted cell below, which replaces this value
inputs = tokenizer(
    "The transportation industry is experiencing vast digitalization as a plethora of technologies are being implemented to improve efficiency, functionality, and safety. Although technological advancements bring many benefits to transportation, integrating cyberspace across transportation sectors has introduced new and deliberate cyber threats. In the past, public agencies assumed digital infrastructure was secured since its vulnerabilities were unknown to adversaries. However, with the expansion of cyberspace, this assumption has become invalid. With the rapid advancement of wireless technologies, transportation systems are increasingly interconnected with both transportation and non-transportation networks in an internet-of-things ecosystem, expanding cyberspace in transportation and increasing threats and vulnerabilities. This study investigates some prominent reasons for the increase in cyber vulnerabilities in transportation. In addition, this study presents various collaborative strategies among stakeholders that could help improve cybersecurity in the transportation industry. These strategies address programmatic and policy aspects and suggest avenues for technological research and development. The latter highlights opportunities for future research to enhance the cybersecurity of transportation systems and infrastructure by leveraging hybrid approaches and emerging technologies.",
    return_tensors = "pt"
).to(device)

In [None]:
# Sample summary to classify with chat-like generation
summary = "The transportation industry is experiencing vast digitalization as a plethora of technologies are being implemented to improve efficiency, functionality, and safety. Although technological advancements bring many benefits to transportation, integrating cyberspace across transportation sectors has introduced new and deliberate cyber threats. In the past, public agencies assumed digital infrastructure was secured since its vulnerabilities were unknown to adversaries. However, with the expansion of cyberspace, this assumption has become invalid. With the rapid advancement of wireless technologies, transportation systems are increasingly interconnected with both transportation and non-transportation networks in an internet-of-things ecosystem, expanding cyberspace in transportation and increasing threats and vulnerabilities. This study investigates some prominent reasons for the increase in cyber vulnerabilities in transportation. In addition, this study presents various collaborative strategies among stakeholders that could help improve cybersecurity in the transportation industry. These strategies address programmatic and policy aspects and suggest avenues for technological research and development. The latter highlights opportunities for future research to enhance the cybersecurity of transportation systems and infrastructure by leveraging hybrid approaches and emerging technologies."

# Compose the chat-like prompt (same message layout as the one built in preprocess)
prompts = [
    [
        {"role": "user", "content": f"Given the following summary, predict the category: {summary}"},
        {"role": "assistant", "content": ""}
    ]
]

# Render the prompt as text with the tokenizer's chat template
formatted_prompts = tokenizer.apply_chat_template(
    prompts,
    add_generation_prompt = True,  # Ask the template to add the generation prompt
    tokenize = False  # Return plain text, not token IDs yet
)

# Tokenize the formatted prompt, left-padded to max_length, and move it to the device
inputs = tokenizer(
    formatted_prompts,
    truncation = True,
    padding = "max_length",
    max_length = max_length,
    padding_side = "left",
    return_tensors = "pt"
).to(device)

In [None]:
# Show the tokenized prompt decoded back to text (special and padding tokens are not skipped)
print(tokenizer.decode(inputs["input_ids"][0]))

In [None]:
# Generate the response
outputs = model.generate(
    inputs["input_ids"],
    attention_mask = inputs["attention_mask"],
    max_new_tokens = 100,
    eos_token_id = tokenizer.eos_token_id,
    pad_token_id = tokenizer.pad_token_id,
    # temperature / top_k / top_p only take effect when sampling is enabled, so request it
    # explicitly instead of relying on the default stored with the checkpoint
    do_sample = True,
    temperature = 0.7,
    top_k = 50,
    top_p = 0.9,
    repetition_penalty = 1.2
)

In [None]:
# Decode the first returned sequence, dropping special tokens
# NOTE(review): outputs[0] presumably contains the prompt tokens followed by the
# generated ones, so the printed text would include the prompt — confirm
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the decoded text
print(generated_text)