# Configure GPUs

In [1]:
!nvidia-smi

Fri Mar 21 13:24:22 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.77                 Driver Version: 565.77         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     Off |   00000000:19:00.0 Off |                  N/A |
| 27%   27C    P5              8W /  250W |     368MiB /  11264MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     Off |   00

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
import torch

In [4]:
DEVICE = "cuda:0"
torch.cuda.set_device(DEVICE)

# Load DBPedia dataset

In [5]:
from datasets import load_dataset

ds = load_dataset("fancyzhx/dbpedia_14")
CLASS_LABELS = ds['train'].features['label'].names
CLASS_LABELS

['Company',
 'EducationalInstitution',
 'Artist',
 'Athlete',
 'OfficeHolder',
 'MeanOfTransportation',
 'Building',
 'NaturalPlace',
 'Village',
 'Animal',
 'Plant',
 'Album',
 'Film',
 'WrittenWork']

## Reduce dataset size via sampling
Let's obtain the first **n** samples from each class

In [6]:
import numpy as np

def get_n_samples_per_class(dataset, n, shuffle = False):
    """
        Given a dataset, obtain up to n samples from each class
        and return a smaller dataset containing all the samples.

        The dataset is sorted by its 'label' column and the first n rows of
        each class are kept. A class with fewer than n rows contributes all of
        its rows; rows are never taken from a neighbouring class to make up
        the difference.

        Args:
            dataset (Dataset): The dataset to sample. Must have a 'label' column.
            n (int): Maximum number of samples to extract from each class.
            shuffle (bool): If True, the rows are shuffled before the per-class
                selection and the result is shuffled again; if False, the result
                is ordered by class. NOTE: Dataset.shuffle() hangs indefinitely on Nix.

        Returns:
            sample (Dataset): The sampled dataset.
    """
    # Shuffling first means the n rows kept per class are not simply the first n in file order.
    source = dataset.shuffle() if shuffle else dataset  # Dataset.shuffle() hangs indefinitely on Nix - No idea why.
    ds_sorted = source.sort('label')

    # After sorting, every class is one contiguous run: record where each run starts and how long it is.
    _, class_starts, class_counts = np.unique(
        ds_sorted['label'], return_index=True, return_counts=True
    )

    # Clamp each run to the class size so a small class cannot spill into the next one.
    selected = [
        int(row)
        for start, count in zip(class_starts, class_counts)
        for row in range(start, start + min(n, count))
    ]

    sample = ds_sorted.select(selected)

    if shuffle:
        sample = sample.shuffle()  # Dataset.shuffle() hangs indefinitely on Nix - No idea why.
    return sample

In [7]:
# Fraction (0-1) of the training set to keep; 0.1 keeps a tenth of each class.
dataset_proportion = 0.1 # What percentage of the dataset do you want to use?

# NOTE(review): 40000 is assumed to be the number of training rows per class in the
# full dataset — confirm against the dataset card before changing datasets.
samples_per_class = int(40000 * dataset_proportion)

# Overwrites ds['train'] in place with the per-class subset (ordered by class, not shuffled).
ds['train'] = get_n_samples_per_class(ds['train'], samples_per_class, shuffle=False)

In [8]:
ds['test'] = get_n_samples_per_class(ds['test'], 6, shuffle=False)

## Format training data as a supervised fine-tuning dataset

To fine-tune our LLM, we will use the ``trl`` library with
the ``SFTTrainer`` class. This class expects data to be in
a specific format outlined on [this documentation page](https://huggingface.co/docs/trl/main/en/dataset_formats#standard).

The format in a nutshell is:
- ``prompt``: The user input
- ``completion``: The expected LLM response. In our case, this will be the name of the appropriate article category.

In [9]:
from datasets import Value

# Change the label data type to string. (0 -> "0")
ds['train'] = ds['train'].cast_column("label", Value(dtype='string'))

# Substitute all label strings with the label name. ("0" -> "Company")
def preprocess_sample(sample, class_labels):
    """
    Replace a sample's numeric label with its class name and trim its text.

    The sample is modified in place and also returned.

    Args:
        sample (dict): A row with a 'label' convertible to int and a 'content' string.
        class_labels (list<str>): Class names indexed by label ID.

    Returns:
        sample (dict): The same row, with 'label' set to the class name and
            'content' stripped of leading/trailing whitespace.
    """
    label_index = int(sample['label'])
    sample['label'] = class_labels[label_index]

    trimmed_content = sample['content'].strip()
    sample['content'] = trimmed_content
    return sample
ds['train'] = ds['train'].map( lambda x : preprocess_sample(x, class_labels=CLASS_LABELS) )

Map:   0%|          | 0/56000 [00:00<?, ? examples/s]

In [10]:
ds['train'] = ds['train'].rename_column("content", "prompt")
ds['train'] = ds['train'].rename_column("label", "completion")
ds['train'] = ds['train'].remove_columns(["title"])
ds['train'] = ds['train'].shuffle()

# Load Baseline LLM

In [11]:
DEVICE_MAP = "cuda:0"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Same quantization configuration as QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4", # QLoRA uses 4-bit NormalFloat precision,
    bnb_4bit_use_double_quant = True, # QLoRA uses double quantising,
    # NOTE(review): compute dtype is float32 here, while the text-generation pipeline
    # built later in this notebook is given torch.float16 — confirm which is intended.
    bnb_4bit_compute_dtype = torch.float32
)

# The base model is loaded 4-bit quantised and placed according to DEVICE_MAP.
model_id = "Qwen/Qwen2.5-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map=DEVICE_MAP)

# Tokenizer matching the base model; reused by the pipeline and the trainer below.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
import transformers, torch

# Wrap the already-loaded model and tokenizer in a text-generation pipeline for evaluation.
# NOTE(review): `model` is an instantiated, quantised model at this point; torch_dtype and
# device_map are presumably only relevant when the pipeline loads a model by name — confirm
# whether they have any effect here.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map=DEVICE_MAP,
)

## View LLM GPU Usage
By passing device_map="auto", we tell 🤗 Accelerate to determine automatically where to put each layer of the model depending on the available resources:
- first, we use the maximum space available on the GPU(s)
- if we still need space, we store the remaining weights on the CPU
- if there is not enough RAM, we store the remaining weights on the hard drive as memory-mapped tensors

(Source: [Hugging Face](https://huggingface.co/docs/accelerate/v0.25.0/en/concept_guides/big_model_inference))

However, it seems LoRA fine-tuning using 🤗 PEFT does not support multi-GPU training in this setup, so instead of `device_map="auto"` we load the model onto *one* GPU (`DEVICE_MAP = "cuda:0"`).

In [None]:
!nvidia-smi

In [None]:
# What GPUs are the model loaded onto?
model.hf_device_map

In [None]:
# How many Mb of RAM is the model using?
print(model.get_memory_footprint()/1e6)

# Finetune LLM
We will be using QLoRA to finetune the model.

We will freeze the original model weights and add a small set of trainable low-rank adapter weights which will be trained via backpropagation ([Ref](https://arxiv.org/pdf/2305.14314)).

When saving the finetuned model, only the adapter weights will be saved.

Hugging Face has an implementation of QLoRA in the ``trl`` library.

## Set up LoRA Adapters
Source: [Hugging Face](https://github.com/huggingface/smol-course/blob/main/3_parameter_efficient_finetuning/notebooks/finetune_sft_peft.ipynb), [David Godoy](https://huggingface.co/blog/dvgodoy/fine-tuning-llm-hugging-face)

In [None]:
PEFT_MODEL_NAME = "Qwen2.5-FT-DBPedia"

In [None]:
from peft import LoraConfig

# TODO: Configure LoRA parameters
# r: rank dimension for LoRA update matrices (smaller = more compression)
rank_dimension = 8 # 6
# lora_alpha: scaling factor for LoRA layers (higher = stronger adaptation)
lora_alpha = 16 # 8
# lora_dropout: dropout probability for LoRA layers (helps prevent overfitting)
lora_dropout = 0.05 # 0.05
# max sequence length for model and packing of the dataset
# (not a LoRA setting: it is not passed to LoraConfig below, only defined here for the trainer config)
max_seq_length = 64
#target_modules = 'all-linear' # all-linear TODO: Does changing this improve optimisation?

# Adapter configuration handed to the trainer; target_modules is left at the library default.
peft_parameters = LoraConfig(
    r=rank_dimension,  # Rank dimension - typically between 4-32
    lora_alpha=lora_alpha,  # LoRA scaling factor - typically 2x rank
    lora_dropout=lora_dropout,  # Dropout probability for LoRA layers
    bias="none",  # Bias type for LoRA. "none" means no bias terms are trained (only "all" / "lora_only" update biases).
    #target_modules=target_modules,  # Which modules to apply LoRA to 
    task_type="CAUSAL_LM",  # Task type for model architecture
)

## Apply LoRA Adapters to Model

In [None]:
# from peft import get_peft_model, prepare_model_for_kbit_training
# model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, peft_parameters)

In [None]:
# # Show the number of trainable parameters for the model.

# train_p, tot_p = model.get_nb_trainable_parameters()
# print(f'Trainable parameters:      {train_p/1e6:.2f}M')
# print(f'Total parameters:          {tot_p/1e6:.2f}M')
# print(f'% of trainable parameters: {100*train_p/tot_p:.2f}%')

## Create Training Class
``SFTTrainer`` is the class used for supervised LLM finetuning in ``trl``.

Source: [Hugging Face](https://github.com/huggingface/smol-course/blob/main/3_parameter_efficient_finetuning/notebooks/finetune_sft_peft.ipynb)

In [None]:
from trl import SFTConfig, SFTTrainer

# Training configuration
# Hyperparameters based on QLoRA paper recommendations
peft_hyperparameters = SFTConfig(

    ## GROUP 1: Memory usage
    # These arguments will squeeze the most out of your GPU's RAM
    # Checkpointing
    gradient_checkpointing=True,    # this saves a LOT of memory
    # Set this to avoid exceptions in newer versions of PyTorch
    gradient_checkpointing_kwargs={'use_reentrant': False}, 
    # Gradient Accumulation / Batch size
    # Actual batch (for updating) is same (1x) as micro-batch size
    gradient_accumulation_steps=1,
    # The initial (micro) batch size to start off with
    per_device_train_batch_size=16,
    # If batch size would cause OOM, halves its size until it works
    auto_find_batch_size=True,
    # Precision settings
    # bf16=True,  # Use bfloat16 precision TODO: Is this more optimised?

    ## GROUP 2: Dataset-related
    # Token limit per training sequence (64, as set in the LoRA configuration cell)
    max_seq_length=max_seq_length,
    # Dataset
    # packing a dataset means no padding is needed
    # NOTE(review): packing presumably concatenates examples and cuts them into
    # max_seq_length-token chunks — confirm this suits prompt/completion pairs this short.
    packing=True,

    ## GROUP 3: These are typical training parameters
    num_train_epochs=1,
    learning_rate=3e-4,
    # Optimizer
    optim='adamw_torch_fused', #TODO: Compare with paged_adamw_8bit
    # Learning rate schedule
    # warmup_ratio=0.03,  # Portion of steps for warmup
    # lr_scheduler_type="constant",  # Keep learning rate constant after warmup    
    
    ## GROUP 4: Logging parameters
    logging_steps=10,  # Log metrics every N steps
    logging_dir='./logs',
    output_dir=PEFT_MODEL_NAME,  # Checkpoints are written under this directory
    report_to='none',
    # Save checkpoints on a step interval.
    # NOTE(review): save_steps is not set here, so the interval is the library default
    # rather than logging_steps — confirm that is the intended checkpoint frequency.
    save_strategy="steps",

)

In [None]:
# Create SFTTrainer with LoRA configuration
trainer = SFTTrainer(
    model=model,
    train_dataset=ds["train"],
    peft_config=peft_parameters,  # LoRA configuration
    args=peft_hyperparameters,    # Hyperparameters
    tokenizer=tokenizer
)

In [22]:
!nvidia-smi

Fri Mar 21 13:20:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.77                 Driver Version: 565.77         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     Off |   00000000:19:00.0 Off |                  N/A |
| 27%   28C    P8              2W /  250W |   10098MiB /  11264MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     Off |   00

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Finetune and save when completed
NOTE: We will only save the *adapters* from the finetuned model, not the base model weights, so the checkpoints will be small.

In [23]:
trainer.train()

trainer.save_model(PEFT_MODEL_NAME)

RuntimeError: chunk expects at least a 1-dimensional tensor

# Load Finetuned LLM
Source: [Hugging Face](https://github.com/huggingface/smol-course/blob/main/3_parameter_efficient_finetuning/notebooks/finetune_sft_peft.ipynb)

**REQUIREMENTS**:
- You must already have fine-tuned the LLM (see *Finetune LLM*).
- You must load the base model (see *Load Baseline LLM*).

In [None]:
PEFT_MODEL_NAME = "Qwen2.5-FT-DBPedia"

from peft import LoraConfig, get_peft_model
from peft import PeftModel, PeftConfig

model = PeftModel.from_pretrained(model, PEFT_MODEL_NAME)

# Evaluate LLM

## Text Classification Prompts

In [None]:
# Zero-shot prompt to classify articles and return their category ID.
PROMPT_ZEROSHOT = """You are an expert in classifying articles into categories.
Your task is to read an article, decide which category it belongs into, and then return the number of that category.
There are 14 categories you may choose from, but you can only decide one category.

CATEGORIES:
0. Company
1. Educational Institution
2. Artist
3. Athlete
4. Office Holder
5. Method Of Transportation
6. Building
7. Natural Place
8. Village
9. Animal
10. Plant
11. Album
12. Film
13. Written Work

Read the following article and return the most suitable category as a number ("0"), NOT as text ("Company").
"""

# Zero-shot chain-of-thought prompt to classify articles and return their category name.
PROMPT_COT = """You are an expert at classifying articles into the following categories:

CATEGORIES:
0. Company
1. Educational Institution
2. Artist
3. Athlete
4. Office Holder
5. Method Of Transportation
6. Building
7. Natural Place
8. Village
9. Animal
10. Plant
11. Album
12. Film
13. Written Work

Read the following article and explain which category describes its content best.
End your answer with the category name.
Let's think step by step.
"""

# Meta prompt - A type of zero-shot prompt which prioritises abstract reasoning over concrete examples.
# Advantages: Fewer tokens. Disadvantages:
# https://www.promptingguide.ai/techniques/meta-prompting
PROMPT_META = """Problem: [excerpt from an encyclopedia article]

Solution Structure:
1. Begin the response with "Let's think step by step".
2. Identify the subject of the encyclopedia article with "This encyclopedia article is about [subject]".
3. Define what the subject is. Is it natural or artificial? Is it one or multiple entities? Use "[subject] is a [classification]".
4. Consider the following list of categories:
	- Company
	- Educational Institution
	- Artist
	- Athlete
	- Office Holder
	- Method Of Transportation
	- Building
	- Natural Place
	- Village
	- Animal
	- Plant
	- Album
	- Film
	- Written Work
   Identify all categories in this list whose properties do not match the subject.
5. Identify which category has the most in common with the subject and explain why.
6. Finally, state "Category: [best matching category]."
"""

# Few-shot chain-of-thought prompt to classify articles and return their category name.
PROMPT_COT_4SHOT = """Read an encyclopedia article excerpt and give it one of the following categories:

CATEGORIES:
- Company
- Educational Institution
- Artist
- Athlete
- Office Holder
- Method Of Transportation
- Building
- Natural Place
- Village
- Animal
- Plant
- Album
- Film
- Written Work

Problem:
The Petlyakov VI-100 (Visotnyi Istrebitel – high altitude fighter) was a fighter/dive bomber aircraft designed and built in the USSR from 1938.

Solution:
Let's think step by step. This encyclopedia article is about the Petlyakov VI-100, which is an aircraft. While aircrafts are a man-made structure designed and built for a specific purpose, they are not human habitats with walls and a ceiling, so Building is not the category. Aircrafts are designed to transport people, so Method Of Transportation is the best category. Category: Method Of Transportation.

Problem:
Kruszewo [kruˈʂɛvɔ] is a village in the administrative district of Gmina Żuromin within Żuromin County Masovian Voivodeship in east-central Poland.

Solution:
Let's think step by step. This encyclopedia article is about Kruszewo, which is a village in Poland. While villages do exist within geographical areas, they are man-made, so Natural Place is not the category. While villages do contain buildings, they are not a single building, so Building is not the category. The most matching category is therefore Village. Category: Village.

Problem:
Schismus is a genus of grass in the Poaceae family. They are native to Africa and Asia.

Solution:
Let's think step by step. This encyclopedia article is about Schismus, which is a biological species. The genus of the species is grass. Grass is commonly found in natural places, but Schismus is not a geographical location, so Natural Place is not the category. Grass is a type of plant, so Plant is the most fitting category. Category: Plant.

Problem:
The Southern Oklahoma Cosmic Trigger Contest is a soundtrack by The Flaming Lips to the Bradley Beesley fishing documentary Okie Noodling.

Solution:
Let's think step by step. This encyclopedia article is about The Southern Oklahoma Cosmic Trigger Contest, which is a soundtrack to a fishing documentary. While the article mentions a fishing documentary, it is not the subject, so Film is not the category. While the article mentions the band The Flaming Lips, they are not the subject, so Artist is not the category. The most suitable category is therefore Album. Category: Album.

Problem:
"""

# Few-shot chain-of-thought prompt to classify articles and return their category name.
PROMPT_COT_2SHOT = """Read an encyclopedia article excerpt and give it one of the following categories:

CATEGORIES:
- Company
- Educational Institution
- Artist
- Athlete
- Office Holder
- Method Of Transportation
- Building
- Natural Place
- Village
- Animal
- Plant
- Album
- Film
- Written Work

Problem:
The Petlyakov VI-100 (Visotnyi Istrebitel – high altitude fighter) was a fighter/dive bomber aircraft designed and built in the USSR from 1938.

Solution:
Let's think step by step. This encyclopedia article is about the Petlyakov VI-100, which is an aircraft. While aircrafts are a man-made structure designed and built for a specific purpose, they are not human habitats with walls and a ceiling, so Building is not the category. Aircrafts are designed to transport people, so Method Of Transportation is the best category. Category: Method Of Transportation.

Problem:
Schismus is a genus of grass in the Poaceae family. They are native to Africa and Asia.

Solution:
Let's think step by step. This encyclopedia article is about Schismus, which is a biological species. The genus of the species is grass. Grass is commonly found in natural places, but Schismus is not a geographical location, so Natural Place is not the category. Grass is a type of plant, so Plant is the most fitting category. Category: Plant.

Problem:
"""

In [None]:
def get_classification_prompt(article, prompt):
    """
      Build the chat messages that ask the LLM to categorise one article.

      The classification instructions become the system message and the
      article text (whitespace-trimmed) becomes the user message.

      Args:
          article (Dictionary): Any item in the dataset; its "content" field is used.
          prompt (str): A model prompt with article classification instructions.

      Returns:
          messages (list<Dictionary>): Two messages (system, then user) in
              [Chat Template](https://huggingface.co/docs/transformers/main/en/chat_templating) form.
    """
    system_message = {"role": "system", "content": prompt}
    user_message = {"role": "user", "content": article["content"].strip()}
    return [system_message, user_message]

## Evaluation Method

In [None]:
LABEL_NAMES=[
    "Company",
    "Educational Institution",
    "Artist",
    "Athlete",
    "Office Holder",
    "Method Of Transportation",
    "Building",
    "Natural Place",
    "Village",
    "Animal",
    "Plant",
    "Album",
    "Film",
    "Written Work"
]

In [None]:
import random
import re

LABEL_NAMES=[
    "Company",
    "Educational Institution",
    "Artist",
    "Athlete",
    "Office Holder",
    "Method Of Transportation",
    "Building",
    "Natural Place",
    "Village",
    "Animal",
    "Plant",
    "Album",
    "Film",
    "Written Work"
]

def get_class_label_from_string(string, class_labels = LABEL_NAMES):
    """
    Extract a class label by name from a string and return its ID.
    If no match is found, choose a random label ID.

    Matching is case-insensitive and tolerant of missing or extra whitespace
    between the words of a label ("Written Work" also matches "writtenwork").
    When several labels occur, the LAST occurrence in the string wins.

    NOTE(review): the default labels use "Method Of Transportation", whereas the
    dataset's own class name is "MeanOfTransportation"; a response using the
    dataset spelling will not match and falls through to the random choice.

    Args:
    string (str): A string containing the name of one class label.
    class_labels (list<str>): Label names to search for; a label's position
        in the list is the ID that is returned. Defaults to LABEL_NAMES.

    Returns:
    class_id (int): The ID of the matching class label.
    """
    text = string.lower().strip()

    def _squash(value):
        # Remove all capitalisation, non-alphabetic characters, and whitespace
        return re.sub("[^a-z]", "", value.lower())

    # One alternative per label: words are escaped so they are matched literally,
    # then joined with optional whitespace.
    alternatives = [
        r"\s*".join(re.escape(word) for word in label.lower().split())
        for label in class_labels
    ]

    # Find all instances of label name strings within the base string.
    matches = re.findall("|".join(alternatives), text)

    # If no class label is found in the LLM text, pick a random label.
    if not matches:
        print(f"No class label found in string: {text}")
        return random.randint(0, len(class_labels) - 1)

    # Get the last matching label from the string and map it back to its position.
    labels_sanitised = [_squash(label) for label in class_labels]
    class_id = labels_sanitised.index(_squash(matches[-1]))
    return class_id

In [None]:
import re

def get_first_number_from_string(string):
  """
  Returns the first whole number from a string as an integer.

  Args:
    string (str): Text to search, e.g. a raw LLM response.

  Returns:
    first_number (int): The first run of decimal digits found in the string.

  Raises:
    ValueError: If the string contains no digits.
  """
  # re.search yields None when nothing matches, so the "not found" case can be
  # detected (re.findall returns an empty list, never None).
  match = re.search(r"\d+", string)
  if match is None:
    raise ValueError(f"No number found in string: {string}")
  return int(match.group())

def get_category_label(article, classification_prompt, extractor_func, max_tokens = 10):
  """
  For a given article in the DBPedia dataset, predict its category label.

  Args:
    article (Dictionary): A dataset item; it is passed to get_classification_prompt,
      which reads its "content" field.
    classification_prompt (str): Model instructions on how to classify articles.
    extractor_func (func): A method which takes the model's response and returns a classification label as an integer.
    max_tokens (int): Maximum number of new tokens the model may generate.

  Returns:
    output (tuple<int, str>): The category of the article and the raw LLM output.
  """
  messages = get_classification_prompt(article, classification_prompt)

  # Sampling is enabled, but with a very low temperature.
  generation_kwargs = dict(
      do_sample=True,
      eos_token_id=tokenizer.eos_token_id,
      max_new_tokens=max_tokens,
      temperature=0.001,
  )
  outputs = pipeline(messages, **generation_kwargs)

  # The reply is read from the last message of the first returned conversation.
  reply = outputs[0]["generated_text"][-1]['content']

  return (extractor_func(reply), reply)

In [None]:
from tqdm import tqdm

def predict_classes(dataset, classification_prompt, extractor_func, max_tokens):
  """
    For a given DBPedia dataset, use the contents of each article to predict its label.

    Articles are classified one at a time, with a progress bar.

    Args:
      dataset (Dataset): The dataset to classify; each item needs a 'label' field.
      classification_prompt (str): Model instructions on how to classify articles.
      extractor_func (func): A method which takes the model's response and returns a classification label as an integer.
      max_tokens (int): Limit passed through to get_category_label for the model response.

    Returns:
      results (tuple<list, list, list>):
        y_pred (list<int>): Predicted labels
        y_true (list<int>): Actual labels (groundtruth)
        responses (list<str>): Raw LLM response for each test sample.
  """
  predicted, actual, raw_outputs = [], [], []

  # TODO: This is vastly unoptimized, use a dataset for this.
  for article in tqdm(dataset, desc="Classifying articles"):
    label_id, llm_text = get_category_label(article, classification_prompt, extractor_func, max_tokens)

    predicted.append(label_id)
    actual.append(article['label'])
    raw_outputs.append(llm_text)

  return predicted, actual, raw_outputs

## Run Evaluation

In [None]:
configurations = [
    # {"name" : "Zero-shot",
    #  "prompt" : PROMPT_ZEROSHOT,
    #  "max_tokens" : 10,
    #  "extractor_func" : get_first_number_from_string},

    # {"name" : "Chain-of-Thought",
    #  "prompt" : PROMPT_COT,
    #  "max_tokens" : 100,
    #  "extractor_func" : get_class_label_from_string},

    {"name" : "Meta Prompt",
     "prompt" : PROMPT_META,
     "max_tokens" : 100,
     "extractor_func" : get_class_label_from_string},

    {"name" : "2-Shot CoT",
     "prompt" : PROMPT_COT_2SHOT,
     "max_tokens" : 100,
     "extractor_func" : get_class_label_from_string},

    {"name" : "4-Shot CoT",
     "prompt" : PROMPT_COT_4SHOT,
     "max_tokens" : 100,
     "extractor_func" : get_class_label_from_string}
]

In [None]:
# Predict all article categories in the dataset

# NOTE(review): this module-level y_true is not read anywhere in the visible notebook;
# the ground truth actually used is stored per configuration below.
y_true = None

for config in tqdm(configurations, "Testing LLM configurations"):

  # Positional arguments for predict_classes, in its parameter order after the dataset.
  args = config['prompt'], config['extractor_func'], config['max_tokens']

  # Results are stored back on the configuration dict so the evaluation cells can read them.
  config['y_pred'], config['y_true'], config['responses'] = predict_classes(ds['test'], *args)

## Return Evaluation Results

### Accuracy report + confusion matrix

In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from matplotlib import pyplot as plt
import pandas as pd
import os

def evaluate_model(config):
  """
  Evaluate a model configuration and save the results to output_dir.

  Prints the per-class precision / recall / F1 report, writes it to
  output/<configuration name>/evaluation.csv and draws a confusion matrix
  titled with the configuration name.

  The confusion matrix always has one row/column per entry of CLASS_LABELS,
  even when some classes are missing from the predictions, so tick names
  cannot be shifted onto the wrong class. Any label ID outside CLASS_LABELS
  gets an extra row/column named after the raw ID.

  Args:
    config (Dictionary): A configuration with 'name', 'y_true' and 'y_pred' filled in.
  """
  y_true, y_pred, config_name = config['y_true'], config['y_pred'], config['name']

  output_dir = os.path.join("output", config_name.replace(' ', '_').lower())
  os.makedirs(output_dir, exist_ok=True)

  print(f"\n{config_name} evaluation results\n")

  # Get precision, recall, and F1 score
  classif_report = classification_report(y_true, y_pred, zero_division=0.0, output_dict=True)
  classif_report = pd.DataFrame(classif_report).transpose()

  print(classif_report)

  # Save classification report to output dir
  classif_report.to_csv( os.path.join(output_dir, "evaluation.csv"), escapechar='\\' )

  # Pin the matrix axes explicitly: every known class, plus any unexpected ID that was
  # observed, so that display names and matrix rows are guaranteed to line up.
  n_classes = len(CLASS_LABELS)
  label_ids = sorted(set(range(n_classes)) | set(y_true) | set(y_pred))
  tick_names = [
      CLASS_LABELS[label_id] if 0 <= label_id < n_classes else str(label_id)
      for label_id in label_ids
  ]

  # Display confusion matrix
  disp = ConfusionMatrixDisplay.from_predictions(
      y_true, y_pred,
      labels = label_ids,
      display_labels = tick_names,
      cmap = plt.cm.Blues,
      xticks_rotation='vertical')

  disp.ax_.set_title(config_name)

In [None]:
for config in configurations:
  evaluate_model(config)

### Save all incorrect LLM responses

In [None]:
def get_incorrect_answers(config):
  """
  Collect every misclassified test article for a configuration and save them as CSV.

  The table is written to output/<configuration name>/incorrect_answers.csv;
  the directory is created if it does not exist yet.

  Args:
    config (Dictionary): A configuration with 'name', 'y_pred', 'y_true' and
      'responses' filled in, aligned row-for-row with ds['test'].

  Returns:
    incorrect_answers (DataFrame): One row per wrong prediction, indexed by the
      article's position in ds['test'], with columns "Title", "Content",
      "Predicted Category", "Actual Category" and "LLM Output".
  """
  output_dir = os.path.join("output", config['name'].replace(' ', '_').lower())
  # Do not rely on evaluate_model having been run first to create the directory.
  os.makedirs(output_dir, exist_ok=True)

  y_pred = np.array(config['y_pred'])
  y_true = np.array(config['y_true'])

  # Get a boolean mask for every incorrect prediction
  incorrect = y_pred != y_true

  test_split = ds['test']

  # Get the index of every incorrect prediction
  index = np.arange(test_split.num_rows)[incorrect]

  def _class_name(class_id):
    # Fall back to the raw ID when an extractor returned a number outside the known classes.
    class_id = int(class_id)
    if 0 <= class_id < len(CLASS_LABELS):
      return CLASS_LABELS[class_id]
    return str(class_id)

  incorrect_answers = {
      # Obtain title and content of article (read each column once instead of iterating rows)
      "Title" : np.array(list(test_split['title']))[incorrect],
      "Content" : np.array(list(test_split['content']))[incorrect],

      # Get the class names for y_pred and y_true
      "Predicted Category" : [_class_name(class_id) for class_id in y_pred[incorrect]],
      "Actual Category" : [_class_name(class_id) for class_id in y_true[incorrect]],

      # Get LLM raw text output
      "LLM Output" : np.array(config['responses'])[incorrect]
  }

  incorrect_answers = pd.DataFrame(incorrect_answers, index=index)

  incorrect_answers.to_csv( os.path.join(output_dir, "incorrect_answers.csv"), escapechar='\\' )

  return incorrect_answers

In [None]:
for config in configurations:
  get_incorrect_answers(config)