This notebook computes the perplexity of each fine-tuned model. <br>
At the end of the notebook there is a custom section for the baseline model. <br>
This code is intended to be run on Google Colab with Google Drive mounted. <br>
The models are stored on Google Drive, in a shared folder. <br>
Modify the following variables if needed.

In [None]:
# User-editable locations on the mounted Google Drive.
# Every other path in the notebook is derived from my_base_path, so it must
# end with a trailing slash.
my_base_path = "/content/drive/My Drive/Deep Learning project/" # where you keep datasets CSV
models_dir = my_base_path + "Saved models" # folder where you keep saved models
results_file = my_base_path + "models_perplexity.csv" # produced log file with perplexity


In [None]:
%%capture
# Environment setup for the notebook; %%capture hides the (long) pip output.
# Install the released unsloth package together with its dependencies.
!pip install unsloth
# Then upgrade unsloth itself to the GitHub version, leaving dependencies untouched (--no-deps).
!pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Upgrade bitsandbytes (the models below are loaded with load_in_4bit=True).
!pip install -U bitsandbytes

In [None]:
#imports

import gc
import os, csv

# FastLanguageModel is used by the evaluation cells below.
# Unsloth recommends importing it before transformers so its patches are applied.
from unsloth import FastLanguageModel

import torch
from transformers import AutoModel, AutoTokenizer
from transformers import TextStreamer
from google.colab import drive


from datasets import load_dataset
from tqdm import tqdm

In [None]:
## functions

def remove_none(example):
    """Filter predicate: keep a row only if none of its fields is None.

    Args:
        example: mapping of column name to value for one dataset row.

    Returns:
        True when every value is non-None (also for an empty mapping),
        False as soon as one None value is found.
    """
    for field_value in example.values():
        if field_value is None:
            return False
    return True

def compute_perplexity(seq_len, max_length=512, stride=512, *,
                       encodings=None, model=None, device=None,
                       show_progress=True):
    """Compute sliding-window perplexity of a causal LM over a tokenized text.

    The token sequence is cut into windows of at most ``max_length`` tokens
    that start every ``stride`` tokens. For each window only the tokens not
    already scored by the previous window contribute to the loss (the rest
    are masked with -100).

    Args:
        seq_len: number of tokens of ``encodings.input_ids`` to evaluate.
        max_length: maximum window length fed to the model.
        stride: distance between the starts of consecutive windows.
        encodings: object exposing ``input_ids`` (indexed as ``[:, a:b]``).
            Defaults to the notebook-level global ``encodings``.
        model: callable ``model(input_ids, labels=...)`` whose result has a
            ``.loss`` attribute. Defaults to the global ``model``.
        device: device the input windows are moved to. Defaults to the
            global ``device``.
        show_progress: wrap the window loop in ``tqdm`` when True.

    Returns:
        Tuple ``(ppl, avg_nll, nll_sum)`` of tensors: the perplexity, the mean
        negative log-likelihood per scored token and the summed negative
        log-likelihood.

    Raises:
        ValueError: if no token could be scored (e.g. ``seq_len`` < 2).
    """
    # Fall back to the notebook globals so existing calls keep working.
    if encodings is None:
        encodings = globals()["encodings"]
    if model is None:
        model = globals()["model"]
    if device is None:
        device = globals()["device"]

    window_starts = range(0, seq_len, stride)
    if show_progress:
        window_starts = tqdm(window_starts)

    nll_sum = 0.0
    n_tokens = 0
    prev_end_loc = 0

    for begin_loc in window_starts:
        end_loc = min(begin_loc + max_length, seq_len)
        # Number of tokens in this window not covered by the previous one.
        trg_len = end_loc - prev_end_loc
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Mask the overlapping context so it is not scored twice.
        target_ids[:, :-trg_len] = -100

        # The model shifts labels by one position internally, so the tokens
        # that actually enter the averaged loss are the non-masked labels
        # from position 1 onwards.
        num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

        # A window without any predictable token (e.g. a trailing single
        # token) yields a NaN loss; skipping it keeps NaN * 0 from poisoning
        # the running sum.
        if num_loss_tokens > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
                neg_log_likelihood = outputs.loss

            # outputs.loss is a mean, so weight it back to a sum.
            nll_sum += neg_log_likelihood * num_loss_tokens
            n_tokens += num_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_tokens == 0:
        raise ValueError(
            "compute_perplexity: no tokens to score "
            f"(seq_len={seq_len}); at least 2 tokens are required"
        )

    avg_nll = nll_sum / n_tokens
    ppl = torch.exp(avg_nll)

    return ppl, avg_nll, nll_sum


def write_results(results_file, header, file_exists, results_list):
    """Append one result row to the CSV log, writing the header when needed.

    Args:
        results_file: path of the CSV log file.
        header: list of column names, written as the first row of a new file.
        file_exists: whether the caller found the log file on disk. When
            False the file is (re)created from scratch.
        results_list: values of the row to append.
    """
    # A missing or zero-byte file has no header yet, whatever the caller
    # believed when it computed file_exists.
    needs_header = (
        not file_exists
        or not os.path.isfile(results_file)
        or os.path.getsize(results_file) == 0
    )
    mode = "a" if file_exists else "w"

    # utf-8 matches the encoding used when the log is read back for resuming.
    with open(results_file, mode=mode, newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(header)
        writer.writerow(results_list)



In [None]:
#evaluate each model and write a line with results in a log file

drive.mount('/content/drive')
header = ["model name", "ppl", "nll_sum","avg_nll","test set name"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# every sub-folder of models_dir is treated as one saved model
models_list = [f for f in os.listdir(models_dir) if os.path.isdir(os.path.join(models_dir, f))]

# Due to Colab interruptions: models already present in the log are skipped,
# so a restarted session resumes where the previous one stopped.
already_worked_list = []

# On the very first run the log does not exist yet: nothing to resume from.
if os.path.isfile(results_file):
    with open(results_file, mode="r", newline="", encoding="utf-8") as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            if row:
                already_worked_list.append(row[0])


models_list = [model for model in models_list if model not in already_worked_list]


for model_name in models_list:

    file_exists = os.path.isfile(results_file)
    results_list = []
    results_list.append(model_name)

    model_dir = os.path.join(models_dir, model_name)

    # the test set is chosen from a keyword in the model folder name
    if "reduced" in model_name:
        test_set_path = my_base_path + "sicilian_dataset_reduced_test.csv"
        test_set_name = "sicilian_dataset_reduced_test"
    elif "partial" in model_name:
        test_set_path = my_base_path + "sicilian_dataset_partial_test.csv"
        test_set_name = "sicilian_dataset_partial_test"
    else:
        test_set_path = my_base_path + "sicilian_dataset_test.csv"
        test_set_name = "sicilian_dataset_test"

    dataset = load_dataset("csv", data_files=test_set_path)

    # drop rows that contain a None field
    cleaned_dataset = dataset.filter(remove_none)

    del dataset

    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    model = FastLanguageModel.from_pretrained(
        model_dir,
        load_in_4bit=True,  # Enable 4-bit quantization
        device_map="auto",  # Automatically map the model to available devices
    )

    # from_pretrained may hand back a tuple; only its first element (the model) is kept
    if isinstance(model, tuple):
        model = model[0]

    # the whole test set is tokenized as one long text
    encodings = tokenizer("\n\n".join(cleaned_dataset["train"]["text"]), return_tensors="pt")
    model.eval()

    seq_len = encodings.input_ids.size(1)

    #computation of model perplexity
    # (compute_perplexity reads the globals encodings, model and device set above)
    ppl, avg_nll, nll_sum = compute_perplexity(seq_len=seq_len,max_length=512,stride=512)

    results_list.append(str(ppl.item()))
    results_list.append(str(nll_sum.item()))
    results_list.append(str(avg_nll.item()))
    results_list.append(test_set_name)

    #save results
    write_results(results_file=results_file, header=header, file_exists=file_exists,results_list=results_list)

    del model, tokenizer, cleaned_dataset, encodings, results_list

    # collect garbage first so the cache can actually release the freed blocks
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

Mounted at /content/drive
lora_model_r128_noShortgrams_noEmbed
sicilian_dataset_no_shortgrams_test


Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/13834 [00:00<?, ? examples/s]

==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

Unsloth 2025.1.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
100%|█████████▉| 1455/1456 [15:58<00:00,  1.52it/s]


lora_model_r64_noShortgrams_noEmbed
sicilian_dataset_no_shortgrams_test
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


100%|█████████▉| 1455/1456 [16:09<00:00,  1.50it/s]


lora_model_r32_noShortgrams_noEmbed
sicilian_dataset_no_shortgrams_test
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


100%|█████████▉| 1455/1456 [16:09<00:00,  1.50it/s]


Custom code for the baseline model is below. <br>
Modify the test set if needed.

In [None]:
# Test set used for the baseline model: CSV file name, resolved against my_base_path.
test_set_name = "sicilian_dataset_test.csv"  # your test set
# NOTE: despite the "_list" suffix this holds a single path string.
test_set_path_list = f"{my_base_path}{test_set_name}"

In [None]:
### custom code for evaluating the baseline model

# collect garbage first so the cache can actually release the freed blocks
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

max_seq_length = 512
dtype = None
load_in_4bit = True
model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

model.eval()

file_exists = os.path.isfile(results_file)
results_list = []
results_list.append(model_name)

# test_set_path_list and test_set_name are plain strings (one test set),
# so they are used directly rather than indexed.
dataset = load_dataset("csv", data_files=test_set_path_list)

# drop rows that contain a None field
cleaned_dataset = dataset.filter(remove_none)

# the whole test set is tokenized as one long text
encodings = tokenizer("\n\n".join(cleaned_dataset["train"]["text"]), return_tensors="pt")

seq_len = encodings.input_ids.size(1)

del dataset

# compute_perplexity reads the globals encodings, model and device set above
ppl, avg_nll, nll_sum = compute_perplexity(seq_len=seq_len,max_length=max_seq_length,stride=max_seq_length)

results_list.append(str(ppl.item()))
results_list.append(str(nll_sum.item()))
results_list.append(str(avg_nll.item()))
# NOTE(review): this logs the file name including ".csv", while the
# fine-tuned models log the name without extension - confirm which is wanted.
results_list.append(test_set_name)


# header is the column list defined in the fine-tuned models evaluation cell
write_results(results_file=results_file, header=header, file_exists=file_exists,results_list=results_list)

del cleaned_dataset, results_list

gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/14963 [00:00<?, ? examples/s]

100%|█████████▉| 1445/1446 [1:17:28<00:03,  3.22s/it]


Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/20654 [00:00<?, ? examples/s]

 84%|████████▍ | 1267/1509 [1:08:04<13:02,  3.23s/it]