#Final Evaluation

# Importing Dependencies


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [None]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch==2.7.0+cu126 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Found existing installation: torch 2.7.0+cu126
Uninstalling torch-2.7.0+cu126:
  Successfully uninstalled torch-2.7.0+cu126
Found existing installation: torchvision 0.22.0+cu126
Uninstalling torchvision-0.22.0+cu126:
  Successfully uninstalled torchvision-0.22.0+cu126
Found existing installation: torchaudio 2.7.0+cu126
Uninstalling torchaudio-2.7.0+cu126:
  Successfully uninstalled torchaudio-2.7.0+cu126
Looking in indexes: https://download.pytorch.org/whl/cu126
Collecting torch==2.7.0+cu126
  Using cached https://download.pytorch.org/whl/cu126/torch-2.7.0%2Bcu126-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu126/torchvision-0.22.0%2Bcu126-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu126/torchaudio-2.7.0%2Bcu126-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Using cached https://download.pytorch.org/whl/cu126/torch

In [None]:
import torch
import numpy as np
import transformers
import bitsandbytes as bnb
import xformers
import accelerate
import peft
import datasets
import trl
import unsloth

print("Torch version:", torch.__version__)
print("NumPy version:", np.__version__)
print("Transformers version:", transformers.__version__)
print("BitsAndBytes version:", bnb.__version__ if hasattr(bnb, "__version__") else "Version not found")
print("Accelerate version:", accelerate.__version__)
print("Peft version:", peft.__version__)
print("Datasets version:", datasets.__version__)
print("TRL version:", trl.__version__)
print("xformers version:", xformers.__version__)


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Torch version: 2.7.0+cu126
NumPy version: 2.0.2
Transformers version: 4.51.3
BitsAndBytes version: 0.45.5
Accelerate version: 1.6.0
Peft version: 0.15.2
Datasets version: 3.6.0
TRL version: 0.15.2
xformers version: 0.0.30


In [None]:
!ls /content/drive/MyDrive/NLP-GIT/lora_model/

adapter_config.json	   special_tokens_map.json  tokenizer.model
adapter_model.safetensors  tokenizer_config.json
README.md		   tokenizer.json


#  Loading Fine-tuned Model

In [None]:
from unsloth import FastLanguageModel
base_model_path = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = base_model_path,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)


==((====))==  Unsloth 2025.4.8: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
from peft import PeftModel
adapter_path = "/content/drive/MyDrive/NLP-GIT/lora_model"
model = PeftModel.from_pretrained(model, adapter_path)


# Sample Response Generation

In [None]:
prompt = """
### Input:
I'm going to vacation.

### Response:
"""

inputs = tokenizer(prompt.strip(), return_tensors="pt").to("cuda")

with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=100)

print(tokenizer.decode(output[0], skip_special_tokens=True))

### Input:
I'm going to vacation.

### Response:
That's great to hear! Where are you planning to go on vacation? I'd be happy to help you plan or make recommendations if you need any assistance. Let me know if you have any specific preferences or requirements, such as budget, duration, or type of vacation (beach, adventure, cultural, etc.). If you don't have any particular destination in mind, I can suggest some popular vacation spots or destinations based on current trends or seasonal recommendations. Let me


#Loading Test data

In [None]:
from datasets import load_dataset, DatasetDict
dataset = load_dataset("talalmuzaffar/empathetic_dataset_with_correct_context")
split_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
val_test_split = split_dataset['test'].train_test_split(test_size=0.5, seed=42)
test_dataset = val_test_split['test']


README.md:   0%|          | 0.00/392 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/962k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [None]:
!pip install tqdm



#  Generating Predictions

In [None]:
from tqdm import tqdm
results = []
for idx, example in enumerate(tqdm(test_dataset, desc="Processing test samples")):
    input_text = example["Input"]
    reference_output = example["Response"]

    prompt = f"""
    ### Input:
    {input_text}

    ### Response:
    """

    inputs = tokenizer(prompt.strip(), return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=100)
    predicted_text = tokenizer.decode(output[0], skip_special_tokens=True).strip()

    results.append({
        "input": input_text,
        "reference": reference_output,
        "predicted": predicted_text
    })

    # Optionally print every 10 samples (if you want log lines)
    if idx % 10 == 0:
        print(f"Processed {idx + 1}/{len(test_dataset)}")


Processing test samples:   0%|          | 1/800 [00:09<2:08:16,  9.63s/it]

Processed 1/800


Processing test samples:   1%|▏         | 11/800 [01:13<1:22:55,  6.31s/it]

Processed 11/800


Processing test samples:   3%|▎         | 21/800 [02:19<1:19:42,  6.14s/it]

Processed 21/800


Processing test samples:   4%|▍         | 31/800 [03:26<1:24:31,  6.60s/it]

Processed 31/800


Processing test samples:   5%|▌         | 41/800 [04:29<1:17:09,  6.10s/it]

Processed 41/800


Processing test samples:   6%|▋         | 51/800 [05:25<1:03:22,  5.08s/it]

Processed 51/800


Processing test samples:   8%|▊         | 61/800 [06:25<1:12:06,  5.85s/it]

Processed 61/800


Processing test samples:   9%|▉         | 71/800 [07:29<1:21:17,  6.69s/it]

Processed 71/800


Processing test samples:  10%|█         | 81/800 [08:24<1:07:01,  5.59s/it]

Processed 81/800


Processing test samples:  11%|█▏        | 91/800 [09:32<1:21:40,  6.91s/it]

Processed 91/800


Processing test samples:  13%|█▎        | 101/800 [10:30<1:15:23,  6.47s/it]

Processed 101/800


Processing test samples:  14%|█▍        | 111/800 [11:29<1:14:01,  6.45s/it]

Processed 111/800


Processing test samples:  15%|█▌        | 121/800 [12:21<52:48,  4.67s/it]

Processed 121/800


Processing test samples:  16%|█▋        | 131/800 [13:21<1:08:09,  6.11s/it]

Processed 131/800


Processing test samples:  18%|█▊        | 141/800 [14:26<1:12:25,  6.59s/it]

Processed 141/800


Processing test samples:  19%|█▉        | 151/800 [15:19<56:59,  5.27s/it]

Processed 151/800


Processing test samples:  20%|██        | 161/800 [16:18<1:04:58,  6.10s/it]

Processed 161/800


Processing test samples:  21%|██▏       | 171/800 [17:04<41:59,  4.00s/it]

Processed 171/800


Processing test samples:  23%|██▎       | 181/800 [18:09<1:07:59,  6.59s/it]

Processed 181/800


Processing test samples:  24%|██▍       | 191/800 [19:10<57:17,  5.64s/it]

Processed 191/800


Processing test samples:  25%|██▌       | 201/800 [20:17<1:07:35,  6.77s/it]

Processed 201/800


Processing test samples:  26%|██▋       | 211/800 [21:22<1:03:46,  6.50s/it]

Processed 211/800


Processing test samples:  28%|██▊       | 221/800 [22:21<55:27,  5.75s/it]

Processed 221/800


Processing test samples:  29%|██▉       | 231/800 [23:14<43:49,  4.62s/it]

Processed 231/800


Processing test samples:  30%|███       | 241/800 [24:10<55:10,  5.92s/it]

Processed 241/800


Processing test samples:  31%|███▏      | 251/800 [25:08<57:09,  6.25s/it]

Processed 251/800


Processing test samples:  33%|███▎      | 261/800 [26:02<54:15,  6.04s/it]

Processed 261/800


Processing test samples:  34%|███▍      | 271/800 [26:52<40:26,  4.59s/it]

Processed 271/800


Processing test samples:  35%|███▌      | 281/800 [27:55<55:47,  6.45s/it]

Processed 281/800


Processing test samples:  36%|███▋      | 291/800 [28:58<46:46,  5.51s/it]

Processed 291/800


Processing test samples:  38%|███▊      | 301/800 [29:56<53:10,  6.39s/it]

Processed 301/800


Processing test samples:  39%|███▉      | 311/800 [30:49<45:28,  5.58s/it]

Processed 311/800


Processing test samples:  40%|████      | 321/800 [31:52<49:25,  6.19s/it]

Processed 321/800


Processing test samples:  41%|████▏     | 331/800 [32:53<50:34,  6.47s/it]

Processed 331/800


Processing test samples:  43%|████▎     | 341/800 [33:55<48:42,  6.37s/it]

Processed 341/800


Processing test samples:  44%|████▍     | 351/800 [34:54<48:23,  6.47s/it]

Processed 351/800


Processing test samples:  45%|████▌     | 361/800 [35:55<46:50,  6.40s/it]

Processed 361/800


Processing test samples:  46%|████▋     | 371/800 [36:47<40:55,  5.72s/it]

Processed 371/800


Processing test samples:  48%|████▊     | 381/800 [37:42<41:34,  5.95s/it]

Processed 381/800


Processing test samples:  49%|████▉     | 391/800 [38:40<40:31,  5.95s/it]

Processed 391/800


Processing test samples:  50%|█████     | 401/800 [39:37<36:55,  5.55s/it]

Processed 401/800


Processing test samples:  51%|█████▏    | 411/800 [40:40<41:32,  6.41s/it]

Processed 411/800


Processing test samples:  53%|█████▎    | 421/800 [41:35<38:56,  6.16s/it]

Processed 421/800


Processing test samples:  54%|█████▍    | 431/800 [42:32<36:41,  5.97s/it]

Processed 431/800


Processing test samples:  55%|█████▌    | 441/800 [43:29<35:23,  5.91s/it]

Processed 441/800


Processing test samples:  56%|█████▋    | 451/800 [44:36<38:44,  6.66s/it]

Processed 451/800


Processing test samples:  58%|█████▊    | 461/800 [45:23<26:35,  4.71s/it]

Processed 461/800


Processing test samples:  59%|█████▉    | 471/800 [46:20<31:14,  5.70s/it]

Processed 471/800


Processing test samples:  60%|██████    | 481/800 [47:16<30:17,  5.70s/it]

Processed 481/800


Processing test samples:  61%|██████▏   | 491/800 [48:08<29:55,  5.81s/it]

Processed 491/800


Processing test samples:  63%|██████▎   | 501/800 [49:06<27:47,  5.58s/it]

Processed 501/800


Processing test samples:  64%|██████▍   | 511/800 [50:06<29:54,  6.21s/it]

Processed 511/800


Processing test samples:  65%|██████▌   | 521/800 [51:13<31:09,  6.70s/it]

Processed 521/800


Processing test samples:  66%|██████▋   | 531/800 [52:07<24:30,  5.47s/it]

Processed 531/800


Processing test samples:  68%|██████▊   | 541/800 [53:11<28:09,  6.52s/it]

Processed 541/800


Processing test samples:  69%|██████▉   | 551/800 [54:08<24:13,  5.84s/it]

Processed 551/800


Processing test samples:  70%|███████   | 561/800 [55:11<24:40,  6.19s/it]

Processed 561/800


Processing test samples:  71%|███████▏  | 571/800 [56:17<24:50,  6.51s/it]

Processed 571/800


Processing test samples:  73%|███████▎  | 581/800 [57:17<22:32,  6.18s/it]

Processed 581/800


Processing test samples:  74%|███████▍  | 591/800 [58:21<22:44,  6.53s/it]

Processed 591/800


Processing test samples:  75%|███████▌  | 601/800 [59:17<18:37,  5.61s/it]

Processed 601/800


Processing test samples:  76%|███████▋  | 611/800 [1:00:14<14:44,  4.68s/it]

Processed 611/800


Processing test samples:  78%|███████▊  | 621/800 [1:01:08<15:35,  5.23s/it]

Processed 621/800


Processing test samples:  79%|███████▉  | 631/800 [1:02:01<16:18,  5.79s/it]

Processed 631/800


Processing test samples:  80%|████████  | 641/800 [1:03:07<17:33,  6.63s/it]

Processed 641/800


Processing test samples:  81%|████████▏ | 651/800 [1:03:54<10:06,  4.07s/it]

Processed 651/800


Processing test samples:  83%|████████▎ | 661/800 [1:04:55<13:39,  5.90s/it]

Processed 661/800


Processing test samples:  84%|████████▍ | 671/800 [1:05:49<11:20,  5.28s/it]

Processed 671/800


Processing test samples:  85%|████████▌ | 681/800 [1:06:51<12:45,  6.43s/it]

Processed 681/800


Processing test samples:  86%|████████▋ | 691/800 [1:07:48<10:24,  5.73s/it]

Processed 691/800


Processing test samples:  88%|████████▊ | 701/800 [1:08:36<06:40,  4.05s/it]

Processed 701/800


Processing test samples:  89%|████████▉ | 711/800 [1:09:33<07:53,  5.32s/it]

Processed 711/800


Processing test samples:  90%|█████████ | 721/800 [1:10:26<05:52,  4.46s/it]

Processed 721/800


Processing test samples:  91%|█████████▏| 731/800 [1:11:26<06:35,  5.73s/it]

Processed 731/800


Processing test samples:  93%|█████████▎| 741/800 [1:12:28<06:13,  6.34s/it]

Processed 741/800


Processing test samples:  94%|█████████▍| 751/800 [1:13:27<04:16,  5.24s/it]

Processed 751/800


Processing test samples:  95%|█████████▌| 761/800 [1:14:27<03:53,  5.99s/it]

Processed 761/800


Processing test samples:  96%|█████████▋| 771/800 [1:15:33<03:34,  7.40s/it]

Processed 771/800


Processing test samples:  98%|█████████▊| 781/800 [1:16:33<01:50,  5.83s/it]

Processed 781/800


Processing test samples:  99%|█████████▉| 791/800 [1:17:30<00:55,  6.20s/it]

Processed 791/800


Processing test samples: 100%|██████████| 800/800 [1:18:09<00:00,  5.86s/it]


In [None]:
import pandas as pd


In [None]:
df_results = pd.DataFrame(results)


In [None]:
df_results.head(2)

Unnamed: 0,input,reference,predicted
0,I was once in a grocery store and the fire ala...,I think once was 4 and the other was 2. I help...,It sounds like you had a moment of connection ...
1,I found my old N64 and hooked the system up. I...,Yeah it does! I am so happy it does but one ga...,I'm glad to hear that you've rediscovered your...


# Evaluating

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=b74f4d5cfdf223df68fa5a2884405f65f1a22c0992cb9a526555de28cb0c31fb
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # This is often needed too


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import evaluate

# Init rouge
rouge = evaluate.load("rouge")

# Initialize metric lists
bleu_1 = []
bleu_2 = []
bleu_3 = []
bleu_4 = []
meteor_scores = []
rouge1 = []
rouge2 = []
rougeL = []

# Loop over each row
for i, row in df_results.iterrows():
    reference = row["reference"]
    predicted = row["predicted"]

    # BLEU
    smoothie = SmoothingFunction().method4
    bleu_1.append(
        sentence_bleu([reference.strip().split()], predicted.strip().split(), weights=(1, 0, 0, 0), smoothing_function=smoothie)
    )
    bleu_2.append(
        sentence_bleu([reference.strip().split()], predicted.strip().split(), weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
    )
    bleu_3.append(
        sentence_bleu([reference.strip().split()], predicted.strip().split(), weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie)
    )
    bleu_4.append(
        sentence_bleu([reference.strip().split()], predicted.strip().split(), weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
    )

    # METEOR (safe)
    try:
        m_score = meteor_score([reference.strip().split()], predicted.strip().split())
        meteor_scores.append(m_score)
    except Exception as e:
        print(f"METEOR error at row {i}: {e}")
        continue

    # ROUGE
    rouge_result = rouge.compute(predictions=[predicted], references=[reference])
    rouge1.append(rouge_result["rouge1"])
    rouge2.append(rouge_result["rouge2"])
    rougeL.append(rouge_result["rougeL"])


In [None]:
print("\n=== Average BLEU Scores ===")
print(f"BLEU-1: {sum(bleu_1)/len(bleu_1):.4f}")
print(f"BLEU-2: {sum(bleu_2)/len(bleu_2):.4f}")
print(f"BLEU-3: {sum(bleu_3)/len(bleu_3):.4f}")
print(f"BLEU-4: {sum(bleu_4)/len(bleu_4):.4f}")

if meteor_scores:
    print("\n=== Average METEOR ===")
    print(f"METEOR: {sum(meteor_scores)/len(meteor_scores):.4f}")
else:
    print("\n=== Average METEOR ===")
    print("No METEOR scores computed.")

print("\n=== ROUGE Scores ===")
print(f"ROUGE-1: {sum(rouge1)/len(rouge1):.4f}")
print(f"ROUGE-2: {sum(rouge2)/len(rouge2):.4f}")
print(f"ROUGE-L: {sum(rougeL)/len(rougeL):.4f}")



=== Average BLEU Scores ===
BLEU-1: 0.0540
BLEU-2: 0.0241
BLEU-3: 0.0152
BLEU-4: 0.0097

=== Average METEOR ===
METEOR: 0.1178

=== ROUGE Scores ===
ROUGE-1: 0.1135
ROUGE-2: 0.0182
ROUGE-L: 0.0880


In [None]:
df_results.to_csv("mistral_F_evaluation_results.csv", index=False)


In [None]:
import bert_score

# Extract predicted and reference lists
predictions = df_results["predicted"].tolist()
references = df_results["reference"].tolist()

# Run BERTScore (default model: roberta-large)
P, R, F1 = bert_score.score(predictions, references, lang="en", verbose=True)

# Print average scores
print("\n=== BERTScore ===")
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall:    {R.mean().item():.4f}")
print(f"F1:        {F1.mean().item():.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/24 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/13 [00:00<?, ?it/s]

done in 12.98 seconds, 61.64 sentences/sec

=== BERTScore ===
Precision: 0.8262
Recall:    0.8520
F1:        0.8387


In [None]:
!pip install sacrebleu


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


In [None]:
import evaluate
chrf = evaluate.load("chrf")
chrf_result = chrf.compute(predictions=df_results["predicted"].tolist(),
                           references=df_results["reference"].tolist())

print(f"ChrF: {chrf_result['score']:.4f}")

ChrF: 18.0778


In [None]:
def distinct_n(corpus, n):
    all_ngrams = set()
    total_ngrams = 0
    for sentence in corpus:
        tokens = sentence.strip().split()
        ngrams = list(zip(*[tokens[i:] for i in range(n)]))
        all_ngrams.update(ngrams)
        total_ngrams += len(ngrams)
    return len(all_ngrams) / (total_ngrams + 1e-8)

predictions = df_results["predicted"].tolist()

print(f"Distinct-1: {distinct_n(predictions, 1):.4f}")
print(f"Distinct-2: {distinct_n(predictions, 2):.4f}")


Distinct-1: 0.1231
Distinct-2: 0.4318


In [None]:
input_lens = df_results["input"].apply(lambda x: len(x.strip().split()))
ref_lens = df_results["reference"].apply(lambda x: len(x.strip().split()))
pred_lens = df_results["predicted"].apply(lambda x: len(x.strip().split()))

print(f"Avg input length: {input_lens.mean():.2f}")
print(f"Avg reference length: {ref_lens.mean():.2f}")
print(f"Avg predicted length: {pred_lens.mean():.2f}")


Avg input length: 18.40
Avg reference length: 13.76
Avg predicted length: 63.91


# Toxicity Analysis using Unitary Toxic-BERT
We use `unitary/toxic-bert` to detect whether any of the model's responses are toxic (with a threshold of 0.5). This ensures safety and appropriateness of the generated outputs.

In [None]:
from transformers import pipeline
toxicity_classifier = pipeline("text-classification", model="unitary/toxic-bert")

toxic_counts = 0
for pred in df_results["predicted"]:
    result = toxicity_classifier(pred)[0]
    if result['label'] == 'toxic' and result['score'] > 0.5:
        toxic_counts += 1

print(f"Toxic responses: {toxic_counts}/{len(df_results)}")


config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Toxic responses: 0/800


Result of evaluations


| Metric                    | Score        | Description                                        |
|---------------------------|--------------|----------------------------------------------------|
| **BLEU-1**                | 0.0540       | Unigram overlap                                    |
| **BLEU-2**                | 0.0241       | Bigram overlap                                     |
| **BLEU-3**                | 0.0152       | Trigram overlap                                    |
| **BLEU-4**                | 0.0097       | 4-gram overlap                                     |
| **METEOR**                | 0.1178       | Semantic + lexical match                           |
| **ROUGE-1**               | 0.1135       | Unigram overlap recall                             |
| **ROUGE-2**               | 0.0182       | Bigram overlap recall                              |
| **ROUGE-L**               | 0.0880       | Longest common subsequence                         |
| **BERTScore (Precision)** | 0.8262       | Token-level semantic match                         |
| **BERTScore (Recall)**    | 0.8520       | Recall of semantic similarity                      |
| **BERTScore (F1)**        | 0.8387       | F1 of semantic similarity                          |
| **Distinct-1**            | 0.1231       | Diversity – unique unigrams                        |
| **Distinct-2**            | 0.4318       | Diversity – unique bigrams                         |
| **Avg Input Length**      | 18.40 tokens | Average length of input prompts                    |
| **Avg Reference Length**  | 13.76 tokens | Average ground truth response length               |
| **Avg Predicted Length**  | 63.91 tokens | Average model output length                        |
| **Toxic Responses**       | 0 / 800      | Number of responses classified as toxic            |
