In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from datasets import Dataset
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
import evaluate
from bert_score import score as bertscore
from sentence_transformers import SentenceTransformer, util
from evaluate import load

# Load evaluation metrics
bleu_metric = evaluate.load("bleu")
accuracy_metric = evaluate.load("accuracy")
rouge_metric = evaluate.load("rouge")


# Load predictions from your JSON file
def load_predictions(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

# Function to compute contextual appropriateness (embedding-based similarity)
def compute_contextual_appropriateness(ground_truth, responses, model_name="all-MiniLM-L6-v2"):
    # Load embedding model
    embedding_model = SentenceTransformer(model_name)

    # Compute embeddings
    ground_truth_embeddings = embedding_model.encode(ground_truth, convert_to_tensor=True)
    response_embeddings = embedding_model.encode(responses, convert_to_tensor=True)

    # Compute cosine similarity for corresponding pairs
    similarities = []
    for gt_embed, resp_embed in zip(ground_truth_embeddings, response_embeddings):
        similarity = util.cos_sim(gt_embed, resp_embed).item()  # Get similarity as a scalar value
        similarities.append(similarity)

    # Print the individual similarities for each pair
    # print(similarities)

    # Return the average similarity
    return sum(similarities) / len(similarities) if similarities else 0.0

# Function to run evaluation
def evaluate_predictions(predictions):
    predicted_outputs = []
    references = []
    contexts = []
    instructions = []

    for sample in predictions:
        # Collect predicted and reference responses
        predicted_outputs.append(sample["predicted_response"])
        references.append(sample["response"])  # Ground truth is in the 'response' field

    # Compute BLEU score
    bleu_score = bleu_metric.compute(predictions=predicted_outputs, references=references)

    # Compute Exact Match (EM)
    exact_match_accuracy = sum([1 for p, r in zip(predicted_outputs, references) if p == r[0]]) / len(references)

    # Compute ROUGE scores
    rouge_scores = rouge_metric.compute(predictions=predicted_outputs, references=[ref[0] for ref in references])

    # Compute BERTScore
    P, R, F1 = bertscore(predicted_outputs, [ref[0] for ref in references], lang="en")

    # Compute Contextual Appropriateness (Embedding-Based)

    contextual_appropriateness = compute_contextual_appropriateness(references, predicted_outputs)

    print("Evaluation Metrics:")
    print("====================")
    print(f"BLEU Score: {bleu_score['bleu']}")
    print(f"Exact Match Accuracy: {exact_match_accuracy}")
    print("ROUGE Scores:")
    for key, value in rouge_scores.items():
        print(f"  {key.upper()}: {value}")
    print("BERTScore:")
    print(f"  Precision: {P.mean()}")
    print(f"  Recall: {R.mean()}")
    print(f"  F1 Score: {F1.mean()}")
    print(f"Cosine Similarity (Contextual Appropriateness): {contextual_appropriateness:.2f}")

# # Example
# predictions = load_predictions("/content/predictions_test_dataset_classification.json")
# evaluate_predictions(predictions)

In [3]:
#after ft eva

predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_brainstorming.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.02812680931334385
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.005146431792876331
  ROUGE2: 0.0
  ROUGEL: 0.005156193220211549
  ROUGELSUM: 0.0051915049784720755
BERTScore:
  Precision: 0.7574295401573181
  Recall: 0.8405299186706543
  F1 Score: 0.7966559529304504
Cosine Similarity (Contextual Appropriateness): 0.63


In [18]:

predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8bft_test_dataset_brainstorming.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.030374035822168723
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.006386634648973823
  ROUGE2: 0.0
  ROUGEL: 0.0063463818025219845
  ROUGELSUM: 0.0064081192409692605
BERTScore:
  Precision: 0.7558797001838684
  Recall: 0.8296282291412354
  F1 Score: 0.7907983660697937
Cosine Similarity (Contextual Appropriateness): 0.59


In [4]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_classification.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.04145679820936703
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.005206428102022298
  ROUGE2: 0.0
  ROUGEL: 0.005210517831898242
  ROUGELSUM: 0.005214840722875525
BERTScore:
  Precision: 0.7508653402328491
  Recall: 0.83944171667099
  F1 Score: 0.7925155758857727
Cosine Similarity (Contextual Appropriateness): 0.74


In [17]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8bft_test_dataset_classification.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.11066695094462599
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.003423057810104591
  ROUGE2: 0.0
  ROUGEL: 0.0034334932006263065
  ROUGELSUM: 0.0033530935823151437
BERTScore:
  Precision: 0.7414257526397705
  Recall: 0.8288153409957886
  F1 Score: 0.7824397087097168
Cosine Similarity (Contextual Appropriateness): 0.73


In [5]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_closed_qa.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.13330357470388665
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.003909042240284728
  ROUGE2: 0.0
  ROUGEL: 0.0038937346918775048
  ROUGELSUM: 0.00391092630772367
BERTScore:
  Precision: 0.750071108341217
  Recall: 0.823983371257782
  F1 Score: 0.785168468952179
Cosine Similarity (Contextual Appropriateness): 0.77


In [16]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8bft_test_dataset_closed_qa.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.16703296108174936
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.0045779402061450385
  ROUGE2: 0.0
  ROUGEL: 0.004616541538238163
  ROUGELSUM: 0.0046388375932204
BERTScore:
  Precision: 0.7463940382003784
  Recall: 0.8255972862243652
  F1 Score: 0.7838257551193237
Cosine Similarity (Contextual Appropriateness): 0.73


In [6]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_creative_writing.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.02191809713890451
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.00510216250756087
  ROUGE2: 0.0
  ROUGEL: 0.00511448519048404
  ROUGELSUM: 0.005097696907716175
BERTScore:
  Precision: 0.7588837742805481
  Recall: 0.8363004326820374
  F1 Score: 0.7955341339111328
Cosine Similarity (Contextual Appropriateness): 0.61


In [15]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8bft_test_dataset_creative_writing.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.011454879053419017
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.003925923852613176
  ROUGE2: 0.0
  ROUGEL: 0.003872618279710735
  ROUGELSUM: 0.0038963684430848118
BERTScore:
  Precision: 0.7594756484031677
  Recall: 0.8316750526428223
  F1 Score: 0.793735682964325
Cosine Similarity (Contextual Appropriateness): 0.56


In [7]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_general_qa.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.04642434619992193
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.003963960696266216
  ROUGE2: 0.0
  ROUGEL: 0.003949279480109907
  ROUGELSUM: 0.003987505330162918
BERTScore:
  Precision: 0.7598872780799866
  Recall: 0.8392840027809143
  F1 Score: 0.797468900680542
Cosine Similarity (Contextual Appropriateness): 0.73


In [14]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8bft_test_dataset_general_qa.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.023642383912282728
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.0049053907459990585
  ROUGE2: 0.0
  ROUGEL: 0.004819481530440814
  ROUGELSUM: 0.00490828700322618
BERTScore:
  Precision: 0.7575551271438599
  Recall: 0.832028329372406
  F1 Score: 0.7928701043128967
Cosine Similarity (Contextual Appropriateness): 0.66


In [8]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_information_extraction.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.17416116033105894
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.004206325016230125
  ROUGE2: 0.0
  ROUGEL: 0.004181761971203091
  ROUGELSUM: 0.004131155331802168
BERTScore:
  Precision: 0.7482337355613708
  Recall: 0.8288138508796692
  F1 Score: 0.7862796783447266
Cosine Similarity (Contextual Appropriateness): 0.73


In [13]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8bft_test_dataset_information_extraction.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.16817022741189291
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.0046134188058598695
  ROUGE2: 0.0
  ROUGEL: 0.004730962478241635
  ROUGELSUM: 0.004715125844335291
BERTScore:
  Precision: 0.7415175437927246
  Recall: 0.8264254927635193
  F1 Score: 0.7814009785652161
Cosine Similarity (Contextual Appropriateness): 0.72


In [9]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_open_qa.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.04538278622079109
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.004441804935587351
  ROUGE2: 0.0
  ROUGEL: 0.004472801151414532
  ROUGELSUM: 0.0044049564548703304
BERTScore:
  Precision: 0.7546486258506775
  Recall: 0.8374326825141907
  F1 Score: 0.7937356233596802
Cosine Similarity (Contextual Appropriateness): 0.68


In [12]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8bft_test_dataset_open_qa.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.04430628921290098
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.0057064653405631115
  ROUGE2: 0.0
  ROUGEL: 0.005710027156754198
  ROUGELSUM: 0.0056473189358349695
BERTScore:
  Precision: 0.7482785582542419
  Recall: 0.8292772769927979
  F1 Score: 0.7864561080932617
Cosine Similarity (Contextual Appropriateness): 0.63


In [10]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8b_beforeft_test_dataset_summarization.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.15852701600761412
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.005857705414602541
  ROUGE2: 0.0
  ROUGEL: 0.005928647661043427
  ROUGELSUM: 0.005852980555706419
BERTScore:
  Precision: 0.7516302466392517
  Recall: 0.8262122273445129
  F1 Score: 0.7870107889175415
Cosine Similarity (Contextual Appropriateness): 0.83


In [11]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_8bft_test_dataset_summarization.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.09439940696524764
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.004716152398072004
  ROUGE2: 0.0
  ROUGEL: 0.004877678077535218
  ROUGELSUM: 0.004865036613849591
BERTScore:
  Precision: 0.7501161694526672
  Recall: 0.8243694305419922
  F1 Score: 0.785338819026947
Cosine Similarity (Contextual Appropriateness): 0.79
