In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from datasets import Dataset
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
import evaluate
from bert_score import score as bertscore
from sentence_transformers import SentenceTransformer, util
from evaluate import load

# Load evaluation metrics
bleu_metric = evaluate.load("bleu")
accuracy_metric = evaluate.load("accuracy")
rouge_metric = evaluate.load("rouge")


# Load predictions from your JSON file
def load_predictions(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

# Function to compute contextual appropriateness (embedding-based similarity)
def compute_contextual_appropriateness(ground_truth, responses, model_name="all-MiniLM-L6-v2"):
    # Load embedding model
    embedding_model = SentenceTransformer(model_name)

    # Compute embeddings
    ground_truth_embeddings = embedding_model.encode(ground_truth, convert_to_tensor=True)
    response_embeddings = embedding_model.encode(responses, convert_to_tensor=True)

    # Compute cosine similarity for corresponding pairs
    similarities = []
    for gt_embed, resp_embed in zip(ground_truth_embeddings, response_embeddings):
        similarity = util.cos_sim(gt_embed, resp_embed).item()  # Get similarity as a scalar value
        similarities.append(similarity)

    # Print the individual similarities for each pair
    # print(similarities)

    # Return the average similarity
    return sum(similarities) / len(similarities) if similarities else 0.0

# Function to run evaluation
def evaluate_predictions(predictions):
    predicted_outputs = []
    references = []
    contexts = []
    instructions = []

    for sample in predictions:
        # Collect predicted and reference responses
        predicted_outputs.append(sample["predicted_response"])
        references.append(sample["response"])  # Ground truth is in the 'response' field

    # Compute BLEU score
    bleu_score = bleu_metric.compute(predictions=predicted_outputs, references=references)

    # Compute Exact Match (EM)
    exact_match_accuracy = sum([1 for p, r in zip(predicted_outputs, references) if p == r[0]]) / len(references)

    # Compute ROUGE scores
    rouge_scores = rouge_metric.compute(predictions=predicted_outputs, references=[ref[0] for ref in references])

    # Compute BERTScore
    P, R, F1 = bertscore(predicted_outputs, [ref[0] for ref in references], lang="en")

    # Compute Contextual Appropriateness (Embedding-Based)

    contextual_appropriateness = compute_contextual_appropriateness(references, predicted_outputs)

    print("Evaluation Metrics:")
    print("====================")
    print(f"BLEU Score: {bleu_score['bleu']}")
    print(f"Exact Match Accuracy: {exact_match_accuracy}")
    print("ROUGE Scores:")
    for key, value in rouge_scores.items():
        print(f"  {key.upper()}: {value}")
    print("BERTScore:")
    print(f"  Precision: {P.mean()}")
    print(f"  Recall: {R.mean()}")
    print(f"  F1 Score: {F1.mean()}")
    print(f"Cosine Similarity (Contextual Appropriateness): {contextual_appropriateness:.2f}")

# # Example
# predictions = load_predictions("/content/predictions_test_dataset_classification.json")
# evaluate_predictions(predictions)

In [3]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_3b_afterft_test_dataset_brainstorming.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.04143476935496052
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.00684952724587649
  ROUGE2: 0.0
  ROUGEL: 0.006874268452085391
  ROUGELSUM: 0.00681170726756815
BERTScore:
  Precision: 0.7545576691627502
  Recall: 0.8349120020866394
  F1 Score: 0.7924672365188599
Cosine Similarity (Contextual Appropriateness): 0.59


In [5]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_3b_afterft_test_dataset_classification.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.12953438468619694
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.0049270568665882135
  ROUGE2: 0.0
  ROUGEL: 0.004881139997433746
  ROUGELSUM: 0.004994465433858675
BERTScore:
  Precision: 0.7343763113021851
  Recall: 0.8341160416603088
  F1 Score: 0.7807930111885071
Cosine Similarity (Contextual Appropriateness): 0.73


In [6]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_3b_afterft_test_dataset_closed_qa.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.17251017722293752
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.003902769620963754
  ROUGE2: 0.0
  ROUGEL: 0.0038009208920352705
  ROUGELSUM: 0.0038737421639943545
BERTScore:
  Precision: 0.7447069883346558
  Recall: 0.8329216837882996
  F1 Score: 0.7860437631607056
Cosine Similarity (Contextual Appropriateness): 0.71


In [7]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_3b_afterft_test_dataset_creative_writing.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.011883903431398744
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.00446501399669
  ROUGE2: 0.0
  ROUGEL: 0.004389751961637937
  ROUGELSUM: 0.00448452376313613
BERTScore:
  Precision: 0.7578727006912231
  Recall: 0.8355059027671814
  F1 Score: 0.794640064239502
Cosine Similarity (Contextual Appropriateness): 0.56


In [8]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_3b_afterft_test_dataset_general_qa.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.0236693531277319
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.005109967017030415
  ROUGE2: 0.0
  ROUGEL: 0.005139325079950808
  ROUGELSUM: 0.005129334544315017
BERTScore:
  Precision: 0.7559553980827332
  Recall: 0.8356185555458069
  F1 Score: 0.7935957312583923
Cosine Similarity (Contextual Appropriateness): 0.66


In [9]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_3b_afterft_test_dataset_information_extraction.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.17658631273730488
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.003574087804453012
  ROUGE2: 0.0
  ROUGEL: 0.0036124765602300038
  ROUGELSUM: 0.0035788344516843717
BERTScore:
  Precision: 0.7389606237411499
  Recall: 0.8294253945350647
  F1 Score: 0.781273603439331
Cosine Similarity (Contextual Appropriateness): 0.73


In [10]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_3b_afterft_test_dataset_open_qa.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.044894400688716474
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.006791997454530168
  ROUGE2: 0.0
  ROUGEL: 0.0067589368267555825
  ROUGELSUM: 0.006779568900908339
BERTScore:
  Precision: 0.7476020455360413
  Recall: 0.8358011841773987
  F1 Score: 0.7889477610588074
Cosine Similarity (Contextual Appropriateness): 0.62


In [11]:
predictions = load_predictions("/mnt/c/Users/IoT_lab_YU/dengyiliu/llama3/llama3.2finetune/llama3.2/evaluate/predictions_3b_afterft_test_dataset_summarization.jsonl")

# Run evaluation
evaluate_predictions(predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU Score: 0.09743749398076966
Exact Match Accuracy: 0.0
ROUGE Scores:
  ROUGE1: 0.008851405463924093
  ROUGE2: 0.0
  ROUGEL: 0.008932418146646233
  ROUGELSUM: 0.008921290477559072
BERTScore:
  Precision: 0.7485904693603516
  Recall: 0.8268677592277527
  F1 Score: 0.7856000661849976
Cosine Similarity (Contextual Appropriateness): 0.77
