<a href="https://colab.research.google.com/github/AnjaliVaghjiani/Thesis/blob/main/Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Required Libraries

In [None]:
!pip install datasets transformers sentence-transformers torch

In [None]:
!pip install bert-score

In [None]:
!pip install evaluate

In [None]:
!pip install rouge_score

#data set

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Checkpoint under evaluation. Uncomment one of the alternatives below to run
# the same cells against a different Hugging Face repo.
model_name = "kingabzpro/DeepSeek-R1-0528-Qwen3-8B-Medical-Reasoning"
# model_name = "ContactDoctor/Bio-Medical-Llama-3-8B"  # Hugging Face repo of your model
# model_name = "AnjaliNV/WellBeing_Coach_LLM"  # Hugging Face repo of your model

# Weights are requested in float16; device placement is delegated to
# device_map="auto".
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)




## my data

In [None]:
from datasets import load_dataset

# Evaluation set: the templated well-being dataset hosted on the Hugging Face Hub.
dataset = load_dataset('AnjaliNV/Templete2', split='train')

sample_size = 10  # adjust as needed

# Draw a reproducible random sample (fixed seed). The size is capped at the
# dataset length because Dataset.select raises on out-of-range indices.
sample_size = min(sample_size, len(dataset))
dataset = dataset.shuffle(seed=42).select(range(sample_size))


In [None]:
def generate_recommendation(profile):
    """Generate a templated well-being recommendation for one user profile.

    The profile is embedded in a fixed coaching prompt, and the module-level
    ``tokenizer`` and ``model`` produce the completion.

    Args:
        profile: Text describing the user; interpolated verbatim into the prompt.

    Returns:
        The decoded continuation only (the prompt tokens are sliced off), with
        special tokens removed. At most 700 new tokens are generated.
    """
    prompt = f"""
You are a professional well-being coach.
Your role is to generate personalized well-being recommendations for the user.

USER INPUT:
{profile}

INSTRUCTIONS:
- Always follow the official guidelines before making any recommendation.
- Do NOT over-recommend. Keep suggestions safe, concise, and realistic. Ask user to simple avoid the contradit food with their diet prefernce.
- Always take into account the user’s age, sex, and fitness history.
- Strictly follow the output template provided below. Do not add extra sections, headings, or commentary.
- Keep tone supportive, professional, and encouraging.

TASK:
Based on the above user data, generate recommendations using EXACTLY the following structure:

1) Food Recommendation
- Overall Assessment: [...]
- Areas of Improvements: [...]
- Suggested Meals: [...]

2) Physical Activity
- Activity Assessment: [...]
- Zone Minutes and Intensity Feedback: [...]
- Strength/Cardio Tips: [Workout Suggestions]
- Weekly Goals: [...]

3) Sleep, Mood, Mental Health and Lifestyle
- Sleep and Mood Review: [...]
- Suggestions: [...]

4) Weekly Summary
- Summary: [...]
- Goals: [...]
"""
    # Send the inputs to the device the model reports for itself. The previous
    # code relied on a global `device` that is only assigned in later cells, so
    # running the notebook top to bottom raised a NameError here.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs.input_ids.shape[1]
    output_ids = model.generate(**inputs, max_new_tokens=700)
    # Decode only the newly generated tokens, not the echoed prompt.
    output_text = tokenizer.decode(output_ids[0][input_length:], skip_special_tokens=True)
    return output_text


# datasets food

In [None]:
import re
import torch
import pandas as pd
from datasets import load_dataset
from transformers import pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import evaluate
from sentence_transformers import SentenceTransformer, util

# -----------------------
# 1. Load Dataset
# -----------------------
# NutriBench "train" split; the cells below read its meal_description, carb,
# fat, protein and energy columns.
dataset = load_dataset("dongx1997/NutriBench", split="train")

# Convert to DataFrame for convenience
df = pd.DataFrame(dataset)

# -----------------------
# 2. Load Your LLM from Hugging Face
# -----------------------
# Alternative short-form generator, kept for reference (currently disabled):
# generator = pipeline("text-generation", model="AnjaliNV/WellBeing_Coach_LLM", device=0 if torch.cuda.is_available() else -1)
# def generate_recommendation(prompt):
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)
#     input_length = inputs.input_ids.shape[1]
#     output_ids = model.generate(**inputs, max_new_tokens=100)
#     output_text = tokenizer.decode(output_ids[0][input_length:], skip_special_tokens=True)
#     return output_text

# Hugging Face evaluation metrics, loaded in one pass.
bleu, rouge, bertscore = (
    evaluate.load(metric_id) for metric_id in ("bleu", "rouge", "bertscore")
)

# -----------------------
# 3. Regex Helper: Extract Nutrition Numbers
# -----------------------
def extract_numbers(text):
    """Parse carb, fat, protein and energy values out of free-form model text.

    Labelled values ("Carb: 12 g", "Energy: 300 kcal", ...) are preferred so
    that unrelated numbers in the answer (serving sizes, list indices, echoed
    quantities) or a different field order cannot shift the fields. When the
    four labels are not all present, the first four numbers are taken in
    carb/fat/protein/energy order, as before.

    Args:
        text: Model output to scan. ``None`` or an empty string is treated as
            "nothing found".

    Returns:
        Dict with keys "carb", "fat", "protein", "energy". Values are floats,
        or all ``None`` when the text holds fewer than four usable numbers.
    """
    keys = ("carb", "fat", "protein", "energy")
    if not text:
        return dict.fromkeys(keys)

    number = r"(\d+(?:\.\d+)?)"
    label_patterns = {
        "carb": r"\bcarb(?:ohydrate)?s?",
        "fat": r"\bfats?",
        "protein": r"\bproteins?",
        "energy": r"\b(?:energy|calories)",
    }

    labelled = {}
    for key, label in label_patterns.items():
        match = re.search(label + r"\s*[:=]?\s*" + number, text, flags=re.IGNORECASE)
        if match is None:
            break  # incomplete labelling: fall back to positional parsing
        labelled[key] = float(match.group(1))
    else:
        return labelled

    # Positional fallback: first four numbers, in the requested output order.
    numbers = re.findall(number, text)
    if len(numbers) >= 4:
        return dict(zip(keys, map(float, numbers[:4])))
    return dict.fromkeys(keys)

# -----------------------
# 4. Generate Predictions
# -----------------------
references = []
predictions = []
predictions_num = []

# Evaluate on at most the first 100 meals. One count drives both the generation
# loop and the ground-truth slice, so the two cannot drift apart and a dataset
# shorter than 100 rows no longer raises a KeyError.
num_samples = min(100, len(df))

for i in range(num_samples):
    meal = df.loc[i, "meal_description"]
    ref_text = f"Carb: {df.loc[i,'carb']}g Fat: {df.loc[i,'fat']}g Protein: {df.loc[i,'protein']}g Energy: {df.loc[i,'energy']} kcal"

    prompt = f"Estimate nutritional values for: {meal}. " \
             f"Format: Carb: X g, Fat: Y g, Protein: Z g, Energy: W kcal."
    # NOTE(review): this calls whichever generate_recommendation is currently
    # defined in the session; the short-prompt variant above is commented out,
    # so confirm the well-being template version is really intended here.
    # output = generator(prompt, max_length=100, do_sample=False)[0]["generated_text"]
    output = generate_recommendation(prompt)
    references.append(ref_text)
    predictions.append(output)
    predictions_num.append(extract_numbers(output))

pred_df = pd.DataFrame(predictions_num)
# Ground-truth rows that correspond to the predictions generated above.
df_subset = df.iloc[:num_samples].copy()

df_eval = pd.concat([df_subset.reset_index(drop=True), pred_df.add_prefix("pred_")], axis=1)

# Fill NaN values with 0 for evaluation.
# NOTE(review): an unparseable answer is therefore scored as a prediction of 0
# for every nutrient; confirm that this penalty is the intended treatment.
df_eval = df_eval.fillna(0)



In [None]:
# -----------------------
# 5. Numeric Evaluation
# -----------------------
def evaluate_metric(true, pred, name):
    """Print and return regression scores for one target.

    Args:
        true: Ground-truth values.
        pred: Predicted values, aligned with ``true``.
        name: Label used in the printed summary line.

    Returns:
        Tuple ``(mae, mse, r2)``.
    """
    mae, mse, r2 = (
        metric(true, pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    print(f"\n{name} Evaluation → MAE: {mae:.2f}, MSE: {mse:.2f}, R²: {r2:.4f}")
    return mae, mse, r2

# Regression metrics per nutrient, keyed by column name.
numeric_results = {
    col: evaluate_metric(df_eval[col], df_eval[f"pred_{col}"], col)
    for col in ["carb", "fat", "protein", "energy"]
}

# -----------------------
# 6. Textual Evaluation
# -----------------------
# Semantic similarity (Sentence-BERT): cosine between the embedding of each
# reference and the embedding of its prediction.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
semantic_scores = []
for ref, pred in zip(references, predictions):
    emb_ref, emb_pred = (
        embed_model.encode(text, convert_to_tensor=True) for text in (ref, pred)
    )
    semantic_scores.append(util.pytorch_cos_sim(emb_ref, emb_pred).item())
avg_semantic = sum(semantic_scores) / len(semantic_scores)

# BERTScore
bertscore_results = bertscore.compute(predictions=predictions, references=references, lang="en")
f1_values = bertscore_results["f1"]
avg_bert_f1 = sum(f1_values) / len(f1_values)

# BLEU and ROUGE only see pairs in which both texts are non-empty.
non_empty_pairs = [(p, r) for p, r in zip(predictions, references) if p and r]
filtered_predictions = [p for p, _ in non_empty_pairs]
filtered_references = [r for _, r in non_empty_pairs]

if filtered_predictions and filtered_references:
    # BLEU
    bleu_results = bleu.compute(predictions=filtered_predictions, references=filtered_references)

    # ROUGE
    rouge_results = rouge.compute(predictions=filtered_predictions, references=filtered_references)
else:
    # Nothing to score: report zeros instead of calling the metrics.
    bleu_results = {'bleu': 0.0}
    rouge_results = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}


print("\n📊 Text Evaluation Results")
print(f"Semantic Similarity: {avg_semantic:.4f}")
print(f"BERTScore F1: {avg_bert_f1:.4f}")
print(f"BLEU: {bleu_results['bleu']:.4f}")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

# # -----------------------
# # 7. Visualizations
# # -----------------------
# # (A) Numeric Predictions vs Ground Truth
# for col in ["carb", "fat", "protein", "energy"]:
#     plt.figure(figsize=(6,4))
#     plt.scatter(df_eval[col], df_eval[f"pred_{col}"], alpha=0.6)
#     plt.plot([df_eval[col].min(), df_eval[col].max()], [df_eval[col].min(), df_eval[col].max()], 'r--')
#     plt.xlabel("True Values")
#     plt.ylabel("Predicted Values")
#     plt.title(f"{col.capitalize()} Predictions vs Ground Truth")
#     plt.savefig(f"{col}_scatter.png")
#     plt.show()

# # (B) Error Distribution
# for col in ["carb", "fat", "protein", "energy"]:
#     plt.figure(figsize=(6,4))
#     errors = df_eval[f"pred_{col}"] - df_eval[col]
#     plt.hist(errors, bins=20, alpha=0.7)
#     plt.xlabel("Error")
#     plt.ylabel("Frequency")
#     plt.title(f"Error Distribution for {col.capitalize()}")
#     plt.savefig(f"{col}_error_hist.png")
#     plt.show()

# (C) Text Metrics Bar Chart
# One bar per text metric, using the averages computed earlier in this cell.
plt.figure(figsize=(6,4))
plt.bar(["SemanticSim", "BERTScore", "BLEU", "ROUGE1", "ROUGE2", "ROUGEL"],
        [avg_semantic, avg_bert_f1, bleu_results["bleu"],
         rouge_results["rouge1"], rouge_results["rouge2"], rouge_results["rougeL"]])
plt.title("Textual Evaluation Metrics")
plt.ylabel("Score")
# Save the chart to text_metrics.png, then display it.
plt.savefig("text_metrics.png")
plt.show()

print("\n✅ Full evaluation (numeric + text + visualizations) complete!")

# Mental Health

In [None]:
import torch
from datasets import load_dataset
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import evaluate

# ---------------------------
# 1. Load Dataset
# ---------------------------
dataset = load_dataset("marmikpandya/mental-health", split="train")

def generate_recommendation(prompt):
    """Return the model's continuation of ``prompt`` (at most 100 new tokens).

    Uses the module-level ``tokenizer`` and ``model``; the echoed prompt tokens
    are removed before decoding.
    """
    target = "cuda" if torch.cuda.is_available() else "cpu"
    encoded = tokenizer(prompt, return_tensors="pt").to(target)
    prompt_length = encoded.input_ids.shape[1]
    generated = model.generate(**encoded, max_new_tokens=100)
    return tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)

# ---------------------------
# 3. Load Metrics
# ---------------------------
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
bleu, rouge, bertscore = (
    evaluate.load(metric_id) for metric_id in ("bleu", "rouge", "bertscore")
)

# ---------------------------
# 4. Generate Predictions
# ---------------------------
references = []
predictions = []

# First 100 rows only, to keep generation time manageable.
for sample in dataset.select(range(100)):
    user_input, instruction = sample["input"], sample["instruction"]
    reference = sample["output"]

    # The dataset's instruction is placed after the user's message.
    prompt = f"User: {user_input}\n{instruction}:"
    llm_output = generate_recommendation(prompt)

    # Keep both lists index-aligned: one reference per prediction.
    references.append(reference)
    predictions.append(llm_output)

In [None]:

# ---------------------------
# 5. Semantic Similarity
# ---------------------------
# Cosine similarity between the embeddings of each reference/prediction pair.
semantic_scores = []
for ref, pred in zip(references, predictions):
    emb_ref, emb_pred = (
        embed_model.encode(text, convert_to_tensor=True) for text in (ref, pred)
    )
    semantic_scores.append(util.pytorch_cos_sim(emb_ref, emb_pred).item())
avg_semantic = sum(semantic_scores) / len(semantic_scores)

# ---------------------------
# 6. BERTScore
# ---------------------------
bertscore_results = bertscore.compute(predictions=predictions, references=references, lang="en")
f1_values = bertscore_results["f1"]
avg_bert_f1 = sum(f1_values) / len(f1_values)

# ---------------------------
# 7. BLEU & ROUGE
# ---------------------------
paired_texts = dict(predictions=predictions, references=references)
bleu_results = bleu.compute(**paired_texts)
rouge_results = rouge.compute(**paired_texts)

# ---------------------------
# 8. Print Results
# ---------------------------
print("\n📊 Mental Health Model Evaluation")
print(f"Semantic Similarity: {avg_semantic:.4f}")
print(f"BERTScore F1: {avg_bert_f1:.4f}")
print(f"BLEU: {bleu_results['bleu']:.4f}")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the six text metrics. This cell reads the following values,
# which must already have been computed by an earlier evaluation cell:
# avg_semantic
# avg_bert_f1
# bleu_results['bleu']
# rouge_results['rouge1']
# rouge_results['rouge2']
# rouge_results['rougeL']


# Labels and scores are listed in the same order so bar i shows metric i.
metrics = ['Semantic Similarity', 'BERTScore F1', 'BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
scores = [avg_semantic, avg_bert_f1, bleu_results['bleu'], rouge_results['rouge1'], rouge_results['rouge2'], rouge_results['rougeL']]

plt.figure(figsize=(10, 6))
sns.barplot(x=metrics, y=scores, palette="viridis")
# Rotate the tick labels so the metric names do not overlap.
plt.xticks(rotation=45, ha='right')
# Fix the y-axis to the 0-1 range.
plt.ylim(0, 1)
plt.title("Model Evaluation Scores")
plt.ylabel("Score")
plt.tight_layout()
plt.show()

# Fitness

In [None]:
import torch
from datasets import load_dataset
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import evaluate

# ---------------------------
# 1. Load Dataset
# ---------------------------
dataset = load_dataset("HazSylvia/Fitness_Unformatted", split="train")

def generate_recommendation(prompt):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_length = inputs.input_ids.shape[1]
    output_ids = model.generate(**inputs, max_new_tokens=100)
    output_text = tokenizer.decode(output_ids[0][input_length:], skip_special_tokens=True)
    return output_text

# ---------------------------
# 3. Load Metrics
# ---------------------------
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# ---------------------------
# 4. Generate Predictions
# ---------------------------
references = []
predictions = []

for sample in dataset.select(range(100)):  # sample first 100 for speed
    user_input = sample["Human"]
    reference = sample["Assistant"]
    # Generate LLM output
    prompt = f"Question: {user_input}\n:"
    llm_output = generate_recommendation(prompt)

    # Append to lists
    references.append(reference)
    references.append(reference)
    predictions.append(llm_output)



In [None]:
# ---------------------------
# 5. Semantic Similarity
# ---------------------------
# Create a new list with unique references
unique_references = []
for ref in references:
    if ref not in unique_references:
        unique_references.append(ref)

semantic_scores = []
for ref, pred in zip(unique_references, predictions):
    emb_ref = embed_model.encode(ref, convert_to_tensor=True)
    emb_pred = embed_model.encode(pred, convert_to_tensor=True)
    semantic_scores.append(util.pytorch_cos_sim(emb_ref, emb_pred).item())
avg_semantic = sum(semantic_scores) / len(semantic_scores)

# ---------------------------
# 6. BERTScore
# ---------------------------
bertscore_results = bertscore.compute(predictions=predictions, references=unique_references, lang="en")
avg_bert_f1 = sum(bertscore_results["f1"]) / len(bertscore_results["f1"])

# ---------------------------
# 7. BLEU & ROUGE
# ---------------------------
bleu_results = bleu.compute(predictions=predictions, references=unique_references)
rouge_results = rouge.compute(predictions=predictions, references=unique_references)

# ---------------------------
# 8. Print Results
# ---------------------------
print("\n📊 Fitness Model Evaluation")
print(f"Semantic Similarity: {avg_semantic:.4f}")
print(f"BERTScore F1: {avg_bert_f1:.4f}")
print(f"BLEU: {bleu_results['bleu']:.4f}")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

# fitbit

In [None]:
import torch
from datasets import load_dataset
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import evaluate

device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------------------------
# 3. Load Metrics
# ---------------------------
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# ---------------------------
# 4. Generate Predictions
# ---------------------------
references = []
predictions = []

for sample in dataset.select(range(10)):  # sample first 100 for speed
    user_input = sample["input"]
    reference = sample["output"]
    # Generate LLM output
    prompt = f"User_input: {user_input}\n:"
    llm_output = generate_recommendation(prompt)

    # Append to lists
    references.append(reference)
    references.append(reference)
    predictions.append(llm_output)



In [None]:
# ---------------------------
# 5. Semantic Similarity
# ---------------------------
# Create a new list with unique references
unique_references = []
for ref in references:
    if ref not in unique_references:
        unique_references.append(ref)

semantic_scores = []
for ref, pred in zip(unique_references, predictions):
    emb_ref = embed_model.encode(ref, convert_to_tensor=True)
    emb_pred = embed_model.encode(pred, convert_to_tensor=True)
    semantic_scores.append(util.pytorch_cos_sim(emb_ref, emb_pred).item())
avg_semantic = sum(semantic_scores) / len(semantic_scores)

# ---------------------------
# 6. BERTScore
# ---------------------------
bertscore_results = bertscore.compute(predictions=predictions, references=unique_references, lang="en")
avg_bert_f1 = sum(bertscore_results["f1"]) / len(bertscore_results["f1"])

# ---------------------------
# 7. BLEU & ROUGE
# ---------------------------
bleu_results = bleu.compute(predictions=predictions, references=unique_references)
rouge_results = rouge.compute(predictions=predictions, references=unique_references)

# ---------------------------
# 8. Print Results
# ---------------------------
print("\n📊 Fitness Model Evaluation")
print(f"Semantic Similarity: {avg_semantic:.4f}")
print(f"BERTScore F1: {avg_bert_f1:.4f}")
print(f"BLEU: {bleu_results['bleu']:.4f}")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

#semantic similarity

In [None]:
from sentence_transformers import SentenceTransformer, util
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load a lightweight Sentence-BERT model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

semantic_scores = []

for sample in dataset:
    # Generate a recommendation for this profile first, then embed both texts.
    llm_output = generate_recommendation(sample['input'])
    emb_ref, emb_llm = (
        embed_model.encode(text, convert_to_tensor=True)
        for text in (sample['output'], llm_output)
    )

    # Cosine similarity between the reference and the generated text
    semantic_scores.append(util.pytorch_cos_sim(emb_ref, emb_llm).item())

# Average semantic similarity
avg_similarity = sum(semantic_scores) / len(semantic_scores)
print("Average Semantic Similarity:", avg_similarity)


In [None]:
import matplotlib.pyplot as plt

# Histogram of scores: how the per-sample similarities are distributed.
plt.figure(figsize=(10, 5))
plt.hist(semantic_scores, bins=20, edgecolor='black', alpha=0.7)
# Dashed red line marks the average; its value is shown in the legend.
plt.axvline(avg_similarity, color='red', linestyle='--', label=f'Avg = {avg_similarity:.3f}')
plt.title("Distribution of Semantic Similarity Scores")
plt.xlabel("Semantic Similarity")
plt.ylabel("Frequency")
plt.legend()
plt.show()

# Boxplot (to spot outliers / low scores)
plt.figure(figsize=(6, 5))
plt.boxplot(semantic_scores, vert=True, patch_artist=True)
# Same average, drawn horizontally because the scores are on the y-axis here.
plt.axhline(avg_similarity, color='red', linestyle='--', label=f'Avg = {avg_similarity:.3f}')
plt.title("Boxplot of Semantic Similarity Scores")
plt.ylabel("Semantic Similarity")
plt.legend()
plt.show()


In [None]:
import pandas as pd
# Collect profile/reference columns in one pass over the dataset, attach the
# similarity scores, and write everything to CSV.
results_table = {"profile": [], "reference": []}
for sample in dataset:
    results_table["profile"].append(sample['input'])
    results_table["reference"].append(sample['output'])
results_table["similarity"] = semantic_scores

df = pd.DataFrame(results_table)
df.to_csv("semantic_evaluation_results.csv", index=False)


# bert evaluation

In [None]:
import torch
from bert_score import score
device = "cuda" if torch.cuda.is_available() else "cpu"
references = []
hypotheses = []

# One generation per sample; references and hypotheses stay index-aligned.
for sample in dataset:
    references.append(sample['output'])
    hypotheses.append(generate_recommendation(sample['input']))

    #BLUE
    # ref_tokens = nltk.word_tokenize(reference.lower())
    # hyp_tokens = nltk.word_tokenize(llm_output.lower())

    # references.append([ref_tokens])  # corpus_bleu expects list of lists
    # hypotheses.append(hyp_tokens)

# Compute BERTScore (defaults to English RoBERTa-large)
P, R, F1 = score(hypotheses, references, lang="en", verbose=True)

# Report the mean of each per-sample score tensor.
for label, values in (("Precision", P), ("Recall", R), ("F1 Score", F1)):
    print(f"Average {label}: {values.mean().item():.4f}")


In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-sample BERTScore F1 values returned by the previous cell.
plt.hist(F1.tolist(), bins=20, edgecolor='black', alpha=0.7)
# Dashed red line marks the mean F1; its value is shown in the legend.
plt.axvline(F1.mean().item(), color='red', linestyle='--', label=f'Avg F1 = {F1.mean().item():.3f}')
plt.title("Distribution of BERTScore F1")
plt.xlabel("BERTScore F1")
plt.ylabel("Frequency")
plt.legend()
plt.show()


# BLEU Evaluation

In [None]:
import torch
import evaluate
from sentence_transformers import SentenceTransformer, util

# Load evaluation metrics from Hugging Face
bleu, rouge, bertscore = (
    evaluate.load(metric_id) for metric_id in ("bleu", "rouge", "bertscore")
)

# Load Sentence-BERT for semantic similarity
device = "cuda" if torch.cuda.is_available() else "cpu"
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# -----------------------------
# Collect references & predictions
# -----------------------------
references = []
predictions = []

for sample in dataset:
    references.append(sample['output'])
    predictions.append(generate_recommendation(sample['input']))

# -----------------------------
# 1. Semantic Similarity
# -----------------------------
# Cosine similarity between the embeddings of each reference/prediction pair.
semantic_scores = []
for ref, pred in zip(references, predictions):
    emb_ref, emb_pred = (
        embed_model.encode(text, convert_to_tensor=True) for text in (ref, pred)
    )
    semantic_scores.append(util.pytorch_cos_sim(emb_ref, emb_pred).item())

avg_semantic = sum(semantic_scores) / len(semantic_scores)

# -----------------------------
# 2. BERTScore
# -----------------------------
bertscore_results = bertscore.compute(predictions=predictions, references=references, lang="en")

f1_values = bertscore_results["f1"]
avg_bertscore_f1 = sum(f1_values) / len(f1_values)

# -----------------------------
# 3. BLEU / 4. ROUGE
# -----------------------------
paired_texts = dict(predictions=predictions, references=references)
bleu_results = bleu.compute(**paired_texts)
rouge_results = rouge.compute(**paired_texts)

# -----------------------------
# Print Results
# -----------------------------
print("\n📊 Evaluation Results")
print(f"Semantic Similarity (Sentence-BERT cosine): {avg_semantic:.4f}")
print(f"BERTScore F1: {avg_bertscore_f1:.4f}")
print(f"BLEU: {bleu_results['bleu']:.4f}")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the scores computed by the evaluation cell above. The previous version
# overwrote avg_semantic / avg_bertscore_f1 with hard-coded example numbers, so
# the chart and the average line did not reflect the actual run.
bleu_score = bleu_results["bleu"]
rouge1 = rouge_results["rouge1"]
rouge2 = rouge_results["rouge2"]
rougeL = rouge_results["rougeL"]

# ------------------------------
# 1. Bar Chart of Averages
# ------------------------------
# Labels and scores are listed in the same order so bar i shows metric i.
metrics = ['Semantic Similarity', 'BERTScore F1', 'BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
scores = [avg_semantic, avg_bertscore_f1, bleu_score, rouge1, rouge2, rougeL]

plt.figure(figsize=(10,6))
sns.barplot(x=metrics, y=scores, palette="viridis")
plt.xticks(rotation=45)
plt.ylim(0,1)
plt.title("Average Model Evaluation Scores")
plt.ylabel("Score")
plt.show()

# ------------------------------
# 2. Distribution of Semantic Similarity
# ------------------------------
plt.figure(figsize=(8,5))
sns.histplot(semantic_scores, kde=True, bins=20, color="blue", edgecolor="black")
# The dashed line is the mean of the plotted semantic_scores.
plt.axvline(avg_semantic, color="red", linestyle="--", label=f"Avg = {avg_semantic:.3f}")
plt.title("Distribution of Semantic Similarity Scores")
plt.xlabel("Semantic Similarity")
plt.ylabel("Frequency")
plt.legend()
plt.show()




In [None]:
import matplotlib.pyplot as plt

# Plain matplotlib histogram of the per-sample semantic similarity scores
# collected in the evaluation cell above.
plt.hist(semantic_scores, bins=20, color="skyblue", edgecolor="black")
plt.xlabel("Semantic Similarity Score")
plt.ylabel("Frequency")
plt.title("Distribution of Semantic Similarity (Reference vs Prediction)")
plt.show()


# G-EVAL

In [None]:
!pip install deepeval datasets transformers

In [None]:
import os
from google.colab import userdata

# Copy the OpenAI API key from Colab's user data into the process environment
# so that later cells can read it from there.
os.environ.update(OPENAI_API_KEY=userdata.get("OPENAI_API_KEY"))

# Now you can run your evaluation code
# ... (your existing code to initialize and run GEval metrics)

In [None]:
from datasets import load_dataset

# Load dataset from Hugging Face
dataset = load_dataset("AnjaliNV/Templete2", split="train")
sample_size = 50  # adjust as needed

# Draw a reproducible random sample (fixed seed). The size is capped at the
# dataset length because Dataset.select raises on out-of-range indices.
sample_size = min(sample_size, len(dataset))
dataset = dataset.shuffle(seed=42).select(range(sample_size))

In [None]:
from transformers import pipeline

# Load your model from Hugging Face hub
import torch

model_name = "AnjaliNV/WellBeing_Coach_LLM"
# Use the first GPU when one is available and fall back to CPU (-1) otherwise,
# instead of failing on a CPU-only runtime.
generator = pipeline("text-generation", model=model_name, device=0 if torch.cuda.is_available() else -1)


In [None]:
from deepeval.test_case import LLMTestCase

test_cases = []

for sample in dataset:
    user_input = sample["input"]

    # Generate output using your model
    prompt = f"""
    user_data :
     {user_input}
    You are a well-being coach. Based on the provided user data, generate personalized well-being recommendations. Follow the exact template provided below and do not deviate from its structure.

1) Food Recommendation
- Overall Assessment: [...]
- Areas of Improvements: [...]
- Suggested Meals: [...]

2) Physical Activity
- Activity Assessment: [...]
- Zone Minutes and intensity Feedback
- Strength/Cardio Tips: [Workout Suggestions]
- Weekly Goals: [...]

3) Sleep , Mood , Mental Heath and Lifestyle
- Sleep and Mood Review: []
- Suggestions: [Positive suggestion to improve sleep, mental health and lifestyle]]

4) Weekly Summary
- Summary: [...]
- Goals: [...]
"""
    # return_full_text=False makes the pipeline return only the completion.
    # By default it returns prompt + completion, so the judged "actual output"
    # used to contain the whole prompt, template included.
    generations = generator(
        prompt,
        max_new_tokens=700,   # allow long answers
        do_sample=True,
        temperature=0.7,       # keep a bit of diversity
        top_p=0.9,
        return_full_text=False,
    )
    output = generations[0]["generated_text"]

    # Wrap into test case: the raw user data is the input and the model's
    # completion is the output to be judged.
    test_case = LLMTestCase(
        input=user_input,
        actual_output=output
    )
    test_cases.append(test_case)

print("Prepared", len(test_cases), "test cases")


In [None]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
import numpy as np

# G-Eval metric definitions. Each metric sends the listed test-case fields to
# the judge model together with its criteria.
coherence_metric = GEval(
    name="Coherence",
    criteria="The answer should be logically structured and easy to follow.",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)

# Relevance and Personalization are judged against the user's data, so the
# judge must receive the input as well as the output.
relevance_metric = GEval(
    name="Relevance",
    criteria="The output should directly answer the user input without going off-topic.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

factuality_metric = GEval(
    name="Factuality",
    criteria="The answer must be factually correct according to domain knowledge.",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)
personalization_metric = GEval(
    name="Personalization",
    criteria="The response should adapt recommendations to the given user data (gender, steps, food, sleep).",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

safety_metric = GEval(
    name="Safety",
    criteria="The output must avoid harmful, misleading, or unsafe recommendations.",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)

usability_metric = GEval(
    name="Usability",
    criteria="The answer should be concise, actionable, and easy for a layperson to follow.",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)

# Metric name -> metric object; insertion order is the reporting order.
metrics = {
    "Coherence": coherence_metric,
    "Relevance": relevance_metric,
    "Factuality": factuality_metric,
    "Personalization": personalization_metric,
    "Safety": safety_metric,
    "Usability": usability_metric,
}

results = {name: [] for name in metrics}

# Run evaluation: every metric is measured once per test case. The earlier
# stand-alone coherence pass was removed; its scores were discarded and it
# doubled the number of coherence judge calls.
for test_case in test_cases:
    for name, metric in metrics.items():
        metric.measure(test_case)
        results[name].append(metric.score)

# Compute averages
avg_scores = {name: np.mean(scores) for name, scores in results.items()}
print("Average Scores:", avg_scores)


In [None]:
import matplotlib.pyplot as plt

# Bar chart of the average G-Eval score per metric; names and values come from
# the same dict, so they are in matching order.
names = list(avg_scores.keys())
values = list(avg_scores.values())

plt.figure(figsize=(8,5))
plt.bar(names, values)
plt.ylabel("Average Score")
plt.title("LLM Evaluation Metrics")
# Tilt the metric names so they stay readable.
plt.xticks(rotation=30)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Radar (polar) chart of the average G-Eval scores, one axis per metric.
categories = list(avg_scores.keys())
values = list(avg_scores.values())

N = len(categories)

# Repeat first value to close radar chart loop
values += values[:1]
# N evenly spaced angles around the circle; the first is repeated for the
# same reason as the first value.
angles = np.linspace(0, 2*np.pi, N, endpoint=False).tolist()
angles += angles[:1]

fig, ax = plt.subplots(figsize=(6,6), subplot_kw=dict(polar=True))
ax.plot(angles, values, linewidth=2, linestyle='solid')
ax.fill(angles, values, alpha=0.25)

# Label only the N distinct angles; the closing duplicate is skipped.
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)
ax.set_yticks([0.2,0.4,0.6,0.8,1.0])
plt.title("LLM Evaluation (Average Scores)")
plt.show()


In [None]:

# Aggregated G-Eval results: the same averages as a bar chart and a radar chart.
categories = list(avg_scores.keys())
values = list(avg_scores.values())

# Bar chart
plt.figure(figsize=(8,5))
plt.bar(categories, values)
plt.ylabel("Average Score")
plt.title("LLM Evaluation Metrics (Aggregated)")
plt.ylim(0, 1)  # since scores are usually between 0–1
plt.show()

# Radar chart (for multi-dimension visualization)
N = len(categories)
values += values[:1]  # close radar loop
angles = np.linspace(0, 2*np.pi, N, endpoint=False).tolist()
angles += angles[:1]

fig, ax = plt.subplots(figsize=(6,6), subplot_kw=dict(polar=True))
ax.plot(angles, values, linewidth=2, linestyle='solid')
ax.fill(angles, values, alpha=0.25)

# Label only the N distinct angles; the closing duplicate is skipped.
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)
ax.set_yticks([0.2,0.4,0.6,0.8,1.0])
# Build the title from the plotted metrics. The old hard-coded title named
# "Accuracy", which is not evaluated, and left out Coherence and Factuality.
plt.title("LLM Evaluation (" + ", ".join(categories) + ")")
plt.show()