<a href="https://colab.research.google.com/github/AnjaliVaghjiani/Thesis/blob/main/Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Required Libraries

In [None]:
!pip install datasets transformers sentence-transformers torch

In [None]:
!pip install bert-score

In [None]:
!pip install evaluate

In [None]:
!pip install rouge_score

# Dataset

In [None]:
from datasets import load_dataset

# Hugging Face dataset repo providing the evaluation rows; later cells read
# the 'input' and 'output' fields of each row.
full_dataset = load_dataset('AnjaliNV/Templete2', split='train')

sample_size = 100  # adjust as needed

# Clamp the request to the split size so select() cannot fail with an
# out-of-range index when the split has fewer than `sample_size` rows.
n_selected = min(sample_size, len(full_dataset))

# Fixed seed keeps the random subset identical across kernel restarts.
dataset = full_dataset.shuffle(seed=42).select(range(n_selected))


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Hugging Face repo of the model under evaluation.
# Alternative checkpoint (commented out): "ContactDoctor/Bio-Medical-Llama-3-8B"
model_name = "AnjaliNV/WellBeing_Coach_LLM"

# Half-precision weights; device placement is delegated to device_map="auto".
load_options = {"torch_dtype": torch.float16, "device_map": "auto"}

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)




In [None]:
def generate_recommendation(profile):
    """Generate a templated well-being recommendation for one user profile.

    The profile is embedded under ``USER INPUT`` in a fixed coaching prompt,
    the module-level ``model`` generates up to 700 new tokens, and only the
    newly generated tokens (the prompt is stripped) are decoded and returned.

    Args:
        profile: User data to insert into the prompt; it is interpolated
            with an f-string, so any object with a string form works.

    Returns:
        str: The decoded continuation, with special tokens removed.
    """
    prompt = f"""
You are a professional well-being coach.
Your role is to generate personalized well-being recommendations for the user.

USER INPUT:
{profile}

INSTRUCTIONS:
- Always follow the official guidelines before making any recommendation.
- Do NOT over-recommend. Keep suggestions safe, concise, and realistic. Ask user to simple avoid the contradit food with their diet prefernce.
- Always take into account the user’s age, sex, and fitness history.
- Strictly follow the output template provided below. Do not add extra sections, headings, or commentary.
- Keep tone supportive, professional, and encouraging.

TASK:
Based on the above user data, generate recommendations using EXACTLY the following structure:

1) Food Recommendation
- Overall Assessment: [...]
- Areas of Improvements: [...]
- Suggested Meals: [...]

2) Physical Activity
- Activity Assessment: [...]
- Zone Minutes and Intensity Feedback: [...]
- Strength/Cardio Tips: [Workout Suggestions]
- Weekly Goals: [...]

3) Sleep, Mood, Mental Health and Lifestyle
- Sleep and Mood Review: [...]
- Suggestions: [...]

4) Weekly Summary
- Summary: [...]
- Goals: [...]
"""
    # Send the inputs to the device the model itself reports instead of a
    # global `device` that is only defined in a later cell. The model is
    # loaded with device_map="auto", so its placement is decided at load time.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs.input_ids.shape[1]
    output_ids = model.generate(**inputs, max_new_tokens=700)
    # Slice off the echoed prompt so only the generated continuation remains.
    generated_ids = output_ids[0][prompt_token_count:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)


# Semantic Similarity

In [None]:
from sentence_transformers import SentenceTransformer, util

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load a lightweight Sentence-BERT model on the selected device (the same
# way the combined-metrics cell constructs it).
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# One cosine-similarity score per dataset row, in dataset order.
semantic_scores = []

for sample in dataset:
    profile = sample['input']
    reference = sample['output']

    llm_output = generate_recommendation(profile)

    # Embed the reference answer and the model answer as single vectors.
    emb_ref = embed_model.encode(reference, convert_to_tensor=True)
    emb_llm = embed_model.encode(llm_output, convert_to_tensor=True)

    # Cosine similarity between the two embeddings, as a Python float.
    similarity = util.pytorch_cos_sim(emb_ref, emb_llm).item()
    semantic_scores.append(similarity)

# Average semantic similarity; NaN (instead of ZeroDivisionError) when the
# sampled dataset is empty.
avg_similarity = (
    sum(semantic_scores) / len(semantic_scores) if semantic_scores else float("nan")
)
print("Average Semantic Similarity:", avg_similarity)


In [None]:
import matplotlib.pyplot as plt

avg_label = f'Avg = {avg_similarity:.3f}'

# Histogram: how the per-sample similarity scores are distributed.
hist_fig, hist_ax = plt.subplots(figsize=(10, 5))
hist_ax.hist(semantic_scores, bins=20, edgecolor='black', alpha=0.7)
hist_ax.axvline(avg_similarity, color='red', linestyle='--', label=avg_label)
hist_ax.set_title("Distribution of Semantic Similarity Scores")
hist_ax.set_xlabel("Semantic Similarity")
hist_ax.set_ylabel("Frequency")
hist_ax.legend()
plt.show()

# Boxplot: makes outliers and low-scoring samples easy to spot.
box_fig, box_ax = plt.subplots(figsize=(6, 5))
box_ax.boxplot(semantic_scores, vert=True, patch_artist=True)
box_ax.axhline(avg_similarity, color='red', linestyle='--', label=avg_label)
box_ax.set_title("Boxplot of Semantic Similarity Scores")
box_ax.set_ylabel("Semantic Similarity")
box_ax.legend()
plt.show()


In [None]:
import pandas as pd

# Collect the profile and reference text of every row in dataset order.
profile_column = []
reference_column = []
for row in dataset:
    profile_column.append(row['input'])
    reference_column.append(row['output'])

# One row per evaluated sample, paired with its similarity score.
df = pd.DataFrame({
    "profile": profile_column,
    "reference": reference_column,
    "similarity": semantic_scores,
})
df.to_csv("semantic_evaluation_results.csv", index=False)


# BERTScore Evaluation

In [None]:
import torch
from bert_score import score

device = "cuda" if torch.cuda.is_available() else "cpu"

# Parallel lists: hypotheses[i] is the model output for the profile whose
# gold answer is references[i].
references = []
hypotheses = []

for example in dataset:
    generated_text = generate_recommendation(example['input'])
    references.append(example['output'])
    hypotheses.append(generated_text)

# BERTScore with lang="en"; returns per-sample precision, recall and F1.
P, R, F1 = score(hypotheses, references, lang="en", verbose=True)

mean_precision = P.mean().item()
mean_recall = R.mean().item()
mean_f1 = F1.mean().item()

print(f"Average Precision: {mean_precision:.4f}")
print(f"Average Recall: {mean_recall:.4f}")
print(f"Average F1 Score: {mean_f1:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Per-sample F1 values and their mean, taken from the BERTScore tensors.
f1_values = F1.tolist()
f1_mean = F1.mean().item()

f1_fig, f1_ax = plt.subplots()
f1_ax.hist(f1_values, bins=20, edgecolor='black', alpha=0.7)
f1_ax.axvline(f1_mean, color='red', linestyle='--', label=f'Avg F1 = {f1_mean:.3f}')
f1_ax.set_title("Distribution of BERTScore F1")
f1_ax.set_xlabel("BERTScore F1")
f1_ax.set_ylabel("Frequency")
f1_ax.legend()
plt.show()


# BLEU Evaluation

In [None]:
import torch
import evaluate
from sentence_transformers import SentenceTransformer, util

# Load evaluation metrics from Hugging Face
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Load Sentence-BERT for semantic similarity
device = "cuda" if torch.cuda.is_available() else "cpu"
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# -----------------------------
# Collect references & predictions
# -----------------------------
# Parallel lists: predictions[i] is the model output for the profile whose
# gold answer is references[i].
references = []
predictions = []

for sample in dataset:
    profile = sample['input']
    reference = sample['output']

    llm_output = generate_recommendation(profile)

    references.append(reference)
    predictions.append(llm_output)

# Fail early with a clear message rather than a ZeroDivisionError further down.
if not predictions:
    raise ValueError("No samples to evaluate: the dataset is empty.")

# -----------------------------
# 1. Semantic Similarity
# -----------------------------
# Encode each side as one batch instead of one encode() call per text, then
# take the row-wise cosine similarity of matching reference/prediction pairs.
emb_refs = embed_model.encode(references, convert_to_tensor=True)
emb_preds = embed_model.encode(predictions, convert_to_tensor=True)
semantic_scores = torch.nn.functional.cosine_similarity(emb_refs, emb_preds, dim=1).tolist()

avg_semantic = sum(semantic_scores) / len(semantic_scores)

# -----------------------------
# 2. BERTScore
# -----------------------------
bertscore_results = bertscore.compute(predictions=predictions, references=references, lang="en")

avg_bertscore_f1 = sum(bertscore_results["f1"]) / len(bertscore_results["f1"])

# -----------------------------
# 3. BLEU
# -----------------------------
bleu_results = bleu.compute(predictions=predictions, references=references)

# -----------------------------
# 4. ROUGE
# -----------------------------
rouge_results = rouge.compute(predictions=predictions, references=references)

# -----------------------------
# Print Results
# -----------------------------
print("\n📊 Evaluation Results")
print(f"Semantic Similarity (Sentence-BERT cosine): {avg_semantic:.4f}")
print(f"BERTScore F1: {avg_bertscore_f1:.4f}")
print(f"BLEU: {bleu_results['bleu']:.4f}")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use the averages computed by the evaluation cell. `avg_semantic` and
# `avg_bertscore_f1` are deliberately NOT reassigned here: hard-coded
# placeholder numbers would overwrite the measured results and put made-up
# values on the plots.
bleu_score = bleu_results['bleu']
rouge1 = rouge_results['rouge1']
rouge2 = rouge_results['rouge2']
rougeL = rouge_results['rougeL']

# ------------------------------
# 1. Bar Chart of Averages
# ------------------------------
metrics = ['Semantic Similarity', 'BERTScore F1', 'BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
scores = [avg_semantic, avg_bertscore_f1, bleu_score, rouge1, rouge2, rougeL]

plt.figure(figsize=(10,6))
sns.barplot(x=metrics, y=scores, palette="viridis")
plt.xticks(rotation=45)
plt.ylim(0,1)
plt.title("Average Model Evaluation Scores")
plt.ylabel("Score")
plt.show()

# ------------------------------
# 2. Distribution of Semantic Similarity
# ------------------------------
plt.figure(figsize=(8,5))
sns.histplot(semantic_scores, kde=True, bins=20, color="blue", edgecolor="black")
# The dashed line marks the measured mean of the plotted scores.
plt.axvline(avg_semantic, color="red", linestyle="--", label=f"Avg = {avg_semantic:.3f}")
plt.title("Distribution of Semantic Similarity Scores")
plt.xlabel("Semantic Similarity")
plt.ylabel("Frequency")
plt.legend()
plt.show()




In [None]:
import matplotlib.pyplot as plt

# Plain histogram of reference-vs-prediction similarity scores.
sim_fig, sim_ax = plt.subplots()
sim_ax.hist(semantic_scores, bins=20, color="skyblue", edgecolor="black")
sim_ax.set_xlabel("Semantic Similarity Score")
sim_ax.set_ylabel("Frequency")
sim_ax.set_title("Distribution of Semantic Similarity (Reference vs Prediction)")
plt.show()
