In [None]:
# Load dataset
import pandas as pd

# Read the plant-care table; the file is decoded as latin1.
plants_csv_path = "/content/plants.csv"
df = pd.read_csv(plants_csv_path, encoding="latin1")

# Preview the first rows as the cell's displayed result.
df.head()

Unnamed: 0,Plant Name,Growth,Soil,Sunlight,Watering,Fertilization Type
0,Aloe Vera,slow,sandy,indirect sunlight,Water weekly,Balanced
1,Basil,fast,well-drained,full sunlight,Keep soil evenly moist,Organic
2,Snake Plant,slow,well-drained,indirect sunlight,Water when soil is dry,No
3,Lavender,moderate,sandy,full sunlight,Let soil dry between watering,No
4,Cactus,slow,sandy,full sunlight,Let soil dry between watering,Low-nitrogen


In [None]:
!pip install -q google-genai rouge-score bert-score nltk

import getpass
import json
import os

import nltk
import pandas as pd
from bert_score import score as bert_score
from google import genai
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge_score import rouge_scorer

# Download the NLTK resources this cell needs (tokenizer data for
# nltk.word_tokenize, which is called when computing BLEU).
for resource_name in ('punkt', 'punkt_tab'):
    nltk.download(resource_name)

# Read the plant table (latin1-encoded CSV) from the working directory.
df = pd.read_csv("plants.csv", encoding="latin1")

# The first record serves as the single worked example.
row = df.iloc[0]

# (label shown to the model, CSV column holding the value)
attribute_fields = [
    ("Plant Name", "Plant Name"),
    ("Growth", "Growth"),
    ("Soil", "Soil"),
    ("Sunlight", "Sunlight"),
    ("Watering", "Watering"),
    ("Fertilization", "Fertilization Type"),
]

# One "Label: value" line per attribute; the block starts and ends with a
# newline so it embeds cleanly inside the prompt.
plant_text = "\n" + "".join(
    f"{label}: {row[column]}\n" for label, column in attribute_fields
)

# Reference ("ground truth") summary built from the same attributes, one
# sentence per line with no leading or trailing whitespace.
reference_summary = "\n".join([
    f"The {row['Plant Name']} grows at a {row['Growth']} rate.",
    f"It prefers {row['Soil']} soil and requires {row['Sunlight']}.",
    f"Watering should follow this guideline: {row['Watering']}.",
    f"Fertilizer recommendation: {row['Fertilization Type']}.",
])

# Gemini client
# The API key is taken from the GEMINI_API_KEY environment variable, falling
# back to a hidden interactive prompt, so no credential is stored in the
# notebook or its outputs.
# NOTE(review): a literal key used to be committed on this line; treat that
# key as compromised and revoke/rotate it.
client = genai.Client(
    api_key=os.environ.get("GEMINI_API_KEY") or getpass.getpass("Gemini API key: ")
)

# JSON skeleton the model is asked to fill in and return verbatim.
response_schema = '{\n  "plant_name": "",\n  "care_instructions": ""\n}'

# Instruction prompt: role, constraint, required output shape, then the
# attribute listing for the selected plant.
prompt = (
    "\n"
    "You are a plant-care assistant.\n"
    "Generate care instructions using ONLY the attributes provided.\n"
    "\n"
    "Return ONLY this JSON structure:\n"
    "\n"
    f"{response_schema}\n"
    "\n"
    "Plant Data:\n"
    f"{plant_text}\n"
)

# Single generation request to the Gemini model.
response = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)

# Extract the care instructions from the model reply.
# `response.text` may be None when the reply carries no text part, so fall
# back to an empty string; that then fails JSON parsing below with a clear
# message instead of raising AttributeError on `.replace`.
raw = response.text or ""

# The model sometimes wraps its JSON in Markdown code fences; drop them.
raw = raw.replace("```json", "").replace("```", "").strip()

try:
    result = json.loads(raw)
    model_summary = result["care_instructions"]
except (json.JSONDecodeError, KeyError, TypeError):
    # JSONDecodeError: reply is not valid JSON.
    # KeyError / TypeError: valid JSON, but not an object with the expected key.
    # Show the offending payload, then re-raise so the cell still fails.
    print("Model returned invalid JSON:\n", raw)
    raise

# Show the generated text and the reference side by side, each framed by the
# same banner layout.
for heading, body in (
    ("Model Output", model_summary),
    ("Ground Truth", reference_summary),
):
    print(f"\n================ {heading} ================")
    print(body)
    print("====================================================")

# BLEU score (n-gram precision, up to 4-grams by default), computed on
# lower-cased word tokens.
reference_tokens = nltk.word_tokenize(reference_summary.lower())
candidate_tokens = nltk.word_tokenize(model_summary.lower())

# Without smoothing, a single n-gram order with zero overlap (typically
# 4-grams on short paraphrased text) forces the whole score to 0 and NLTK
# emits a warning. Smoothing keeps the lower-order matches visible.
bleu = sentence_bleu(
    [reference_tokens],
    candidate_tokens,
    smoothing_function=SmoothingFunction().method1,
)

# ROUGE scores: unigram, bigram and longest-common-subsequence variants,
# with stemming enabled.
rouge_variants = ["rouge1", "rouge2", "rougeL"]
rouge = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)
# Argument order is (reference, candidate).
rouge_scores = rouge.score(reference_summary, model_summary)

# BERTScore (semantic): one candidate/reference pair, English model,
# scores rescaled with the baseline.
bert_precision_recall_f1 = bert_score(
    cands=[model_summary],
    refs=[reference_summary],
    lang="en",
    rescale_with_baseline=True,
)
P, R, F1 = bert_precision_recall_f1

# Report every metric in one framed block.
print("\n================ Evaluation Results ================")
print(f"BLEU Score: {bleu:.4f}")

# ROUGE: show the F-measure of each variant.
print("\nROUGE Scores:")
for rouge_name, rouge_result in rouge_scores.items():
    print(f"  {rouge_name}: {rouge_result.fmeasure:.4f}")

# BERTScore: each result holds one value per scored pair; only one pair was
# scored, so index 0 is printed.
print("\nBERTScore:")
for padded_label, bert_values in (
    ("Precision: ", P),
    ("Recall:    ", R),
    ("F1 Score:  ", F1),
):
    print(f"  {padded_label}{bert_values[0]:.4f}")
print("====================================================")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



--- Model Output ---
Aloe Vera has slow growth. It thrives in sandy soil and prefers indirect sunlight. Water weekly and use a balanced fertilizer.

--- Ground Truth ---
The Aloe Vera grows at a slow rate. 
It prefers sandy soil and requires indirect sunlight. 
Watering should follow this guideline: Water weekly. 
Fertilizer recommendation: Balanced.


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BLEU Score: 0.0000

ROUGE Scores:
  rouge1: 0.6383
  rouge2: 0.2667
  rougeL: 0.5106

BERTScore:
  Precision: 0.5844
  Recall:    0.2809
  F1 Score:  0.4293


**GenAI Evaluation Methods:**

BLEU Score (Bilingual Evaluation Understudy)
- Measures similarity between generated and reference texts using n-gram comparisons.

ROUGE Score (Recall-Oriented Understudy for Gisting Evaluation)
- Measures how much content from the reference text appears in the generated text.

BERTScore
- Measures semantic similarity.

BLEU/ROUGE are designed for text that closely matches reference wording, but this model generates free-form text.
- BLEU may be low even if the content is correct.
- ROUGE may miss paraphrases that preserve the meaning.

The results of the example above:
- BLEU Score = 0

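BLEU heavily penalizes mismatched 4-grams: with zero 4-gram overlap and no smoothing, the score collapses to 0 regardless of how many lower-order n-grams match.
- ROUGE Scores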

ROUGE-1: 0.64 (unigram-overlap F-measure ≈ 0.64; the model captured most of the important unigrams).

ROUGE-2: 0.27 (bigram-overlap F-measure ≈ 0.27; far fewer of the important bigrams were captured).

ROUGE-L: 0.51 (Maintained a moderate amount of structural similarity).

- BERTScore

Precision: 0.58

Recall: 0.28

F1 Score: 0.43

An F1 above 0.40 suggests that the LLM's output matches the meaning of the reference moderately well.
