In [1]:
import os
import uuid
import requests
import json
import streamlit as st
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from dotenv import load_dotenv

# Load environment variables via python-dotenv before any of them are read below.
load_dotenv()

# --- Configuration -----------------------------------------------------------
GROQ_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")  # None when the variable is unset
collection_name = "battery_chunks"

# --- Shared clients ----------------------------------------------------------
# Qdrant instance on localhost:6333 and the sentence-embedding model used for queries.
client = QdrantClient(host="localhost", port=6333)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# --- True/false evaluation set -----------------------------------------------
with open("evaluation_questions_tf.json", encoding="utf-8") as f:
    tf_dataset = json.loads(f.read())

def retrieve_context(question: str, top_k=20) -> tuple:
    """Embed ``question`` and fetch the closest chunks from the Qdrant collection.

    Args:
        question: Query text, embedded with the module-level ``model``.
        top_k: Maximum number of points requested from ``client.query_points``.

    Returns:
        A ``(context, points)`` tuple. ``context`` is every non-empty ``"text"``
        payload value joined by blank lines; ``points`` is ``response.points``
        exactly as returned by the client.
    """
    query_vector = model.encode(question)
    response = client.query_points(
        collection_name=collection_name,
        query=query_vector,
        limit=top_k,
    )
    points = response.points

    texts = []
    for point in points:
        # Guard against a point whose payload is None rather than a dict,
        # and look the text up only once.
        text = (point.payload or {}).get("text")
        if text:
            texts.append(text)
    return "\n\n".join(texts), points

def load_tf_prompt_template(path="evaluation_prompt_tf.txt") -> str:
    """Read the true/false prompt template from disk.

    Args:
        path: Location of the template file; it is read as UTF-8.

    Returns:
        The full file contents as a string.

    Raises:
        FileNotFoundError: If no file exists at ``path``.
        RuntimeError: For any other error raised while opening or reading the file.
    """
    try:
        with open(path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError as e:
        # Chain explicitly so the underlying OS error stays attached as the cause.
        raise FileNotFoundError(f"Prompt template file not found at: {path}") from e
    except Exception as e:
        raise RuntimeError(f"Failed to load prompt template: {e}") from e

def generate_true_false_prompt(context: str, query: str) -> str:
    """Fill the true/false prompt template with retrieved context and a statement.

    The template is loaded with ``load_tf_prompt_template()`` on every call and
    formatted with the ``context`` and ``query`` fields.

    Args:
        context: Retrieved passage text substituted for ``{context}``.
        query: Statement to judge, substituted for ``{query}``.

    Returns:
        The formatted prompt.

    Raises:
        KeyError: If the template uses a named placeholder other than
            ``context`` or ``query``.
    """
    template = load_tf_prompt_template()
    try:
        return template.format(context=context, query=query)
    except KeyError as e:
        # Keep the original lookup failure attached as the explicit cause.
        raise KeyError(f"Missing placeholder in prompt template: {e}") from e

def ask_groq_llm(prompt, model=GROQ_MODEL, key=GROQ_API_KEY, timeout=60):
    """Send ``prompt`` to the Groq chat-completions endpoint and return the reply.

    Args:
        prompt: Content of the user message.
        model: Model identifier placed in the request body; defaults to ``GROQ_MODEL``.
        key: API key sent as a Bearer token; defaults to ``GROQ_API_KEY``.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.

    Returns:
        The first choice's message content, stripped and lower-cased. Note that
        the lower-casing applies to free-text answers as well as verdicts.

    Raises:
        ValueError: If ``key`` is empty or ``None``.
        requests.HTTPError: If the API responds with an error status.
    """
    if not key:
        # Fail fast with a clear message instead of sending "Bearer None".
        raise ValueError(
            "Groq API key is missing; set GROQ_API_KEY in the environment or pass key=..."
        )

    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a technical expert."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.0
    }
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the evaluation loop indefinitely.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"].strip().lower()

# Evaluation state for the true/false run
import time

# Number of true/false questions to score
total = len(tf_dataset)

# Running count of exact matches, plus the confusion-matrix tallies
correct = 0
TP, FP, FN, TN = 0, 0, 0, 0

def normalize_result(result):
    """Reduce a raw LLM verdict to a bare lower-case token such as ``"true"``.

    Markdown bold markers (``**``) are removed first and whitespace is trimmed
    afterwards, so ``"** True **"`` no longer keeps its inner spaces. A trailing
    period is dropped as well, so ``"True."`` compares equal to ``"true"``.

    Args:
        result: Raw text returned by the model.

    Returns:
        The cleaned, lower-cased string.
    """
    cleaned = result.replace("**", "").strip().lower()
    return cleaned.rstrip(".").strip()

In [4]:
# Reset the tallies here so re-running this cell starts from zero instead of
# accumulating onto the counts left by a previous run.
TP = FP = FN = TN = correct = 0
total = len(tf_dataset)

for i, item in enumerate(tf_dataset, 1):
    # Pause between requests (presumably to stay under the API rate limit).
    time.sleep(10)
    query = item["question"]
    expected = "true" if item["answer"] else "false"

    # Retrieve supporting passages, build the prompt, and ask the model.
    context, _ = retrieve_context(query)
    prompt = generate_true_false_prompt(context, query)
    result = ask_groq_llm(prompt)

    result_normalized = normalize_result(result)
    is_correct = (result_normalized == expected)
    correct += is_correct

    # Update confusion matrix. Any reply other than exactly "true" is counted
    # as a negative prediction.
    if expected == "true":
        if result_normalized == "true":
            TP += 1
        else:
            FN += 1
    else:
        if result_normalized == "true":
            FP += 1
        else:
            TN += 1

    print(f"{i}. Q: {query}")
    print(f"   Expected: {expected}, Got: {result_normalized}, Correct: {is_correct}\n")

1. Q: Michael Faraday proposed in 1834 that every ampere flowing through an electrochemical cell corresponds to a matching chemical change.
   Expected: true, Got: true, Correct: True

2. Q: The actual energy generated by a battery is calculated by multiplying amperes, time, and average voltage.
   Expected: true, Got: true, Correct: True

3. Q: One chemical equivalent of zinc weighs approximately 87 grams.
   Expected: false, Got: false, Correct: True

4. Q: One faraday of charge is equal to 96,485 coulombs.
   Expected: true, Got: true, Correct: True

5. Q: A zinc and manganese dioxide cell operating at 1.2 volts produces approximately 32.2 watt-hours of energy per chemical equivalent discharged.
   Expected: true, Got: true, Correct: True

6. Q: Zhang et al. developed nanocomposite paper supercapacitors using multi-walled carbon nanotubes embedded in micro-fibrillated cellulose.
   Expected: true, Got: true, Correct: True

7. Q: The paper-based supercapacitor created by Zhang et al.

In [5]:
# Final metrics
# Every ratio falls back to 0 when its denominator is empty, so an empty
# dataset reports zeros instead of raising ZeroDivisionError.
accuracy = correct / total * 100 if total > 0 else 0
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# Report
print(f"\nEvaluation Complete: Accuracy = {accuracy:.2f}% on {total} questions\n")
print(f"Precision: {precision:.2f}")
print(f"Recall:    {recall:.2f}")
print(f"F1 Score:  {f1:.2f}")
print(f"TP: {TP}, FP: {FP}, TN: {TN}, FN: {FN}")


Evaluation Complete: Accuracy = 89.29% on 28 questions

Precision: 1.00
Recall:    0.76
F1 Score:  0.86
TP: 16, FP: 0, TN: 9, FN: 5


In [3]:
# Load the open-ended question/answer pairs used for the subjective evaluation.
with open("subjective_questions.json", encoding="utf-8") as f:
    subj_dataset = json.loads(f.read())

# Show the loaded records.
print(subj_dataset)

[{'question': 'What makes graphene a suitable material for use in supercapacitors?', 'answer': 'Graphene is highly conductive, extremely thin, and flexible, allowing it to maximize electrode surface area for efficient energy storage.'}, {'question': 'How do capacitors differ from batteries in storing energy?', 'answer': 'Capacitors store energy on the surface of electrodes without chemical reactions, enabling much faster charging compared to batteries.'}, {'question': 'Why is degradation per 100 cycles important in battery evaluation?', 'answer': "It indicates how quickly a battery's performance declines, helping assess its long-term stability and efficiency in repeated usage."}, {'question': 'What is the role of the Faraday constant in electrochemical cells?', 'answer': 'It relates the amount of electric charge per mole of electrons transferred during electrochemical reactions, essential for energy calculations.'}, {'question': 'Why are liquid metal batteries considered highly durable

In [18]:
def load_subj_prompt_template(path="evaluation_prompt_subj.txt") -> str:
    """Read the subjective-question prompt template from disk.

    Args:
        path: Location of the template file; it is read as UTF-8.

    Returns:
        The full file contents as a string.

    Raises:
        FileNotFoundError: If no file exists at ``path``.
        RuntimeError: For any other error raised while opening or reading the file.
    """
    try:
        with open(path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError as e:
        # Chain explicitly so the underlying OS error stays attached as the cause.
        raise FileNotFoundError(f"Prompt template file not found at: {path}") from e
    except Exception as e:
        raise RuntimeError(f"Failed to load prompt template: {e}") from e
    
def generate_subj_prompt(context: str, query: str) -> str:
    """Fill the subjective-question prompt template with context and a question.

    The template is loaded with ``load_subj_prompt_template()`` on every call
    and formatted with the ``context`` and ``query`` fields.

    Args:
        context: Retrieved passage text substituted for ``{context}``.
        query: Question substituted for ``{query}``.

    Returns:
        The formatted prompt.

    Raises:
        KeyError: If the template uses a named placeholder other than
            ``context`` or ``query``.
    """
    template = load_subj_prompt_template()
    try:
        return template.format(context=context, query=query)
    except KeyError as e:
        # Keep the original lookup failure attached as the explicit cause.
        raise KeyError(f"Missing placeholder in prompt template: {e}") from e

def load_subj_judge_template(path="judge_prompt.txt") -> str:
    """Read the LLM-judge prompt template from disk.

    Args:
        path: Location of the template file; it is read as UTF-8.

    Returns:
        The full file contents as a string.

    Raises:
        FileNotFoundError: If no file exists at ``path``.
        RuntimeError: For any other error raised while opening or reading the file.
    """
    try:
        with open(path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError as e:
        # Chain explicitly so the underlying OS error stays attached as the cause.
        raise FileNotFoundError(f"Prompt template file not found at: {path}") from e
    except Exception as e:
        raise RuntimeError(f"Failed to load prompt template: {e}") from e
    
def generate_judge_prompt(question: str, reference_answer: str, candidate_answer: str) -> str:
    """Fill the judge prompt template for grading one candidate answer.

    The template is loaded with ``load_subj_judge_template()`` on every call and
    formatted with the ``question``, ``reference_answer`` and
    ``candidate_answer`` fields.

    Args:
        question: The question that was asked.
        reference_answer: The expected answer from the dataset.
        candidate_answer: The answer to be graded.

    Returns:
        The formatted prompt.

    Raises:
        KeyError: If the template uses a named placeholder other than the
            three fields above.
    """
    template = load_subj_judge_template()
    try:
        return template.format(
            question=question,
            reference_answer=reference_answer,
            candidate_answer=candidate_answer,
        )
    except KeyError as e:
        # Keep the original lookup failure attached as the explicit cause.
        raise KeyError(f"Missing placeholder in prompt template: {e}") from e



In [14]:
# Answer each subjective question through retrieval + LLM and store the reply
# on the dataset record under "llm_answer".
for item in subj_dataset:
    time.sleep(10)
    query, expected_answer = item["question"], item["answer"]

    context = retrieve_context(query)[0]
    result = ask_groq_llm(generate_subj_prompt(context, query))

    item["llm_answer"] = result
    print(query, expected_answer, result)

What makes graphene a suitable material for use in supercapacitors? Graphene is highly conductive, extremely thin, and flexible, allowing it to maximize electrode surface area for efficient energy storage. graphene's high surface area, electrical conductivity, and thermal management properties make it suitable for supercapacitors, enabling rapid charge/discharge and high energy density.
How do capacitors differ from batteries in storing energy? Capacitors store energy on the surface of electrodes without chemical reactions, enabling much faster charging compared to batteries. capacitors store energy on electrode surfaces, not through chemical reactions like batteries, allowing for rapid charging but lower energy density, typically around 1/3 watt-hour per kilogram.
Why is degradation per 100 cycles important in battery evaluation? It indicates how quickly a battery's performance declines, helping assess its long-term stability and efficiency in repeated usage. degradation per 100 cycle

In [15]:
print(subj_dataset)

[{'question': 'What makes graphene a suitable material for use in supercapacitors?', 'answer': 'Graphene is highly conductive, extremely thin, and flexible, allowing it to maximize electrode surface area for efficient energy storage.', 'llm_answer': "graphene's high surface area, electrical conductivity, and thermal management properties make it suitable for supercapacitors, enabling rapid charge/discharge and high energy density."}, {'question': 'How do capacitors differ from batteries in storing energy?', 'answer': 'Capacitors store energy on the surface of electrodes without chemical reactions, enabling much faster charging compared to batteries.', 'llm_answer': 'capacitors store energy on electrode surfaces, not through chemical reactions like batteries, allowing for rapid charging but lower energy density, typically around 1/3 watt-hour per kilogram.'}, {'question': 'Why is degradation per 100 cycles important in battery evaluation?', 'answer': "It indicates how quickly a battery'

In [20]:
# Ask the LLM judge to grade every stored answer against its reference answer;
# the raw replies are collected as strings in `scores`.
scores = []
for item in subj_dataset:
    time.sleep(5)
    prompt = generate_judge_prompt(
        question=item["question"],
        reference_answer=item["answer"],
        candidate_answer=item["llm_answer"],
    )
    result = ask_groq_llm(prompt)
    print(result)
    scores.append(result)

5
5
5
3
5
5
5
5
5
5
4
4
5
5
4
4
4
5
4
5


In [25]:
# The original 0.2 multiplier equals 1/5, i.e. scores are normalised against a
# maximum of 5 (presumably the scale requested in the judge prompt; confirm).
MAX_JUDGE_SCORE = 5

# Convert the judge's replies to integers, updating the existing list in place.
# If any reply is not a plain integer, int() raises and `scores` is left untouched.
scores[:] = [int(score) for score in scores]

if scores:
    # Divide once by the maximum attainable total; this avoids the
    # floating-point noise of multiplying by 0.2 first.
    subjective_accuracy = sum(scores) / (MAX_JUDGE_SCORE * len(scores))
    print("Subjective accuracy as judged by the LLM:", subjective_accuracy)
else:
    # Nothing was graded; report that rather than dividing by zero.
    print("Subjective accuracy as judged by the LLM: n/a (no scores)")

Subjective accuracy as judged by the LLM: 0.9200000000000002
