In [1]:


from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from difflib import SequenceMatcher

# Test data: each entry pairs a question with the reference answer that the
# chatbot response is scored against.
examples = [
    {
        "question": "What is a stacked trace?",
        "expected_answer": "A stacked trace is a single trace formed by summing or stacking together traces of each CMP gather. It is often used to approximate a zero-offset trace."
    },
    {
        "question": "How do I apply a high-pass filter in Seismic Unix?",
        "expected_answer": "To apply a high-pass filter at 10Hz in Seismic Unix, use: sufilter f=10"
    }
]

# Hard-coded stand-ins for chatbot output. They are matched to `examples` by
# position (evaluate_all zips the two lists together), so keep both lists in
# the same order.
chatbot_responses = [
    "A stacked trace is a single trace formed by summing or stacking together traces of each Common Midpoint (CMP) gather. The stacked trace is often used to approximate a zero-offset trace.",
    "To apply a high-pass filter at 10Hz in Seismic Unix, use: sufilter f=10"
]

# Scorer from the `rouge` package; evaluate_all reads its 'rouge-l' F-score.
rouge = Rouge()

def contains(expected, actual):
    """Return True when `expected` occurs inside `actual`, ignoring case."""
    needle = expected.lower()
    haystack = actual.lower()
    return needle in haystack

def evaluate_all():
    """Score every (example, chatbot response) pair and print a report.

    For each pair this prints the question, the expected and actual answers,
    whether the expected answer occurs in the response (case-insensitive
    substring check), the sentence-level BLEU score and the ROUGE-L F-score.
    A summary with the match rate and the average of both metrics follows.

    Reads the module-level `examples`, `chatbot_responses` and `rouge`.
    Returns None; all results are written to stdout.
    """
    # zip() stops at the shorter list, so base the totals on the pairs that
    # are actually scored rather than on len(examples); otherwise the averages
    # are skewed whenever the two lists differ in length.
    pairs = list(zip(examples, chatbot_responses))
    total = len(pairs)
    matched = 0
    bleu_scores = []
    rouge_scores = []

    print("\n🔍 Evaluation Results\n" + "-"*40)

    for idx, (example, response) in enumerate(pairs):
        expected = example["expected_answer"]
        predicted = response

        match = contains(expected, predicted)
        # Both metrics work on whitespace-separated tokens of the raw strings.
        bleu = sentence_bleu([expected.split()], predicted.split())
        rouge_score = rouge.get_scores(predicted, expected)[0]['rouge-l']['f']

        if match:
            matched += 1

        bleu_scores.append(bleu)
        rouge_scores.append(rouge_score)

        print(f"\n📌 Question {idx+1}: {example['question']}")
        print(f"Expected: {expected}")
        print(f"Bot replied: {predicted}")
        print(f"Contains expected phrase? {'✅' if match else '❌'}")
        print(f"BLEU score: {bleu:.2f}")
        print(f"ROUGE-L score: {rouge_score:.2f}")

    print("\n✅ Summary")
    if total == 0:
        # Nothing was scored; the ratios below would divide by zero.
        print("No examples were evaluated.")
        return
    print(f"Matched answers: {matched}/{total} ({(matched/total)*100:.2f}%)")
    print(f"Average BLEU: {sum(bleu_scores)/total:.2f}")
    print(f"Average ROUGE-L: {sum(rouge_scores)/total:.2f}")

# Entry point: run the evaluation when this code is executed directly.
if __name__ == "__main__":
    evaluate_all()



🔍 Evaluation Results
----------------------------------------

📌 Question 1: What is a stacked trace?
Expected: A stacked trace is a single trace formed by summing or stacking together traces of each CMP gather. It is often used to approximate a zero-offset trace.
Bot replied: A stacked trace is a single trace formed by summing or stacking together traces of each Common Midpoint (CMP) gather. The stacked trace is often used to approximate a zero-offset trace.
Contains expected phrase? ❌
BLEU score: 0.72
ROUGE-L score: 0.87

📌 Question 2: How do I apply a high-pass filter in Seismic Unix?
Expected: To apply a high-pass filter at 10Hz in Seismic Unix, use: sufilter f=10
Bot replied: To apply a high-pass filter at 10Hz in Seismic Unix, use: sufilter f=10
Contains expected phrase? ✅
BLEU score: 1.00
ROUGE-L score: 1.00

✅ Summary
Matched answers: 1/2 (50.00%)
Average BLEU: 0.86
Average ROUGE-L: 0.94


In [None]:
import os
import time
import pickle
import re
from langchain_community.document_loaders import TextLoader, PyPDFLoader, CSVLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from huggingface_hub import InferenceClient
import evaluate

# Hugging Face access token. Read it from the HF_TOKEN environment variable so
# a real credential never has to be saved in the notebook; the "API-KEY"
# placeholder is kept only as the fallback when the variable is unset.
API_KEY = os.environ.get("HF_TOKEN", "API-KEY")

# Inference client for the Mistral instruct model (not referenced by the
# retrieval/evaluation code visible in this cell).
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=API_KEY)
# Embedding endpoint used by build_chroma to index the document chunks.
embeddings = HuggingFaceEndpointEmbeddings(
    model="sentence-transformers/all-MiniLM-L6-v2",
    huggingfacehub_api_token=API_KEY
)

# Metrics from the `evaluate` library, used by evaluate_response.
# NOTE: this rebinds `rouge`, which the first cell bound to a `Rouge()`
# instance; re-run the first cell before calling evaluate_all() again.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

def clean(text):
    """Normalize `text` for metric computation.

    Lowercases the input, removes every character that is neither a word
    character nor whitespace, and trims leading/trailing whitespace.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    return without_punctuation.strip()

def load_documents():
    """Load every supported file in ./dataset and split it into chunks.

    Looks for a `dataset` directory under the current working directory,
    loads each `.txt`, `.pdf` and `.csv` file found there with the matching
    loader, and splits the loaded documents with a
    RecursiveCharacterTextSplitter (chunk_size=250, chunk_overlap=30).
    Files with any other extension are skipped.

    Returns:
        list: the chunks, each with the originating file name stored under
        ``metadata["source_file"]``.

    Raises:
        FileNotFoundError: if the `dataset` directory does not exist.
    """
    # Alternative kept for reference (relies on __file__ being defined):
    # BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    BASE_DIR = os.getcwd()
    DATASET_DIR = os.path.join(BASE_DIR, "dataset")
    documents = []

    if not os.path.exists(DATASET_DIR):
        raise FileNotFoundError("Missing 'dataset' directory")

    splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=30)

    # os.listdir returns entries in arbitrary order; sort them so the chunk
    # order is the same on every run.
    for file in sorted(os.listdir(DATASET_DIR)):
        path = os.path.join(DATASET_DIR, file)
        # Match extensions case-insensitively so e.g. "REPORT.PDF" is not
        # silently skipped.
        lowered_name = file.lower()

        if lowered_name.endswith(".txt"):
            loader = TextLoader(path, encoding="utf-8")
        elif lowered_name.endswith(".pdf"):
            loader = PyPDFLoader(path)
        elif lowered_name.endswith(".csv"):
            loader = CSVLoader(path)
        else:
            continue

        docs = loader.load()
        for doc in splitter.split_documents(docs):
            # Remember which file each chunk came from so answers can cite it.
            doc.metadata["source_file"] = file
            documents.append(doc)

    return documents

def build_chroma(documents):
    """Build a Chroma vector store from the given document chunks.

    Args:
        documents: sequence of objects exposing `page_content` and `metadata`
            (it is iterated twice, so it must not be a one-shot iterator).

    Returns:
        The store produced by ``Chroma.from_texts`` for the chunk texts, with
        each chunk's metadata attached.
    """
    texts = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]
    # from_texts embeds the texts itself through `embedding`; calling
    # embeddings.embed_documents() beforehand only repeated that remote work
    # and threw the result away.
    return Chroma.from_texts(texts, embedding=embeddings, metadatas=metadatas)

def answer_question(query, db):
    """Return the best-matching chunk for `query` and the file it came from.

    Asks `db` for the single most similar entry (k=1). Returns a
    ``(text, source_file)`` tuple: the stripped chunk text and its
    ``"source_file"`` metadata value ("Unknown" when that key is absent), or
    a fixed "not found" message with "N/A" when the search returns nothing.
    """
    hits = db.similarity_search(query, k=1)
    if not hits:
        return "No relevant information found.", "N/A"

    best = hits[0]
    source_file = best.metadata.get("source_file", "Unknown")
    return best.page_content.strip(), source_file

def evaluate_response(question, expected_answer, bot_answer):
    """Score one bot answer against its reference answer.

    `question` is accepted for call-site symmetry but is not used in the
    computation.

    Returns:
        tuple: ``(contains_expected, bleu_score, rouge_score)`` where
        `contains_expected` is a case-insensitive substring check on the raw
        strings, and the two scores are the "bleu" and "rougeL" values
        computed on the `clean()`-normalized strings.
    """
    prediction = clean(bot_answer)
    reference = clean(expected_answer)

    # The containment check deliberately uses the raw (only lowercased) text,
    # not the punctuation-stripped versions fed to the metrics.
    haystack = bot_answer.lower()
    contains_expected = expected_answer.lower() in haystack

    bleu_result = bleu.compute(predictions=[prediction], references=[[reference]])
    rouge_result = rouge.compute(predictions=[prediction], references=[reference])

    return contains_expected, bleu_result["bleu"], rouge_result["rougeL"]

# Load the dataset chunks and index them in a Chroma store. This runs when the
# cell executes; `db` is the store queried by the evaluation loop further down.
print("🔄 Loading documents...")
documents = load_documents()
db = build_chroma(documents)
print("✅ Documents loaded and embedded.")

# Test questions with their reference answers. The evaluation loop reads the
# "question" key to query the store and the "expected" key as the reference.
test_cases = [
    {
        "question": "What is a stacked trace?",
        "expected": "A stacked trace is a single trace formed by summing or stacking together traces of each CMP gather. It is often used to approximate a zero-offset trace."
    },
    {
        "question": "How do I apply a high-pass filter in Seismic Unix?",
        "expected": "To apply a high-pass filter at 10Hz in Seismic Unix, use: sufilter f=10"
    }
]

print("\n🔍 Evaluation Results\n" + "-" * 40)

# Running totals for the summary printed after the loop.
total = len(test_cases)
match_count = 0
total_bleu = 0
total_rouge = 0

for i, case in enumerate(test_cases, 1):
    print(f"\n📌 Question {i}: {case['question']}")
    expected = case["expected"]
    # Retrieve the single closest chunk and the file it was taken from.
    answer, source = answer_question(case["question"], db)
    print(f"Expected: {expected}")
    print(f"Bot replied: {answer}")
    print(f"Source file: {source}")

    # Named `is_match` rather than `contains`: a top-level `contains` here
    # would overwrite the contains() helper defined in the first cell, since
    # all cells share one namespace.
    is_match, bleu_score, rouge_score = evaluate_response(case["question"], expected, answer)

    print(f"Contains expected phrase? {'✅' if is_match else '❌'}")
    print(f"BLEU score: {bleu_score:.2f}")
    print(f"ROUGE-L score: {rouge_score:.2f}")
    print(f"📎 This answer was found in the document: {source}")

    # `is_match` is a bool, so adding it counts the matching answers.
    match_count += is_match
    total_bleu += bleu_score
    total_rouge += rouge_score

# Summary
print("\n✅ Summary")
print(f"Matched answers: {match_count}/{total} ({(match_count/total)*100:.2f}%)")
print(f"Average BLEU: {total_bleu/total:.2f}")
print(f"Average ROUGE-L: {total_rouge/total:.2f}")

🔄 Loading documents...
✅ Documents loaded and embedded.

🔍 Evaluation Results
----------------------------------------

📌 Question 1: What is a stacked trace?
Expected: A stacked trace is a single trace formed by summing or stacking together traces of each CMP gather. It is often used to approximate a zero-offset trace.
Bot replied: used to approximate a zero-offset trace, which can be acquired by placing a shot and a
receiver at the same position. The stacked trace has good signal content because the stack-
Source file: processing_data.pdf
Contains expected phrase? ❌
BLEU score: 0.18
ROUGE-L score: 0.21
📎 This answer was found in the document: processing_data.pdf

📌 Question 2: How do I apply a high-pass filter in Seismic Unix?
Expected: To apply a high-pass filter at 10Hz in Seismic Unix, use: sufilter f=10
Bot replied: seismic, geology, & well data
c. Data Enhancement
Input: CDP, statics, & velocity files
1.  NMO & statics corrections
2.  CDP stack
3.  Earth absorption compensation
