In [1]:
import string

def normalize(text: str) -> str:
    """Return ``text`` lowercased, without ASCII punctuation, whitespace collapsed.

    Every character in ``string.punctuation`` is deleted outright (it is not
    replaced by a space, so ``"well-known"`` becomes ``"wellknown"``). Runs of
    whitespace are then squeezed into single spaces, and leading/trailing
    whitespace disappears.
    """
    # Mapping a character to None in a translation table deletes it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    lowered = text.lower()
    tokens = lowered.translate(drop_punctuation).split()
    return " ".join(tokens)

def presence_match(test_pairs):
    """Percentage of pairs whose ground-truth words all occur in the prediction.

    Args:
        test_pairs: Iterable of ``(ground_truth, prediction)`` string pairs.
            Any iterable is accepted; it is materialized once.

    Returns:
        ``matches * 100 / N`` where ``N`` is the number of pairs, or ``0.0``
        when there are no pairs. A pair matches when the set of normalized
        ground-truth words is a subset of the normalized prediction words.
    """
    pairs = list(test_pairs)
    if not pairs:
        return 0.0

    matches = 0
    for gt, pred in pairs:
        gt_words = set(normalize(gt).split())
        if not gt_words:
            # A ground truth that normalizes to nothing (e.g. only punctuation)
            # yields an empty set, which is a subset of *every* set. Counting
            # it would credit a match although nothing was checked, so it is
            # scored as a non-match while still counting towards N.
            continue
        pred_words = set(normalize(pred).split())
        if gt_words.issubset(pred_words):
            matches += 1
    return matches * 100 / len(pairs)

def extract_pairs_from_markdown(content: str):
    """Collect ``(ground_truth, response)`` pairs from markdown text.

    Recognized lines (after stripping surrounding whitespace):

    * ``**GraphRAG Response:** <text>`` starts a response. Following non-blank
      lines that do not start with ``**`` and are not ``---`` are appended to
      it, separated by single spaces.
    * ``**Ground Truth:** <text>`` sets the ground truth (header line only).
    * ``---`` closes the current record if it is complete.

    A pair is emitted only when the *next* header, a ``---`` line, or the end
    of the input is reached. Emitting as soon as both fields are non-empty
    would cut a multi-line response down to its first line whenever the
    ground truth comes before the response.

    Args:
        content: Full markdown text.

    Returns:
        List of ``(ground_truth, response)`` tuples in document order.
    """
    pred_marker = "**GraphRAG Response:**"
    gt_marker = "**Ground Truth:**"

    pairs = []
    gt, pred = None, None
    collecting_pred = False  # True while continuation lines belong to pred

    def emit_if_complete():
        """Append the current record and reset state once both fields are set."""
        nonlocal gt, pred, collecting_pred
        if gt and pred:
            pairs.append((gt, pred))
            gt, pred = None, None
            collecting_pred = False

    for raw_line in content.splitlines():
        line = raw_line.strip()

        if line.startswith(pred_marker):
            emit_if_complete()
            # Slice off the prefix only; str.replace would also delete the
            # marker if it happened to reappear inside the answer text.
            pred = line[len(pred_marker):].strip()
            collecting_pred = True
        elif line.startswith(gt_marker):
            emit_if_complete()
            gt = line[len(gt_marker):].strip()
            collecting_pred = False
        elif line == "---":
            emit_if_complete()
        elif collecting_pred and line and not line.startswith("**"):
            # Continuation of a multi-line response; blank lines are skipped
            # so they do not add stray spaces.
            pred = f"{pred} {line}" if pred else line

    # The last record has no following header or separator to trigger it.
    emit_if_complete()
    return pairs


if __name__ == "__main__":
    # NOTE: absolute, machine-specific path; adjust it when running elsewhere.
    md_file_path = r"C:\Users\DAIICT D\Desktop\EXTRINSIC\WIKI\graphRag\global\musiq_global.md"
    with open(md_file_path, "r", encoding="utf-8") as f:
        content = f.read()

    test_pairs = extract_pairs_from_markdown(content)

    presence_score = presence_match(test_pairs)
    if not test_pairs:
        # Without this guard an unparsable file prints "0.0000 %", which looks
        # exactly like a genuine zero score.
        print("⚠️ No Ground Truth / GraphRAG Response pairs found. Check formatting.")
    else:
        print(f"Presence Match Score: {presence_score:.4f} %")


Presence Match Score: 16.6000 %


In [1]:
import string

def normalize(text: str) -> str:
    """Lowercase ``text``, drop ASCII punctuation, and collapse whitespace.

    Characters found in ``string.punctuation`` are removed (not replaced by a
    space); the remaining text is re-joined with single spaces and carries no
    leading or trailing whitespace.
    """
    kept_chars = [ch for ch in text.lower() if ch not in string.punctuation]
    words = "".join(kept_chars).split()
    return " ".join(words)

def exact_match(test_pairs):
    """Percentage of pairs whose normalized ground truth equals the normalized prediction.

    Args:
        test_pairs: Sized collection of ``(ground_truth, prediction)`` strings.

    Returns:
        ``matches * 100 / N`` for ``N = len(test_pairs)``, or ``0.0`` when
        there are no pairs.
    """
    total = len(test_pairs)
    hits = sum(1 for truth, answer in test_pairs if normalize(truth) == normalize(answer))
    if total > 0:
        return hits * 100 / total
    return 0.0

def extract_pairs_from_markdown(
    content: str,
    response_markers=("**GraphRAG Response:**",),
    ground_truth_marker: str = "**Ground Truth:**",
):
    """Collect ``(ground_truth, response)`` pairs from markdown text.

    Only the text on the header line itself is captured; lines following a
    header are ignored. A pair is emitted as soon as both a non-empty ground
    truth and a non-empty response have been seen, in either order.

    Args:
        content: Full markdown text.
        response_markers: Line prefix(es) that introduce a response. A single
            string or an iterable of strings. Defaults to the GraphRAG header
            only; pass e.g. ``("**Retrieved Answer:**", "**GraphRAG Response:**")``
            for files that use the other header.
        ground_truth_marker: Line prefix that introduces the ground truth.

    Returns:
        List of ``(ground_truth, response)`` tuples in document order.
    """
    if isinstance(response_markers, str):
        response_markers = (response_markers,)
    else:
        response_markers = tuple(response_markers)

    pairs = []
    gt, pred = None, None
    for raw_line in content.splitlines():
        line = raw_line.strip()

        matched = next((m for m in response_markers if line.startswith(m)), None)
        if matched is not None:
            # Remove the prefix only; str.replace would also delete the marker
            # if it reappeared inside the answer text.
            pred = line[len(matched):].strip()
        elif line.startswith(ground_truth_marker):
            gt = line[len(ground_truth_marker):].strip()

        if gt and pred:
            pairs.append((gt, pred))
            gt, pred = None, None
    return pairs


if __name__ == "__main__":
    # NOTE: absolute, machine-specific path; adjust it when running elsewhere.
    md_file_path = r"C:\Users\DAIICT D\Desktop\EXTRINSIC\hotpot\graphRAG\LOCAL\hotpot_local.md"
    with open(md_file_path, "r", encoding="utf-8") as f:
        content = f.read()

    test_pairs = extract_pairs_from_markdown(content)

    em_score = exact_match(test_pairs)
    if not test_pairs:
        # Without this guard an unparsable file prints "0.0000 %", which looks
        # exactly like a genuine zero score.
        print("⚠️ No Ground Truth / GraphRAG Response pairs found. Check formatting.")
    else:
        print(f"Exact Match Score: {em_score:.4f} %")

Exact Match Score: 0.0000 %


In [2]:
from typing import List
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import re


# ----- Parse markdown input file for answer pairs -----
def _parse_answer_pairs(content: str):
    """Extract ``(ground_truth, retrieved_answer)`` pairs from markdown text."""
    response_markers = ("**Retrieved Answer:**", "**GraphRAG Response:**")
    gt_marker = "**Ground Truth:**"

    pairs = []
    fields = {"gt": None, "rt": None}
    collecting = None  # key in ``fields`` that continuation lines extend

    def emit_if_complete():
        """Append the current record and reset state once both fields are set."""
        nonlocal collecting
        if fields["gt"] and fields["rt"]:
            pairs.append((fields["gt"].strip(), fields["rt"].strip()))
            fields["gt"] = fields["rt"] = None
            collecting = None

    for raw_line in content.splitlines():
        line = raw_line.strip()

        marker = next((m for m in response_markers if line.startswith(m)), None)
        if marker is not None:
            # A new header closes the previous record, if it was complete.
            emit_if_complete()
            fields["rt"] = line[len(marker):].strip()
            collecting = "rt"
        elif line.startswith(gt_marker):
            emit_if_complete()
            fields["gt"] = line[len(gt_marker):].strip()
            collecting = "gt"
        elif line == "---":
            emit_if_complete()
        elif collecting and line and not line.startswith("**"):
            # Continuation of a multi-line ground truth / answer.
            fields[collecting] = f"{fields[collecting]} {line}".strip()

    # The last record has no following header or separator to trigger it.
    emit_if_complete()
    return pairs


def load_test_pairs_from_markdown(md_file_path: str):
    """Read a markdown results file and return its answer pairs.

    Recognized lines (after stripping surrounding whitespace):

    * ``**Retrieved Answer:** <text>`` or ``**GraphRAG Response:** <text>``
      starts the retrieved answer.
    * ``**Ground Truth:** <text>`` starts the ground truth.
    * Following non-blank lines that do not start with ``**`` and are not
      ``---`` extend whichever field was started last.

    A pair is emitted when the next header, a ``---`` line, or the end of the
    file is reached. Emitting as soon as both fields are non-empty would stop
    the second field of every record after its first line.

    Args:
        md_file_path: Path to a UTF-8 encoded markdown file.

    Returns:
        List of ``(ground_truth, retrieved_answer)`` tuples in document order,
        each string stripped of surrounding whitespace.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(md_file_path, "r", encoding="utf-8") as f:
        return _parse_answer_pairs(f.read())


# ----- Embedding model -----
# Module-level SentenceTransformer instance; embed_sentences() below calls
# model.encode() on it, so this statement must run before any embedding call.
model = SentenceTransformer("all-MiniLM-L6-v2")  # lightweight model


def embed_sentences(sentences: List[str]) -> np.ndarray:
    """Encode ``sentences`` with the module-level ``model``.

    Args:
        sentences: Texts to embed.

    Returns:
        Whatever ``model.encode`` returns when asked for NumPy output, with the
        progress bar switched off.
    """
    encode_options = {"convert_to_numpy": True, "show_progress_bar": False}
    return model.encode(sentences, **encode_options)


# ----- Retrieval Similarity function -----
def retrieval_similarity(retrieved: List[str], gold: List[str]) -> float:
    """Mean cosine similarity between position-aligned retrieved and gold texts.

    Both lists are embedded with ``embed_sentences``; the i-th retrieved text
    is compared with the i-th gold text only. Each pair's score is printed
    (1-based index, four decimals) before the mean is returned.

    Args:
        retrieved: Retrieved / generated answers.
        gold: Ground-truth answers, same length as ``retrieved``.

    Returns:
        Arithmetic mean of the per-pair cosine similarities.

    Raises:
        AssertionError: If the two lists differ in length.
    """
    assert len(retrieved) == len(gold), "Input lists must be of same length"

    retrieved_vectors = embed_sentences(retrieved)
    gold_vectors = embed_sentences(gold)

    pair_scores = []
    for position in range(len(retrieved)):
        # cosine_similarity expects 2-D inputs, hence the (1, dim) reshape.
        retrieved_row = retrieved_vectors[position].reshape(1, -1)
        gold_row = gold_vectors[position].reshape(1, -1)
        pair_scores.append(cosine_similarity(retrieved_row, gold_row).item())

    # Debug output: one line per pair.
    for pair_no, value in enumerate(pair_scores, start=1):
        print(f"Pair {pair_no} similarity: {value:.4f}")

    return float(np.mean(pair_scores))


# ----- MAIN -----
if __name__ == "__main__":
    md_file_path = r"C:\Users\DAIICT D\Desktop\EXTRINSIC\hotpot\graphRAG\LOCAL\hotpot_local.md"

    test_pairs = load_test_pairs_from_markdown(md_file_path)

    if test_pairs:
        # Each pair is (ground_truth, retrieved_answer); split into two
        # position-aligned lists for the similarity function.
        gold_texts = [pair[0] for pair in test_pairs]
        retrieved_texts = [pair[1] for pair in test_pairs]

        # Report the mean similarity as a percentage.
        rs_score = 100.0 * retrieval_similarity(retrieved_texts, gold_texts)
        print(f"\nAverage Retrieval Similarity (R-S): {rs_score:.4f} %")
    else:
        print("⚠️ No Ground Truth / Retrieved Answer pairs found. Check formatting.")


  from .autonotebook import tqdm as notebook_tqdm


Pair 1 similarity: 0.1262
Pair 2 similarity: 0.1269
Pair 3 similarity: 0.1405
Pair 4 similarity: 0.4030
Pair 5 similarity: 0.5020
Pair 6 similarity: 0.6230
Pair 7 similarity: 0.1660
Pair 8 similarity: 0.1981
Pair 9 similarity: 0.5911
Pair 10 similarity: -0.0394
Pair 11 similarity: 0.4209
Pair 12 similarity: 0.1830
Pair 13 similarity: 0.4762
Pair 14 similarity: 0.1985
Pair 15 similarity: 0.6731
Pair 16 similarity: 0.5536
Pair 17 similarity: -0.0394
Pair 18 similarity: 0.1736
Pair 19 similarity: 0.2048
Pair 20 similarity: 0.3796
Pair 21 similarity: 0.4912
Pair 22 similarity: 0.3011
Pair 23 similarity: 0.6555
Pair 24 similarity: 0.2229
Pair 25 similarity: 0.4988
Pair 26 similarity: 0.0920
Pair 27 similarity: 0.2798
Pair 28 similarity: 0.1804
Pair 29 similarity: 0.3977
Pair 30 similarity: 0.0145
Pair 31 similarity: 0.3895
Pair 32 similarity: 0.1357
Pair 33 similarity: 0.1033
Pair 34 similarity: 0.3576
Pair 35 similarity: 0.7374
Pair 36 similarity: 0.1821
Pair 37 similarity: 0.2190
Pair 38 