In [37]:
import json
import re
import os

# =========================================================
# INPUT FILES
# =========================================================

RAG_FILES = [
    "/content/irish_life_health_RAG_chunks_2026.jsonl",
    "/content/laya_RAG_layout_aware.jsonl",
    "/content/LEVEL_RAG_layout_aware.jsonl",
    "/content/VHI_RAG_layout_aware.jsonl",
]

FULL_LAYOUT_FILES = [
    "/content/irish_life_health_FT_full_pages_2026.jsonl",
    "/content/laya_full_layout_pages.jsonl",
    "/content/LEVEL_full_layout_pages.jsonl",
    "/content/VHI_full_layout_pages.jsonl",
]

OUTPUT_PATH = "MASTER_STRUCTURED_SUPERSET_2026.jsonl"

if os.path.exists(OUTPUT_PATH):
    os.remove(OUTPUT_PATH)

# =========================================================
# REGEX PATTERNS
# =========================================================

EURO = re.compile(r"â‚¬\s?([\d,]+)")
PERCENT = re.compile(r"(\d+)\s?%")
DAYS = re.compile(r"(\d+)\s+days", re.IGNORECASE)
EXCESS = re.compile(r"â‚¬\s?([\d,]+)[^\n]*?excess", re.IGNORECASE)
COPAY = re.compile(r"â‚¬\s?([\d,]+)[^\n]*?(co-?payment|copay)", re.IGNORECASE)

def detect_scope(text):
    text = text.lower()
    if "lifetime" in text:
        return "lifetime"
    if "calendar year" in text or "policy year" in text or "renewal year" in text:
        return "annual"
    if "per visit" in text:
        return "per_visit"
    if "per admission" in text:
        return "per_admission"
    if "per night" in text:
        return "per_night"
    return None

def detect_flags(text):
    t = text.lower()
    return {
        "psychiatric_flag": "psychiatric" in t,
        "fertility_flag": "fertility" in t,
        "maternity_flag": "maternity" in t,
        "international_flag": "abroad" in t or "repatriation" in t,
        "high_tech_flag": "high-tech" in t or "blackrock" in t or "mater private" in t,
    }

def parse_chunk(provider, plan_name, page_number, text):

    excess = None
    copay = None
    percent = None
    limit = None
    day_limit = None

    m = EXCESS.search(text)
    if m:
        excess = float(m.group(1).replace(",", ""))

    m = COPAY.search(text)
    if m:
        copay = float(m.group(1).replace(",", ""))

    m = PERCENT.search(text)
    if m:
        percent = float(m.group(1))

    euros = EURO.findall(text)
    if euros:
        values = [float(v.replace(",", "")) for v in euros]
        limit = max(values)

    m = DAYS.search(text)
    if m:
        day_limit = int(m.group(1))

    scope = detect_scope(text)
    flags = detect_flags(text)

    return {
        "provider": provider,
        "plan_name": plan_name,
        "page_number": page_number,
        "excess_amount": excess,
        "copayment_amount": copay,
        "coverage_percentage": percent,
        "limit_amount": limit,
        "day_limit": day_limit,
        "limit_scope": scope,
        **flags
    }

# =========================================================
# PRIMARY EXTRACTION FROM RAG CHUNKS
# =========================================================

structured_rows = []

for file_path in RAG_FILES:

    if not os.path.exists(file_path):
        continue

    with open(file_path, "r") as f:
        for line in f:
            obj = json.loads(line)

            provider = obj.get("provider", "Unknown")
            plan_name = obj.get("plan_name")
            page_number = obj.get("page_number")
            text = obj.get("chunk_text", "")

            record = parse_chunk(provider, plan_name, page_number, text)

            structured_rows.append(record)

# =========================================================
# ENRICHMENT FROM FULL LAYOUT (MISSING SCOPE/FLAGS ONLY)
# =========================================================

for file_path in FULL_LAYOUT_FILES:

    if not os.path.exists(file_path):
        continue

    with open(file_path, "r") as f:
        for line in f:
            obj = json.loads(line)

            plan_name = obj.get("plan_name")
            text = obj.get("raw_text_with_tables") or obj.get("full_page_text") or ""

            scope = detect_scope(text)
            flags = detect_flags(text)

            for row in structured_rows:
                if row["plan_name"] == plan_name:

                    if row["limit_scope"] is None:
                        row["limit_scope"] = scope

                    for k, v in flags.items():
                        if not row[k]:
                            row[k] = v

# =========================================================
# DEDUPLICATE (benefit-level uniqueness)
# =========================================================

unique = {}
for row in structured_rows:
    key = (
        row["provider"],
        row["plan_name"],
        row["page_number"],
        row["excess_amount"],
        row["copayment_amount"],
        row["limit_amount"]
    )
    unique[key] = row

final_rows = list(unique.values())

# =========================================================
# SAVE
# =========================================================

with open(OUTPUT_PATH, "w") as out:
    for row in final_rows:
        out.write(json.dumps(row) + "\n")

print("======================================")
print("MASTER STRUCTURED SUPERSET CREATED")
print("Total benefit-level records:", len(final_rows))
print("Saved to:", OUTPUT_PATH)
print("======================================")

MASTER STRUCTURED SUPERSET CREATED
Total benefit-level records: 290
Saved to: MASTER_STRUCTURED_SUPERSET_2026.jsonl


In [38]:
import json

INPUT_PATH = "/content/MASTER_STRUCTURED_SUPERSET_2026.jsonl"
OUTPUT_PATH = "/content/MASTER_STRUCTURED_SUPERSET_2026-1.jsonl"

with open(INPUT_PATH, "r") as infile, open(OUTPUT_PATH, "w") as outfile:

    for i, line in enumerate(infile, start=1):
        obj = json.loads(line)

        # Lines 1â€“169 â†’ Irish Life Health
        if 1 <= i <= 169:
            obj["provider"] = "Irish Life Health"

        # Lines 170â€“189 â†’ Laya Healthcare
        elif 170 <= i <= 189:
            obj["provider"] = "Laya Healthcare"

        outfile.write(json.dumps(obj) + "\n")

print("Provider names updated successfully.")
print("Saved to:", OUTPUT_PATH)

Provider names updated successfully.
Saved to: /content/MASTER_STRUCTURED_SUPERSET_2026-1.jsonl


In [16]:
!pip install faiss-cpu sentence-transformers



In [39]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# =========================
# CONFIG
# =========================

DATA_FILES = {
    "/content/irish_life_health_RAG_chunks_2026.jsonl": "Irish Life Health",
    "/content/laya_RAG_layout_aware.jsonl": "Laya Healthcare",
    "/content/LEVEL_RAG_layout_aware.jsonl": "Level Health",
    "/content/VHI_RAG_layout_aware.jsonl": "VHI",
}

STRUCTURED_PATH = "/content/MASTER_STRUCTURED_SUPERSET_2026-1.jsonl"

INDEX_PATH = "faiss_multi_provider_index.bin"
METADATA_PATH = "metadata_multi_provider.json"

# =========================
# LOAD MODEL
# =========================

model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

documents = []
metadata = []

# =========================
# LOAD RAG FILES (CLEAN PROVIDER FIX)
# =========================

for file_path, provider in DATA_FILES.items():
    print(f"Loading: {file_path} â†’ {provider}")

    with open(file_path, "r") as f:
        for line in f:
            obj = json.loads(line)

            plan_name = obj.get("plan_name", "").lower()

            # EXCLUDE NON-PLAN DOCUMENTS
            if any(x in plan_name for x in ["membership", "terms", "rules"]):
                continue

            chunk_text = obj["chunk_text"]
            page_number = obj.get("page_number")

            enriched_chunk = (
                f"[Provider: {provider}]\n"
                f"[Plan: {plan_name}]\n"
                f"[Page: {page_number}]\n\n"
                f"{chunk_text}"
            )

            documents.append(enriched_chunk)

            metadata.append({
                "provider": provider,
                "plan_name": plan_name,
                "page_number": page_number,
                "chunk_text": chunk_text
            })

# =========================
# CREATE EMBEDDINGS
# =========================

embeddings = model.encode(
    documents,
    batch_size=32,
    show_progress_bar=True,
    normalize_embeddings=True
)

embeddings = np.array(embeddings).astype("float32")

# =========================
# BUILD FAISS INDEX
# =========================

dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

faiss.write_index(index, INDEX_PATH)

with open(METADATA_PATH, "w") as f:
    json.dump(metadata, f)

print("\nðŸ”¥ Clean multi-provider FAISS index built successfully.")

# ==========================================================
# LOAD STRUCTURED DATA
# ==========================================================

structured_data = {}

with open(STRUCTURED_PATH, "r") as f:
    for line in f:
        obj = json.loads(line)
        plan = obj["plan_name"]
        structured_data.setdefault(plan, []).append(obj)

print("Structured dataset loaded.")

# ==========================================================
# STRUCTURED SCORING FUNCTION (FIXED FALLBACK)
# ==========================================================

def compute_structured_score(plan_name):

    if plan_name not in structured_data:
        return 0.5

    plan_entries = structured_data[plan_name]

    excess_values = []
    coverage_percentages = []
    caps = []

    for entry in plan_entries:

        if entry.get("excess_amount") is not None:
            excess_values.append(float(entry["excess_amount"]))

        if entry.get("coverage_percentage") is not None:
            coverage_percentages.append(float(entry["coverage_percentage"]))

        if entry.get("limit_amount") is not None:
            caps.append(float(entry["limit_amount"]))

    if not excess_values and not coverage_percentages and not caps:
        return 0.5

    avg_excess = np.mean(excess_values) if excess_values else 0
    avg_coverage = np.mean(coverage_percentages) if coverage_percentages else 0
    avg_cap = np.mean(caps) if caps else 0

    excess_score = 1 / (1 + avg_excess / 500)
    coverage_score = avg_coverage / 100
    cap_score = min(avg_cap / 10000, 1)

    structured_score = (
        0.4 * excess_score +
        0.4 * coverage_score +
        0.2 * cap_score
    )

    return structured_score

# ==========================================================
# BUILD PROVIDER LOOKUP MAP (FAST)
# ==========================================================

plan_provider_map = {m["plan_name"]: m["provider"] for m in metadata}

# ==========================================================
# HYBRID SEARCH FUNCTION (NORMALIZED RAG)
# ==========================================================

def search(query, k=20):

    prefixed_query = (
        "Represent this sentence for searching relevant passages: "
        + query
    )

    query_embedding = model.encode(
        [prefixed_query],
        normalize_embeddings=True
    )

    query_embedding = np.array(query_embedding).astype("float32")

    scores, indices = index.search(query_embedding, k)

    plan_scores = {}
    plan_best_chunk = {}

    for score, idx in zip(scores[0], indices[0]):
        plan = metadata[idx]["plan_name"]

        if plan not in plan_scores:
            plan_scores[plan] = 0
            plan_best_chunk[plan] = metadata[idx]["chunk_text"]

        plan_scores[plan] += float(score)

    results = []

    for plan, rag_score in plan_scores.items():

        # Normalize RAG score
        normalized_rag = rag_score / (rag_score + 5)

        structured_score = compute_structured_score(plan)

        final_score = normalized_rag * 0.6 + structured_score * 0.4

        results.append({
            "plan_name": plan,
            "provider": plan_provider_map.get(plan, "Unknown"),
            "rag_score": round(normalized_rag, 4),
            "structured_score": round(structured_score, 4),
            "final_score": round(final_score, 4),
            "best_matching_chunk": plan_best_chunk[plan]
        })

    results = sorted(results, key=lambda x: x["final_score"], reverse=True)

    return results[:5]

# =========================
# TEST QUERY
# =========================

query = """
Coverage for cardiac treatment, frequent hospital visits,
consultant fees, medication support,
and minimal excess for inpatient admission.
"""

results = search(query)

for r in results:
    print("\n==============================")
    print("Plan:", r["plan_name"])
    print("Provider:", r["provider"])
    print("RAG Score:", r["rag_score"])
    print("Structured Score:", r["structured_score"])
    print("Final Score:", r["final_score"])
    print("Matched Section Preview:")
    print(r["best_matching_chunk"][:500])

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

Loading: /content/irish_life_health_RAG_chunks_2026.jsonl â†’ Irish Life Health
Loading: /content/laya_RAG_layout_aware.jsonl â†’ Laya Healthcare
Loading: /content/LEVEL_RAG_layout_aware.jsonl â†’ Level Health
Loading: /content/VHI_RAG_layout_aware.jsonl â†’ VHI


Batches:   0%|          | 0/3 [00:00<?, ?it/s]


ðŸ”¥ Clean multi-provider FAISS index built successfully.
Structured dataset loaded.

Plan: vhi_company_plan_plus_level_1_table_of_cover_2024-10-01
Provider: VHI
RAG Score: 0.2239
Structured Score: 0.7762
Final Score: 0.4448
Matched Section Preview:
Table of Benefits â€“ Company Plan Plus Level 1 Applicable to new registrations or renewals on/or after 1st October, 2024. This Table of Benefits must be read in conjunction with your Company Plan Terms and Conditions and the Directory of Approved Medical Facilities. Facilities may change from time to time, so log on to Vhi.ie or phone us on (056) 444 4444 if you are planning treatment. ~ Benefit Provision Benefit Section 1 - Hospital charges A Public 1 & 2 hospitals ï‚· Day care, side room, semi-pr

Plan: laya_healthcare_pmi_plan_b_table_of_benefits_table_of_cover_2021-10-01
Provider: Laya Healthcare
RAG Score: 0.1257
Structured Score: 0.8436
Final Score: 0.4129
Matched Section Preview:
PMI 01 10 When carried out as a Fixed Price Procedur

In [40]:
import json

with open("metadata_multi_provider.json") as f:
    data = json.load(f)

print(set([d["provider"] for d in data]))

{'VHI', 'Irish Life Health', 'Level Health', 'Laya Healthcare'}


In [41]:
import json
import numpy as np
import faiss
from collections import defaultdict
from sentence_transformers import SentenceTransformer

# ----------------------------
# CONFIG
# ----------------------------

INDEX_PATH = "faiss_multi_provider_index.bin"
METADATA_PATH = "metadata_multi_provider.json"
STRUCTURED_PATH = "MASTER_STRUCTURED_SUPERSET_2026-1.jsonl"

TOP_K = 3

# ----------------------------
# LOAD MODEL + INDEX
# ----------------------------

print("Loading embedding model...")
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

print("Loading FAISS index...")
index = faiss.read_index(INDEX_PATH)

print("Loading metadata...")
with open(METADATA_PATH, "r") as f:
    metadata = json.load(f)

# ----------------------------
# LOAD STRUCTURED DATA
# ----------------------------

structured_data = {}

with open(STRUCTURED_PATH, "r") as f:
    for line in f:
        obj = json.loads(line)
        plan = obj["plan_name"]
        structured_data.setdefault(plan, []).append(obj)

print("Structured dataset loaded.")

# ----------------------------
# STRUCTURED SCORING
# ----------------------------

def compute_structured_score(plan_name):

    if plan_name not in structured_data:
        return 0.5

    plan_entries = structured_data[plan_name]

    excess_values = []
    coverage_percentages = []
    caps = []

    for entry in plan_entries:
        if entry.get("excess_amount"):
            excess_values.append(float(entry["excess_amount"]))

        if entry.get("coverage_percentage"):
            coverage_percentages.append(float(entry["coverage_percentage"]))

        if entry.get("limit_amount"):
            caps.append(float(entry["limit_amount"]))

    avg_excess = np.mean(excess_values) if excess_values else 0
    avg_coverage = np.mean(coverage_percentages) if coverage_percentages else 100
    avg_cap = np.mean(caps) if caps else 0

    excess_score = 1 / (1 + avg_excess / 500)
    coverage_score = avg_coverage / 100
    cap_score = min(avg_cap / 10000, 1)

    structured_score = (
        0.4 * excess_score +
        0.4 * coverage_score +
        0.2 * cap_score
    )

    return structured_score

# ----------------------------
# HYBRID SEARCH FUNCTION
# ----------------------------

def search(query, k=20):

    prefixed_query = (
        "Represent this sentence for searching relevant passages: "
        + query
    )

    query_embedding = model.encode(
        [prefixed_query],
        normalize_embeddings=True
    )

    query_embedding = np.array(query_embedding).astype("float32")

    scores, indices = index.search(query_embedding, k)

    plan_scores = defaultdict(float)

    for score, idx in zip(scores[0], indices[0]):
        plan_name = metadata[idx]["plan_name"]
        plan_scores[plan_name] += float(score)

    hybrid_results = []

    for plan_name, rag_score in plan_scores.items():
        structured_score = compute_structured_score(plan_name)
        final_score = rag_score * 0.6 + structured_score * 0.4

        hybrid_results.append((plan_name, final_score))

    hybrid_results.sort(key=lambda x: x[1], reverse=True)

    return [plan for plan, _ in hybrid_results[:TOP_K]]

# ----------------------------
# PLAN NAME RESOLUTION
# ----------------------------

def get_plan_name(plan_keyword):
    matches = [
        m["plan_name"]
        for m in metadata
        if plan_keyword.lower() in m["plan_name"].lower()
    ]
    if not matches:
        raise ValueError(f"No plan found for keyword: {plan_keyword}")
    return matches[0]

# ----------------------------
# BUILD EVALUATION SET
# ----------------------------

evaluation_set = [
    {
        "query": "Does Horizon 4 cover inpatient consultant fees?",
        "expected_plan": "Horizon 4"
    },
    {
        "query": "Which plan has a â‚¬300 excess for semi-private room admission?",
        "expected_plan": "Level Health Plan B with 300 Excess"
    },
    {
        "query": "How many days of psychiatric treatment are covered under Plan A?",
        "expected_plan": "Level Health Plan A"
    },
    {
        "query": "Are inpatient scans fully covered under Health Plan 26.1?",
        "expected_plan": "Health Plan 26.1"
    }
]

# ----------------------------
# METRIC COMPUTATION
# ----------------------------

correct_top1 = 0
correct_topk = 0
reciprocal_ranks = []

print("\nRunning evaluation...\n")

for item in evaluation_set:
    query = item["query"]
    expected = item["expected_plan"]

    returned_plans = search(query)

    print("Query:", query)
    print("Expected:", expected)
    print("Returned:", returned_plans)
    print("-" * 60)

    if returned_plans and returned_plans[0] == expected:
        correct_top1 += 1

    if expected in returned_plans:
        correct_topk += 1
        rank = returned_plans.index(expected) + 1
        reciprocal_ranks.append(1.0 / rank)
    else:
        reciprocal_ranks.append(0.0)

# ----------------------------
# FINAL METRICS
# ----------------------------

total = len(evaluation_set)

top1_accuracy = correct_top1 / total
topk_recall = correct_topk / total
mrr = sum(reciprocal_ranks) / total

print("\n===== FINAL METRICS =====")
print(f"Top-1 Accuracy: {top1_accuracy:.3f}")
print(f"Top-{TOP_K} Recall: {topk_recall:.3f}")
print(f"MRR: {mrr:.3f}")

Loading embedding model...


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

Loading FAISS index...
Loading metadata...
Structured dataset loaded.

Running evaluation...

Query: Does Horizon 4 cover inpatient consultant fees?
Expected: Horizon 4
Returned: ['horizon_4_table_of_cover', 'level_health_plan_b_with_150_excess_table_of_cover__table_of_cover_2025-06-27', 'vhi_company_plan_plus_level_1_table_of_cover_2024-10-01']
------------------------------------------------------------
Query: Which plan has a â‚¬300 excess for semi-private room admission?
Expected: Level Health Plan B with 300 Excess
Returned: ['level_health_plan_d_table_of_cover__table_of_cover_2025-06-27', 'vhi_company_plan_plus_level_1_table_of_cover_2024-10-01', 'vhi_health_access_table_of_cover_2023-12-31']
------------------------------------------------------------
Query: How many days of psychiatric treatment are covered under Plan A?
Expected: Level Health Plan A
Returned: ['vhi_health_access_table_of_cover_2023-12-31', 'laya_healthcare_pmi_plan_b_table_of_benefits_table_of_cover_2021-10-01

In [29]:
! pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [42]:
import json
import numpy as np
import re
from collections import defaultdict
from rank_bm25 import BM25Okapi

# ----------------------------
# CONFIG
# ----------------------------

METADATA_PATH = "metadata_multi_provider.json"
TOP_K = 3
CHUNK_TOP_K = 50  # how many chunks to consider before plan aggregation

# ----------------------------
# LOAD METADATA
# ----------------------------

print("Loading metadata...")
with open(METADATA_PATH, "r") as f:
    metadata = json.load(f)

# ----------------------------
# FILTER OUT NON-PLAN DOCUMENTS
# ----------------------------

filtered_metadata = []
for m in metadata:
    name = m["plan_name"].lower()
    if any(x in name for x in ["membership", "terms", "rules"]):
        continue
    filtered_metadata.append(m)

metadata = filtered_metadata

print(f"Using {len(metadata)} filtered chunks")

# ----------------------------
# TOKENIZATION
# ----------------------------

def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9â‚¬]+", " ", text)
    return text.split()

corpus = [m["chunk_text"] for m in metadata]
tokenized_corpus = [tokenize(doc) for doc in corpus]

print("Building BM25 index...")
bm25 = BM25Okapi(tokenized_corpus)

# ----------------------------
# SEARCH FUNCTION
# ----------------------------

def search_bm25(query):

    tokenized_query = tokenize(query)
    scores = bm25.get_scores(tokenized_query)

    # Take only top chunk-level hits
    top_chunk_indices = np.argsort(scores)[::-1][:CHUNK_TOP_K]

    plan_scores = defaultdict(float)

    for idx in top_chunk_indices:
        plan = metadata[idx]["plan_name"]
        plan_scores[plan] += scores[idx]

    # Normalize BM25 score
    results = []
    for plan, score in plan_scores.items():
        normalized = score / (score + 10)
        results.append((plan, normalized))

    results.sort(key=lambda x: x[1], reverse=True)

    return [plan for plan, _ in results[:TOP_K]]

# ----------------------------
# EXACT EVALUATION SET
# ----------------------------

evaluation_set = [
    {
        "query": "Does Horizon 4 cover inpatient consultant fees?",
        "expected_plan": "Horizon 4"
    },
    {
        "query": "Which plan has a â‚¬300 excess for semi-private room admission?",
        "expected_plan": "Level Health Plan B with 300 Excess"
    },
    {
        "query": "How many days of psychiatric treatment are covered under Plan A?",
        "expected_plan": "Level Health Plan A"
    },
    {
        "query": "Are inpatient scans fully covered under Health Plan 26.1?",
        "expected_plan": "Health Plan 26.1"
    }
]

# ----------------------------
# METRICS
# ----------------------------

correct_top1 = 0
correct_topk = 0
reciprocal_ranks = []

print("\nRunning BM25 Clean Evaluation...\n")

for item in evaluation_set:
    query = item["query"]
    expected = item["expected_plan"]

    returned = search_bm25(query)

    print("Query:", query)
    print("Returned:", returned)
    print("-" * 60)

    if returned and returned[0] == expected:
        correct_top1 += 1

    if expected in returned:
        correct_topk += 1
        rank = returned.index(expected) + 1
        reciprocal_ranks.append(1.0 / rank)
    else:
        reciprocal_ranks.append(0.0)

total = len(evaluation_set)

print("\n===== BM25 Clean =====")
print("Top-1:", correct_top1 / total)
print("Top-3:", correct_topk / total)
print("MRR:", sum(reciprocal_ranks) / total)

Loading metadata...
Using 76 filtered chunks
Building BM25 index...

Running BM25 Clean Evaluation...

Query: Does Horizon 4 cover inpatient consultant fees?
Returned: ['horizon_4_table_of_cover', 'health_plan_26.1_table_of_cover', 'vhi_company_plan_plus_level_1_table_of_cover_2024-10-01']
------------------------------------------------------------
Query: Which plan has a â‚¬300 excess for semi-private room admission?
Returned: ['laya_healthcare_first_family_plan_table_of_benefits_table_of_cover_2021-02-01', 'vhi_company_plan_plus_level_1_table_of_cover_2024-10-01', 'laya_healthcare_pmi_plan_b_table_of_benefits_table_of_cover_2021-10-01']
------------------------------------------------------------
Query: How many days of psychiatric treatment are covered under Plan A?
Returned: ['vhi_company_plan_plus_level_1_table_of_cover_2024-10-01', 'laya_healthcare_pmi_plan_b_table_of_benefits_table_of_cover_2021-10-01', 'horizon_4_table_of_cover']
-----------------------------------------------

In [43]:
import json
import numpy as np
import faiss
import re
from collections import defaultdict
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

# ----------------------------
# CONFIG
# ----------------------------

INDEX_PATH = "faiss_multi_provider_index.bin"
METADATA_PATH = "metadata_multi_provider.json"

TOP_K = 3
ALPHA = 0.6
BETA = 0.4
CHUNK_TOP_K = 50

# ----------------------------
# LOAD DATA
# ----------------------------

print("Loading MXBAI model...")
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

print("Loading FAISS index...")
index = faiss.read_index(INDEX_PATH)

print("Loading metadata...")
with open(METADATA_PATH, "r") as f:
    metadata = json.load(f)

# ----------------------------
# FILTER NON-PLAN DOCS
# ----------------------------

filtered_metadata = []
for m in metadata:
    name = m["plan_name"].lower()
    if any(x in name for x in ["membership", "terms", "rules"]):
        continue
    filtered_metadata.append(m)

metadata = filtered_metadata

print(f"Using {len(metadata)} filtered chunks")

# ----------------------------
# TOKENIZATION
# ----------------------------

def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9â‚¬]+", " ", text)
    return text.split()

corpus = [m["chunk_text"] for m in metadata]
tokenized_corpus = [tokenize(doc) for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)

# ----------------------------
# NORMALIZATION
# ----------------------------

def normalize(scores_dict):
    if not scores_dict:
        return {}
    scores = np.array(list(scores_dict.values()))
    min_s = scores.min()
    max_s = scores.max()
    normalized = {}
    for k, v in scores_dict.items():
        if max_s - min_s == 0:
            normalized[k] = 0.0
        else:
            normalized[k] = (v - min_s) / (max_s - min_s)
    return normalized

# ----------------------------
# HYBRID SEARCH
# ----------------------------

def hybrid_search(query, k=TOP_K):

    # -------- DENSE --------
    prefixed_query = "Represent this sentence for searching relevant passages: " + query
    query_embedding = model.encode([prefixed_query], normalize_embeddings=True)
    query_embedding = np.array(query_embedding).astype("float32")

    dense_scores, dense_indices = index.search(query_embedding, CHUNK_TOP_K)

    dense_plan_scores = defaultdict(float)

    for score, idx in zip(dense_scores[0], dense_indices[0]):
        plan = metadata[idx]["plan_name"]
        dense_plan_scores[plan] += float(score)

    # -------- BM25 --------
    tokenized_query = tokenize(query)
    bm25_scores_all = bm25.get_scores(tokenized_query)

    top_chunk_indices = np.argsort(bm25_scores_all)[::-1][:CHUNK_TOP_K]

    bm25_plan_scores = defaultdict(float)

    for idx in top_chunk_indices:
        plan = metadata[idx]["plan_name"]
        bm25_plan_scores[plan] += bm25_scores_all[idx]

    # -------- NORMALIZE --------
    dense_norm = normalize(dense_plan_scores)
    bm25_norm = normalize(bm25_plan_scores)

    # -------- FUSION --------
    final_scores = {}

    all_plans = set(dense_norm.keys()) | set(bm25_norm.keys())

    for plan in all_plans:
        d = dense_norm.get(plan, 0)
        b = bm25_norm.get(plan, 0)
        final_scores[plan] = ALPHA * d + BETA * b

    ranked = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)

    return [plan for plan, _ in ranked[:k]]

# ----------------------------
# EXACT EVALUATION SET
# ----------------------------

evaluation_set = [
    {
        "query": "Does Horizon 4 cover inpatient consultant fees?",
        "expected_plan": "Horizon 4"
    },
    {
        "query": "Which plan has a â‚¬300 excess for semi-private room admission?",
        "expected_plan": "Level Health Plan B with 300 Excess"
    },
    {
        "query": "How many days of psychiatric treatment are covered under Plan A?",
        "expected_plan": "Level Health Plan A"
    },
    {
        "query": "Are inpatient scans fully covered under Health Plan 26.1?",
        "expected_plan": "Health Plan 26.1"
    }
]

# ----------------------------
# METRICS
# ----------------------------

correct_top1 = 0
correct_topk = 0
reciprocal_ranks = []

print("\nRunning Clean Hybrid Evaluation...\n")

for item in evaluation_set:
    query = item["query"]
    expected = item["expected_plan"]

    returned = hybrid_search(query)

    print("Query:", query)
    print("Returned:", returned)
    print("-" * 60)

    if returned and returned[0] == expected:
        correct_top1 += 1

    if expected in returned:
        correct_topk += 1
        rank = returned.index(expected) + 1
        reciprocal_ranks.append(1.0 / rank)
    else:
        reciprocal_ranks.append(0.0)

total = len(evaluation_set)

print("\n===== CLEAN HYBRID RESULTS =====")
print("Top-1:", correct_top1 / total)
print("Top-3:", correct_topk / total)
print("MRR:", sum(reciprocal_ranks) / total)

Loading MXBAI model...


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

Loading FAISS index...
Loading metadata...
Using 76 filtered chunks

Running Clean Hybrid Evaluation...

Query: Does Horizon 4 cover inpatient consultant fees?
Returned: ['horizon_4_table_of_cover', 'vhi_company_plan_plus_level_1_table_of_cover_2024-10-01', 'vhi_health_access_table_of_cover_2023-12-31']
------------------------------------------------------------
Query: Which plan has a â‚¬300 excess for semi-private room admission?
Returned: ['vhi_company_plan_plus_level_1_table_of_cover_2024-10-01', 'laya_healthcare_pmi_plan_b_table_of_benefits_table_of_cover_2021-10-01', 'level_health_plan_d_table_of_cover__table_of_cover_2025-06-27']
------------------------------------------------------------
Query: How many days of psychiatric treatment are covered under Plan A?
Returned: ['vhi_company_plan_plus_level_1_table_of_cover_2024-10-01', 'laya_healthcare_pmi_plan_b_table_of_benefits_table_of_cover_2021-10-01', 'horizon_4_table_of_cover']
-------------------------------------------------

In [52]:
import json
import numpy as np
import faiss
from collections import defaultdict
from sentence_transformers import SentenceTransformer

# ======================================================
# CONFIG
# ======================================================

INDEX_PATH = "faiss_multi_provider_index.bin"
METADATA_PATH = "metadata_multi_provider.json"
STRUCTURED_PATH = "MASTER_STRUCTURED_SUPERSET_2026-1.jsonl"

TOP_K = 3

# ======================================================
# LOAD MODEL + DATA
# ======================================================

print("Loading embedding model...")
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

print("Loading FAISS index...")
index = faiss.read_index(INDEX_PATH)

print("Loading metadata...")
with open(METADATA_PATH, "r") as f:
    metadata = json.load(f)

# Remove legal documents (prevents corpus pollution bias)
metadata = [
    m for m in metadata
    if not any(x in m["plan_name"].lower()
               for x in ["membership", "terms", "rules"])
]

plan_names = sorted(list(set(m["plan_name"] for m in metadata)))
print(f"Loaded {len(plan_names)} plans")

# ======================================================
# LOAD STRUCTURED DATA
# ======================================================

structured_data = defaultdict(list)

with open(STRUCTURED_PATH, "r") as f:
    for line in f:
        obj = json.loads(line)
        structured_data[obj["plan_name"]].append(obj)

print("Structured dataset loaded.")

# ======================================================
# DENSE RELEVANCE (Relevance Multiplier Only)
# ======================================================

def compute_dense_scores(query):

    prefixed = "Represent this sentence for searching relevant passages: " + query
    q_emb = model.encode([prefixed], normalize_embeddings=True)
    q_emb = np.array(q_emb).astype("float32")

    scores, indices = index.search(q_emb, 50)

    plan_scores = defaultdict(float)

    for score, idx in zip(scores[0], indices[0]):
        plan = metadata[idx]["plan_name"]
        plan_scores[plan] += float(score)

    # Normalize 0â€“1
    if plan_scores:
        vals = np.array(list(plan_scores.values()))
        min_v, max_v = vals.min(), vals.max()
        for k in plan_scores:
            plan_scores[k] = (plan_scores[k] - min_v) / (max_v - min_v + 1e-8)

    return plan_scores

# ======================================================
# FINANCIAL SCORE (Ratio-Based, No Density Bias)
# ======================================================

def compute_financial_score(plan):

    entries = structured_data.get(plan, [])
    if not entries:
        return 0.5, {}

    excess_vals = []
    coverage_vals = []
    cap_vals = []

    for e in entries:
        if e.get("excess_amount") is not None:
            excess_vals.append(float(e["excess_amount"]))
        if e.get("coverage_percentage") is not None:
            coverage_vals.append(float(e["coverage_percentage"]))
        if e.get("limit_amount") is not None:
            cap_vals.append(float(e["limit_amount"]))

    avg_excess = np.mean(excess_vals) if excess_vals else 0
    avg_cov = np.mean(coverage_vals) if coverage_vals else 0
    avg_cap = np.mean(cap_vals) if cap_vals else 0

    # Strong but symmetric penalty
    excess_score = max(0, 1 - avg_excess / 800)

    coverage_score = avg_cov / 100
    cap_score = min(avg_cap / 10000, 1)

    financial_score = (
        0.5 * excess_score +
        0.3 * coverage_score +
        0.2 * cap_score
    )

    breakdown = {
        "avg_excess": round(avg_excess, 2),
        "excess_score": round(excess_score, 2),
        "coverage_score": round(coverage_score, 2),
        "cap_score": round(cap_score, 2)
    }

    return financial_score, breakdown

# ======================================================
# DISEASE FIT SCORE (Symmetric Rules)
# ======================================================

def compute_disease_score(plan, profile):

    entries = structured_data.get(plan, [])
    score = 0
    breakdown = {}

    for condition in profile["conditions"]:

        # HEART
        if condition == "heart_disease":
            if any(e.get("high_tech_flag") for e in entries):
                score += 8
                breakdown["heart_high_tech"] = 8
            else:
                score -= 6
                breakdown["heart_penalty"] = -6

        # CANCER
        if condition == "cancer":
            if any(e.get("high_tech_flag") for e in entries):
                score += 10
                breakdown["cancer_high_tech"] = 10
            else:
                score -= 6
                breakdown["cancer_penalty"] = -6

        # PSYCHIATRIC
        if condition == "psychiatric_disorder":
            days = [
                float(e["day_limit"])
                for e in entries
                if e.get("day_limit") is not None
            ]
            if days:
                val = min(max(days), 150) / 10
                score += val
                breakdown["psych_days_bonus"] = round(val, 2)
            else:
                score -= 4
                breakdown["psych_penalty"] = -4

        # PREGNANCY
        if condition == "pregnancy":
          maternity_entries = [e for e in entries
        if (
            "maternity" in str(e.get("section","")).lower()
            or
            "maternity" in str(e.get("subsection","")).lower()
            or
            "maternity" in str(e.get("benefit_name","")).lower()
        )
    ]
          if maternity_entries:
            score += 8
            breakdown["maternity_bonus"] = 8
          else:
            score -= 6
            breakdown["maternity_penalty"] = -6

    return score, breakdown

# ======================================================
# FINAL BIAS-NEUTRAL RECOMMENDER
# ======================================================

def recommend_plans(query, profile):

    dense_scores = compute_dense_scores(query)

    results = []

    for plan in plan_names:

        dense = dense_scores.get(plan, 0)
        financial, fin_break = compute_financial_score(plan)
        disease_raw, dis_break = compute_disease_score(plan, profile)

        # Scale disease to 0â€“1 safely
        disease_scaled = min(max((disease_raw + 10) / 30, 0), 1)

        # Suitability independent of text verbosity
        suitability = (
            0.5 * financial +
            0.5 * disease_scaled
        )

        # Dense only boosts relevance
        relevance_multiplier = 0.8 + 0.2 * dense

        final_score = suitability * relevance_multiplier

        results.append({
            "plan": plan,
            "final": final_score,
            "dense": dense,
            "financial": financial,
            "disease_raw": disease_raw,
            "disease_scaled": disease_scaled,
            "financial_breakdown": fin_break,
            "disease_breakdown": dis_break
        })

    results.sort(key=lambda x: x["final"], reverse=True)

    return results[:TOP_K]

# ======================================================
# MULTI-SCENARIO TEST
# ======================================================

SCENARIOS = {
    "Cardiac Case": {
        "profile": {"conditions": ["heart_disease"]},
        "query": "Cardiac procedures, high-tech hospital, consultant coverage"
    },
    "Cancer Case": {
        "profile": {"conditions": ["cancer"]},
        "query": "Oncology treatment, chemotherapy, inpatient admission"
    },
    "Psychiatric Case": {
        "profile": {"conditions": ["psychiatric_disorder"]},
        "query": "Mental health inpatient days coverage"
    },
    "Pregnancy Case": {
        "profile": {"conditions": ["pregnancy"]},
        "query": "Maternity cover, hospital delivery, obstetric services"
    }
}

print("\n====================================")
print("BIAS-NEUTRAL MULTI-SCENARIO ENGINE (V3)")
print("====================================")

for name, scenario in SCENARIOS.items():

    print(f"\n==============================")
    print(f"Scenario: {name}")
    print("==============================")

    top_plans = recommend_plans(
        scenario["query"],
        scenario["profile"]
    )

    for rank, r in enumerate(top_plans, 1):

        print(f"\n{rank}. {r['plan']}")
        print("   Final Score:", round(r["final"], 3))
        print("   Dense:", round(r["dense"], 3))
        print("   Financial:", round(r["financial"], 3))
        print("   Disease Raw:", round(r["disease_raw"], 2))
        print("   Disease Scaled:", round(r["disease_scaled"], 3))
        print("   Financial Breakdown:", r["financial_breakdown"])
        print("   Disease Breakdown:", r["disease_breakdown"])

Loading embedding model...


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

Loading FAISS index...
Loading metadata...
Loaded 16 plans
Structured dataset loaded.

BIAS-NEUTRAL MULTI-SCENARIO ENGINE (V3)

Scenario: Cardiac Case

1. select_starter_table_of_cover
   Final Score: 0.623
   Dense: 0.338
   Financial: 0.836
   Disease Raw: 8
   Disease Scaled: 0.6
   Financial Breakdown: {'avg_excess': np.float64(60.0), 'excess_score': np.float64(0.92), 'coverage_score': np.float64(0.58), 'cap_score': 1}
   Disease Breakdown: {'heart_high_tech': 8}

2. horizon_4_table_of_cover
   Final Score: 0.59
   Dense: 0.17
   Financial: 0.816
   Disease Raw: 8
   Disease Scaled: 0.6
   Financial Breakdown: {'avg_excess': np.float64(55.0), 'excess_score': np.float64(0.93), 'coverage_score': np.float64(0.5), 'cap_score': 1}
   Disease Breakdown: {'heart_high_tech': 8}

3. first_cover_table_of_cover
   Final Score: 0.589
   Dense: 0.168
   Financial: 0.812
   Disease Raw: 8
   Disease Scaled: 0.6
   Financial Breakdown: {'avg_excess': np.float64(60.0), 'excess_score': np.float64(0

In [55]:
# =====================================================
# LLM EXPLANATION BLOCK (NO RANKING CHANGES)
# =====================================================

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import json

MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
MAX_NEW_TOKENS = 170

print("\nLoading Explanation LLM...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

llm_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)

llm_model.eval()

print("LLM loaded.")


Loading Explanation LLM...


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/434 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

LLM loaded.


In [65]:
def retrieve_plan_evidence(plan, query, top_n=3):

    prefixed = "Represent this sentence for searching relevant passages: " + query
    q_emb = model.encode([prefixed], normalize_embeddings=True)
    q_emb = np.array(q_emb).astype("float32")

    scores, indices = index.search(q_emb, 50)

    plan_chunks = []

    for score, idx in zip(scores[0], indices[0]):
        meta = metadata[idx]

        if meta["plan_name"] == plan:
            plan_chunks.append({
                "score": float(score),
                "text": meta.get("chunk_text", ""),
                "source": meta.get("source", "")
            })

        if len(plan_chunks) >= top_n:
            break

    return plan_chunks

def map_chunk_to_structured(plan, chunk_text):

    entries = structured_data.get(plan, [])
    matched = []

    chunk_lower = chunk_text.lower()

    for e in entries:
        benefit_text = " ".join([
            str(e.get("section","")),
            str(e.get("subsection","")),
            str(e.get("benefit_name",""))
        ]).lower()

        if any(word in chunk_lower for word in benefit_text.split()):
            matched.append(e)

    return matched[:3]

In [66]:
def generate_explanation(plan_data, query, profile):

    rag_chunks = retrieve_plan_evidence(plan_data["plan"], query)

    mapped_structured = []

    for chunk in rag_chunks:
        mapped_structured.extend(
            map_chunk_to_structured(plan_data["plan"], chunk["text"])
        )

    evidence = {
        "retrieved_chunks": rag_chunks,
        "structured_matches": mapped_structured
    }

    prompt = f"""
You are a medical insurance explanation engine.

STRICT RULES:
- Use ONLY retrieved chunks and structured matches.
- Do NOT invent coverage.
- Do NOT compare plans.
- Do NOT mention ranking.
- Maximum 150 words.
- Plain text only.

PATIENT PROFILE:
{profile}

PLAN:
{plan_data['plan']}

RAG CHUNKS:
{json.dumps(rag_chunks, indent=2)}

STRUCTURED MATCHES:
{json.dumps(mapped_structured, indent=2)}

Explain how this plan covers the patient's condition using only this evidence.
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=160,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if decoded.startswith(prompt):
        decoded = decoded[len(prompt):]

    return decoded.strip()

In [61]:
def build_condition_evidence(plan, profile):

    entries = structured_data.get(plan, [])
    matched_entries = []

    for e in entries:

        text_blob = " ".join([
            str(e.get("section", "")),
            str(e.get("subsection", "")),
            str(e.get("benefit_name", "")),
            str(e.get("description", ""))
        ]).lower()

        for condition in profile["conditions"]:

            # Direct keyword match
            if condition.replace("_", " ") in text_blob:
                matched_entries.append(e)
                continue

            # Domain-specific matching
            if condition == "heart_disease":
                if any(k in text_blob for k in [
                    "cardiac", "heart", "angioplasty",
                    "stent", "bypass", "cardiology"
                ]):
                    matched_entries.append(e)

            if condition == "cancer":
                if any(k in text_blob for k in [
                    "oncology", "chemotherapy",
                    "radiotherapy", "cancer",
                    "tumour", "malignant"
                ]):
                    matched_entries.append(e)

            if condition == "psychiatric_disorder":
                if any(k in text_blob for k in [
                    "psychiatric", "mental health",
                    "inpatient psychiatric",
                    "psychological"
                ]):
                    matched_entries.append(e)

            if condition == "pregnancy":
                if any(k in text_blob for k in [
                    "maternity", "obstetric",
                    "pregnancy", "delivery",
                    "antenatal"
                ]):
                    matched_entries.append(e)

    # Remove duplicates while preserving order
    seen = set()
    unique_entries = []
    for entry in matched_entries:
        entry_id = json.dumps(entry, sort_keys=True)
        if entry_id not in seen:
            seen.add(entry_id)
            unique_entries.append(entry)

    # Limit prompt size
    return unique_entries[:6]

In [67]:
# ======================================================
# ADVANCED MULTI-SCENARIO EVALUATION (V3-COMPATIBLE)
# ======================================================

SCENARIOS = {
    "Cardiac Case": {
        "profile": {"conditions": ["heart_disease"]},
        "query": "Cardiac procedures, high-tech hospital, consultant coverage"
    },
    "Cancer Case": {
        "profile": {"conditions": ["cancer"]},
        "query": "Oncology treatment, chemotherapy, inpatient admission"
    },
    "Psychiatric Case": {
        "profile": {"conditions": ["psychiatric_disorder"]},
        "query": "Mental health inpatient days coverage"
    },
    "Pregnancy Case": {
        "profile": {"conditions": ["pregnancy"]},
        "query": "Maternity cover, hospital delivery, obstetric services"
    }
}

print("\n====================================")
print("ADVANCED MULTI-SCENARIO EVALUATION (V3)")
print("====================================")

plan_win_counter = defaultdict(int)
plan_rank_sum = defaultdict(int)
scenario_margins = []

num_scenarios = len(SCENARIOS)

for scenario_name, scenario_data in SCENARIOS.items():

    print("\n==============================")
    print(f"Scenario: {scenario_name}")
    print("==============================")

    profile = scenario_data["profile"]
    query = scenario_data["query"]

    # Use your new recommender directly
    all_results = []

    dense_scores = compute_dense_scores(query)

    for plan in plan_names:

        dense = dense_scores.get(plan, 0)
        financial, fin_break = compute_financial_score(plan)
        disease_raw, dis_break = compute_disease_score(plan, profile)

        disease_scaled = min(max((disease_raw + 10) / 30, 0), 1)

        suitability = 0.4 * financial + 0.6 * disease_scaled
        relevance_multiplier = 0.8 + 0.2 * dense
        final_score = suitability * relevance_multiplier

        all_results.append({
    "plan": plan,
    "final": final_score,
    "dense": dense,
    "financial": financial,
    "disease_raw": disease_raw,
    "disease_scaled": disease_scaled,
    "financial_breakdown": fin_break,
    "disease_breakdown": dis_break
})

    # Sort descending
    all_results.sort(key=lambda x: x["final"], reverse=True)

    # Winner tracking
    winner = all_results[0]["plan"]
    plan_win_counter[winner] += 1

    # Rank tracking
    for rank, result in enumerate(all_results, 1):
        plan_rank_sum[result["plan"]] += rank

    # Print top K
    for rank, r in enumerate(all_results[:TOP_K], 1):

        print(f"\n{rank}. {r['plan']}")
        print("   Final Score:", round(r["final"], 3))
        print("   Dense:", round(r["dense"], 3))
        print("   Financial:", round(r["financial"], 3))
        print("   Disease Raw:", round(r["disease_raw"], 2))
        print("   Disease Scaled:", round(r["disease_scaled"], 3))
        explanation = generate_explanation(r, query, profile)

        print("\n   Explanation:")
        print("   ", explanation)

    # Confidence margin
    margin = all_results[0]["final"] - all_results[1]["final"]
    scenario_margins.append(margin)

    print("\nConfidence Margin:", round(margin, 4))

    if margin < 0.05:
        print("Interpretation: Very close competition")
    elif margin < 0.15:
        print("Interpretation: Moderate separation")
    else:
        print("Interpretation: Strong winner")

# ======================================================
# GLOBAL ANALYTICS
# ======================================================

print("\n====================================")
print("GLOBAL ANALYTICS")
print("====================================")

print("\nPlan Win Frequency:")
for plan, wins in sorted(plan_win_counter.items(), key=lambda x: x[1], reverse=True):
    print(f"{plan}: {wins} wins")

print("\nAverage Rank Per Plan:")

avg_ranks = {
    plan: plan_rank_sum[plan] / num_scenarios
    for plan in plan_names
}

for plan, avg_rank in sorted(avg_ranks.items(), key=lambda x: x[1]):
    print(f"{plan}: Avg Rank = {round(avg_rank, 2)}")

print("\nAverage Confidence Margin:",
      round(np.mean(scenario_margins), 4))

print("Margin Std Dev:",
      round(np.std(scenario_margins), 4))


ADVANCED MULTI-SCENARIO EVALUATION (V3)

Scenario: Cardiac Case

1. select_starter_table_of_cover
   Final Score: 0.603
   Dense: 0.338
   Financial: 0.836
   Disease Raw: 8
   Disease Scaled: 0.6

   Explanation:
    This plan covers inpatient hospital costs for heart disease, including consultants' fees and scans in selected hospitals. However, it does not cover day case treatment, semi-private rooms, or other listed special procedures. For emergency inpatient treatment abroad, it covers up to â‚¬55,000 of the hospital bill and repatriation expenses up to â‚¬1 million. There's no specific mention of outpatient or specialist cardiac procedures. Coverage for psychiatric treatment is limited to 150 days per 5 years, not related to substance abuse. No details are provided for public hospital maternity cover or post-natal home help. The plan includes support for travel expenses for inpatient care over 50km away but not for other travel expenses. It also offers digital doctor services and