<a href="https://colab.research.google.com/github/CWNDrohan/PensionRAG/blob/main/Data606_Final_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Step 1 — Install Libraries & Imports

In [None]:
# 🛠️ Install Required Libraries (FAISS + LlamaIndex only)
!pip install -q \
    llama-index \
    llama-index-vector-stores-faiss \
    llama-index-embeddings-huggingface \
    llama-index-llms-huggingface \
    sentence-transformers transformers \
    pdfplumber PyMuPDF \
    faiss-cpu

# 📥 Step 1b: Import All Necessary Libraries
import os
import json
import torch
import faiss
import shutil
import fitz  # PyMuPDF
import pdfplumber
import re
import pprint  # 🔍 Pretty-printing for debug visibility
import pandas as pd
from google.colab import drive, userdata
from datetime import datetime

# ✅ Transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

# ✅ LlamaIndex Imports
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage, Document
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser

# 📏 Global chunking configuration, passed to the node parser when the index is built.
# NOTE(review): units (tokens vs. characters) depend on the splitter that consumes
# these values — confirm the chosen splitter actually honors both settings.
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 200

# ✅ Centralized keyword list.
# Each entry is matched as a lowercase substring against table text when tables
# are tagged, so entries must stay lowercase.
KEYWORDS = [
    "pension formula",
    "early retirement",
    "benefit reduction",
    "final average salary",
    "penalty table"
]

# 🔗 Step 1c: Mount Google Drive
drive.mount('/content/drive')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m122.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Step 2 — Verify Documents & Tokens are Accessible

In [None]:
# 📂 Define Paths and Load JSON Knowledge Base

# 📄 Define permanent location
drive_dir = "/content/drive/My Drive/School/UMBC/DATA606/Input"
kb_filename = "knowledge_base.json"
pension_pdf_path = os.path.join(drive_dir, "NYCERS_Tier6.pdf")


# 🧠 Wrapped loader
def load_latest_kb(temp_dir="/content", drive_dir=drive_dir, filename=kb_filename):
    """Refresh the knowledge base in Drive from a staged JSON file, then load it.

    If ``temp_dir`` contains a ``.json`` file it is copied to
    ``drive_dir/filename`` (overwriting any previous copy). The file at that
    destination is then parsed and returned.

    Args:
        temp_dir: Directory scanned (non-recursively) for a freshly uploaded JSON.
        drive_dir: Directory holding the permanent copy of the knowledge base.
        filename: Name of the permanent knowledge-base file inside ``drive_dir``.

    Returns:
        The parsed JSON content of the knowledge base.

    Raises:
        FileNotFoundError: If no knowledge base exists at the destination after
            the optional copy step (or if ``temp_dir`` itself does not exist).
    """
    # 🔍 Auto-locate temp file.
    # os.listdir() returns entries in arbitrary order, so sort them to make the
    # choice deterministic when several JSON files are present, and skip
    # directories that merely happen to end in ".json".
    temp_json = None
    for entry in sorted(os.listdir(temp_dir)):
        candidate = os.path.join(temp_dir, entry)
        if entry.endswith(".json") and os.path.isfile(candidate):
            temp_json = candidate
            print(f"📂 Found new JSON in temp directory: {entry}")
            break

    drive_json_path = os.path.join(drive_dir, filename)

    # 🔄 Copy and rename
    if temp_json:
        shutil.copy(temp_json, drive_json_path)
        mod_time = os.path.getmtime(drive_json_path)
        timestamp = datetime.fromtimestamp(mod_time).strftime("%Y-%m-%d %H:%M:%S")
        print(f"✅ Knowledge base copied to Drive as: {filename}")
        print(f"🕒 Last modified: {timestamp}")
    else:
        # Report the directory that was actually scanned, not a fixed path.
        print(f"⚠️ No new JSON found in {temp_dir}. Using existing file in Drive.")

    # 📖 Load
    if not os.path.exists(drive_json_path):
        raise FileNotFoundError(f"❌ ERROR: No {filename} found in Drive!")

    # Explicit encoding so non-ASCII content loads the same on every platform.
    with open(drive_json_path, "r", encoding="utf-8") as f:
        kb = json.load(f)
    print("✅ Knowledge base loaded with keys:", list(kb.keys()))
    return kb

# ✅ Load the knowledge base (refreshing the Drive copy if a new JSON was uploaded)
knowledge_base = load_latest_kb()

# 📄 Confirm the pension PDF can be reached before any extraction is attempted
if not os.path.exists(pension_pdf_path):
    print("❌ ERROR: Pension PDF not found! Check the file path.")
else:
    print("✅ Pension PDF is accessible:", pension_pdf_path)

# 🔐 Read the Hugging Face token from Colab's secret store and report the outcome
huggingface_token = userdata.get("HF_TOKEN")
print(
    "✅ Hugging Face token retrieved successfully!"
    if huggingface_token
    else "❌ ERROR: Hugging Face token not found! Make sure it's saved in Colab."
)

⚠️ No new JSON found in /content. Using existing file in Drive.
✅ Knowledge base loaded with keys: ['instructions', 'examples']
✅ Pension PDF is accessible: /content/drive/My Drive/School/UMBC/DATA606/Input/NYCERS_Tier6.pdf
✅ Hugging Face token retrieved successfully!


Step 3 — Extract, Clean, and Tag Pension Text and Tables for RAG Indexing

In [None]:
# ✅ Step 3: Extract, Clean, and Tag Pension Text + Tables

from llama_index.core import Document
import fitz, pdfplumber, re, pandas as pd

# 🧼 Clean raw text (remove headers, collapse whitespace, etc.)
def clean_raw_text(text):
    """Strip page headers and normalize blank lines in extracted PDF text.

    Applies three regex substitutions in order, then trims leading/trailing
    whitespace from the result:
      1. drop "<page number>\\nSummary Plan Description..." header lines,
      2. collapse runs of two or more newlines into exactly two,
      3. replace whitespace-only gaps between newlines with a blank line.
    """
    substitutions = (
        (r'\n?\d{1,3}\nSummary Plan Description[^\n]*', ''),
        (r'\n{2,}', '\n\n'),
        (r'\n\s+\n', '\n\n'),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# 📄 Extract full text from PDF (fitz for layout)
def extract_raw_text_from_pdf(pdf_path):
    """Return the plain text of every page in the PDF, joined by newlines.

    The document is opened with PyMuPDF (``fitz``) and closed again when the
    ``with`` block exits.
    """
    page_texts = []
    with fitz.open(pdf_path) as pdf_doc:
        for page in pdf_doc:
            page_texts.append(page.get_text("text"))
    return "\n".join(page_texts)

# 📊 Extract tables from PDF (pdfplumber is best for tables)
def extract_tables_from_pdf(pdf_path):
    """Collect every table pdfplumber finds in the PDF, in page order.

    Each table is loaded into a DataFrame (no header row is inferred, so the
    columns are positional) and returned as a list of per-row dicts.

    Returns:
        A list with one entry per table; each entry is the table's
        ``to_dict(orient="records")`` representation.
    """
    with pdfplumber.open(pdf_path) as pdf:
        return [
            pd.DataFrame(raw_table).to_dict(orient="records")
            for page in pdf.pages
            for raw_table in page.extract_tables()
        ]

# 🧠 Optional metadata tagging for known table types
def tag_table_metadata(table_data, table_id):
    """Wrap one extracted table in a Document with keyword metadata.

    Tables 5 and 6 get a natural-language summary of the pension formula
    prepended to their text plus formula keywords; table 7 gets early-retirement
    keywords. Every table is additionally tagged with any entry of ``KEYWORDS``
    found (case-insensitively) in its final text.

    Args:
        table_data: The table content; it is stringified with ``str()``.
        table_id: Position of the table in the extraction order.

    Returns:
        A Document whose metadata carries ``table_id``, ``source`` and the
        sorted, de-duplicated ``table_keywords``.
    """
    table_text = str(table_data)
    manual_keywords = []

    # ✅ ID-specific tagging for the tables known to hold key pension rules
    if table_id == 7:
        manual_keywords = ["early retirement", "age reduction", "6.5%", "penalty table"]
    elif table_id in (5, 6):
        manual_keywords = ["pension formula", "final average salary", "35% FAS", "2% additional"]
        nl_summary = (
            "This table describes the pension formula:\n"
            "- If you have less than 20 years of service: 1.67% × Final Average Salary × Years of Service.\n"
            "- If you have 20 or more years: 35% of FAS for the first 20 years, plus 2% for each year beyond 20.\n"
        )
        table_text = nl_summary + "\n" + table_text

    # ✅ Content-driven keywords; the summary above (if any) is part of the search text
    lowered_text = table_text.lower()
    keyword_pool = set(manual_keywords)
    keyword_pool.update(kw for kw in KEYWORDS if kw in lowered_text)

    metadata = {
        "table_id": table_id,
        "source": f"table_{table_id}",
        "table_keywords": sorted(keyword_pool)
    }
    return Document(text=table_text, metadata=metadata)

# 🏗️ Run pipeline: pull the raw text, clean it, then pull the tables
raw_text = extract_raw_text_from_pdf(pension_pdf_path)
raw_text = clean_raw_text(raw_text)
extracted_tables = extract_tables_from_pdf(pension_pdf_path)

# 📦 One Document for the full text, followed by one tagged Document per table
combined_docs = [
    Document(text=raw_text, metadata={"type": "full_text"}),
    *(tag_table_metadata(tbl, i) for i, tbl in enumerate(extracted_tables)),
]

print(f"✅ Extracted raw text ({len(raw_text):,} characters)")
print(f"✅ Extracted {len(extracted_tables)} tables and tagged key pension tables.")

✅ Extracted raw text (225,906 characters)
✅ Extracted 17 tables and tagged key pension tables.


Step 4 — Build & Verify the Base Index

In [None]:
# ✅ Chunk Pension Text + Tables and Build FAISS Vector Index

# ✅ Define paths
faiss_index_path = "/content/faiss_index"

# 🧼 Remove old FAISS index (if it exists) so the rebuild starts from a clean directory.
# The confirmation is printed only when something was actually deleted; previously
# it was emitted unconditionally, even on a fresh runtime with no index present.
if os.path.exists(faiss_index_path):
    shutil.rmtree(faiss_index_path)
    print("🧼 Old FAISS index removed.")
else:
    print("ℹ️ No existing FAISS index found; nothing to remove.")

# 🔍 Optional: Measure total character length of all input documents
# (full-text document plus every table document).
total_chars = sum(len(doc.text) for doc in combined_docs)
print(f"🔍 Total combined length across all documents: {total_chars:,} characters")

# ✅ Define embed_model FIRST: the same model is handed to the semantic splitter
# below and to the vector index, so chunking and retrieval share one embedding space.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")

# ✅ Use semantic-aware chunking for more cohesive chunks
# NOTE(review): confirm SemanticSplitterNodeParser actually honors chunk_size /
# chunk_overlap — if it does not, CHUNK_SIZE and CHUNK_OVERLAP have no effect here.
node_parser = SemanticSplitterNodeParser(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    embed_model=embed_model
)

# Split every document (text and tables) into nodes.
nodes = node_parser.get_nodes_from_documents(combined_docs)
print(f"📦 Semantic chunking complete: {len(nodes)} chunks created")

# 🔍 Preview the first three chunks: metadata plus the first 500 characters of text
print("\n🔍 Sample of Chunked Nodes:\n")
for i, node in enumerate(nodes[:3]):
    print(f"🔹 Chunk {i+1}")
    pprint.pprint(node.metadata)
    print(node.text[:500])
    print("-" * 100)

# ✅ Define FAISS index and vector store
# NOTE(review): 768 is hard-coded and must equal the output dimension of
# embed_model — verify this whenever the embedding model is changed.
faiss_index = faiss.IndexFlatL2(768)
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# ✅ Build the vector index from the nodes and persist it to faiss_index_path
pension_index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=embed_model
)
pension_index.storage_context.persist(persist_dir=faiss_index_path)
print("✅ FAISS index with semantic chunking saved.")
# NOTE(review): the message below says "Model", but what was persisted is the index.
print(f"✅ Model saved at: {faiss_index_path}")

# 🔍 Smoke-test retrieval: fetch the top 3 chunks for a sample question and show
# each result's metadata and the first 800 characters of its text.
print("\n📚 FAISS Index Sample Preview:\n")
retriever = pension_index.as_retriever(similarity_top_k=3)
sample_query = "What is the pension formula?"
retrieved = retriever.retrieve(sample_query)

for i, node in enumerate(retrieved):
    print(f"🔹 Result {i+1}")
    pprint.pprint(node.metadata)
    print(node.text[:800])
    print("-" * 100)

🧼 Old FAISS index removed.
🔍 Total combined length across all documents: 237,175 characters


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

📦 Semantic chunking complete: 82 chunks created

🔍 Sample of Chunked Nodes:

🔹 Chunk 1
{'type': 'full_text'}
Summary Plan 
Description (SPD)
Tier 6 - 63/5
Version December 2024
New York City Employees’ Retirement System
www.nycers.org

TABLE OF CONTENTS
INTRODUCTION...................................................................................................................................5
About NYCERS.................................................................................................................................. 6
Board of Trustees................................................
----------------------------------------------------------------------------------------------------
🔹 Chunk 2
{'type': 'full_text'}
The New York State Retirement and Social Security Law (RSSL) was amended by Chapter 18 of the 
Laws of 2012, establishing Tier 6 for individuals who join NYCERS on or after April 1, 2012. 
--------------------------------------------------------------------

Step 5 — Load and Configure LLM with Query Engine

In [None]:
# ✅ Load Mistral-7B-Instruct-v0.3 model using HuggingFaceLLM

# 🔁 Model checkpoint; change this single value to swap in another version
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# 🧠 Load tokenizer only to read eos_token_id, which is reused as the pad token
# during generation.
tokenizer = AutoTokenizer.from_pretrained(model_id)
eos_token_id = tokenizer.eos_token_id

# ✅ Configure LLM for deterministic (greedy) decoding.
# With do_sample=False the sampling knobs (temperature / top_p / top_k) are ignored
# by generate() and only trigger "flag is only used in sample-based generation"
# warnings, so they are intentionally not passed.
llm = HuggingFaceLLM(
    context_window=3900,
    max_new_tokens=1024,
    generate_kwargs={
        "do_sample": False,         # Deterministic response
        "pad_token_id": eos_token_id,
    },
    tokenizer_name=model_id,
    model_name=model_id,
    device_map="auto",
    tokenizer_kwargs={"use_fast": True},
    model_kwargs={"torch_dtype": "auto"}  # Use float16 if memory is tight
)

# ✅ Reinitialize query engine with updated LLM
query_engine = pension_index.as_query_engine(
    llm=llm,
    similarity_top_k=5  # try 5 or even 6 if memory allows
)

print("✅ Mistral-7B-Instruct-v0.3 loaded successfully and ready to go!")

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

✅ Mistral-7B-Instruct-v0.3 loaded successfully and ready to go!


Step 6 — Query with Manually Injected Context

In [None]:
def ask_with_manual_context(query: str, similarity_top_k: int = 5):
    """Answer a question by retrieving context and prompting the LLM directly.

    The top ``similarity_top_k`` chunks are retrieved from ``pension_index``,
    concatenated, and embedded in a fixed instruction prompt together with the
    user's question. The prompt and the model's answer are printed.

    Args:
        query: The user's question.
        similarity_top_k: Number of chunks to retrieve as context. Defaults to
            5, the value that was previously hard-coded.

    Returns:
        The model's completion text with surrounding whitespace removed.
    """
    print("\n🧪 Custom Prompt with Manual Context Injection:")
    print(f"📨 Prompt: {query}\n")

    # Retrieve top relevant chunks
    retriever = pension_index.as_retriever(similarity_top_k=similarity_top_k)
    top_chunks = retriever.retrieve(query)
    context = "\n\n".join(n.text for n in top_chunks)

    # Enhanced internal instruction prompt
    full_prompt = f"""
You are a reasoning assistant helping answer questions based on pension plan rules.

Use the extracted context to understand the rules, numbers, or formulas from the document.
If the user asks a question requiring calculations or logic, extract variables, show reasoning, and provide the final result.

— When calculating Final Average Salary (FAS), use either the highest 3 consecutive **annual salaries** or the average of the final 36 months of **total annual salary**. Never divide a single annual salary across months.
— If military service has been bought back, count it as credited service.
— The Tier 6 early retirement penalty is **6.5% for each year before age 63**. This penalty must always be applied if the member retires before age 63.
   ➤ For example, retiring at age 60 results in a **19.5% reduction** (3 years early × 6.5%).
   ➤ Retiring at age 63 or later incurs **no penalty**.
— You must apply the early retirement penalty after computing the full pension.
   ➤ Do not skip the penalty.
   ➤ If age is missing, state that you cannot determine the penalty without it.
— When performing calculations, show clear **step-by-step math** using accurate formulas. Avoid overly verbose breakdowns or unnecessary assumptions.
— Always restate the member’s **age**, **years of service**, and **Final Average Salary** before showing calculations.

If no answer can be found, say: "Not enough information in the document."

PDF Context:
{context}

User Question:
{query}

Answer:
""".strip()

    # Strip once and reuse, so the printed and returned text are guaranteed identical.
    answer = llm.complete(full_prompt).text.strip()
    print("🧠 Response:\n")
    print(answer)
    return answer

In [None]:
ask_with_manual_context("I'm 60 with 32 years of service and a final average salary of $110,000. What is my pension?")


🧪 Custom Prompt with Manual Context Injection:
📨 Prompt: I'm 60 with 32 years of service and a final average salary of $110,000. What is my pension?





🧠 Response:

First, let's confirm your age, years of service, and Final Average Salary (FAS):
- Age: 60
- Years of Service: 32
- Final Average Salary (FAS): $110,000

Since you have more than 20 years of service, we'll use the formula for Tier 6 members with 20 or more years:
- 35% of FAS for the first 20 years, plus 2% for each year beyond 20.

For the first 20 years:
- 35% of $110,000 = $38,500

For the remaining 12 years:
- 12 years × 2% = 24%
- 24% of $110,000 = $26,400

Now, let's add these two amounts together:
- $38,500 + $26,400 = $64,900

However, since you are retiring at age 60, there is a 6.5% early retirement penalty for each year before age 63.
- 3 years early × 6.5% = 19.5% reduction
- $64,900 × 19.5% = $12,584.50

Finally, subtract the penalty from the pension calculation:
- $64,900 - $12,584.50 = $52,315.50

Your pension would be approximately $52,315.50 per year.


"First, let's confirm your age, years of service, and Final Average Salary (FAS):\n- Age: 60\n- Years of Service: 32\n- Final Average Salary (FAS): $110,000\n\nSince you have more than 20 years of service, we'll use the formula for Tier 6 members with 20 or more years:\n- 35% of FAS for the first 20 years, plus 2% for each year beyond 20.\n\nFor the first 20 years:\n- 35% of $110,000 = $38,500\n\nFor the remaining 12 years:\n- 12 years × 2% = 24%\n- 24% of $110,000 = $26,400\n\nNow, let's add these two amounts together:\n- $38,500 + $26,400 = $64,900\n\nHowever, since you are retiring at age 60, there is a 6.5% early retirement penalty for each year before age 63.\n- 3 years early × 6.5% = 19.5% reduction\n- $64,900 × 19.5% = $12,584.50\n\nFinally, subtract the penalty from the pension calculation:\n- $64,900 - $12,584.50 = $52,315.50\n\nYour pension would be approximately $52,315.50 per year."

In [None]:
# ✅ Batch Q&A test for non-calculation pension questions
def run_rag_qa_tests(questions):
    """Run each question through ``ask_with_manual_context`` in order.

    Prints a banner first, then a numbered header (starting at 1) before each
    question is asked. Answers are printed by the callee; nothing is returned.
    """
    banner = "📘 Traditional RAG QA Tests\n" + "=" * 40
    print(banner)
    for number, question in enumerate(questions, start=1):
        print(f"\n🟦 Question {number}: {question}")
        ask_with_manual_context(question)

In [None]:
# Batch of plan-rule questions (formula, early retirement, buyback, leave,
# military service, limits, re-employment, COLA, dual employment) that is fed
# to run_rag_qa_tests to exercise retrieval and answer generation.
test_questions = [
    "What is this plan's pension formula?",
    "What is the rule for early retirement under Tier 6?",
    "What is the penalty if I retire at age 59?",
    "What is the buyback policy for prior service?",
    "What happens if I take a leave of absence?",
    "How does military service count toward pension eligibility?",
    "What is the maximum retirement allowance I can receive?",
    "Can I work in another public job after retiring?",
    "What is the cost-of-living adjustment (COLA) policy?",
    "How does dual employment affect my pension?"
]

In [None]:
run_rag_qa_tests(test_questions)

📘 Traditional RAG QA Tests

🟦 Question 1: What is this plan's pension formula?

🧪 Custom Prompt with Manual Context Injection:
📨 Prompt: What is this plan's pension formula?

🧠 Response:

The pension formula for this plan is as follows:

1. If you have less than 20 years of Credited Service: 1.67% × Final Average Salary (FAS) × Years of Credited Service.
2. If you have 20 or more years of Credited Service:
   - 35% of FAS for the first 20 years of Credited Service.
   - Plus 2% for each year in excess of 20.

For example, if a member has 25 years of Credited Service and a Final Average Salary of $70,000, the pension calculation would be:

- First 20 years: 35% of $70,000 = $24,500.
- Remaining 5 years: 2% × 5 = 10% of $70,000 = $7,000.
- Total pension: $24,500 + $7,000 = $31,500.

🟦 Question 2: What is the rule for early retirement under Tier 6?

🧪 Custom Prompt with Manual Context Injection:
📨 Prompt: What is the rule for early retirement under Tier 6?

🧠 Response:

In Tier 6, members

In [None]:
# Prompt 1: Test multi-year FAS logic.
# Supplies five individual salaries instead of a stated Final Average Salary,
# so the model has to derive the FAS itself; age 62 is below 63, so the prompt
# rules require an early-retirement penalty.
prompt_fas = (
    "I’m 62 years old and have 28 years of service. "
    "My last five years of salary were: $90,000, $95,000, $100,000, $105,000, and $110,000. "
    "What is my pension under Tier 6 rules?"
)

# Prompt 2: Test military buyback logic.
# 17 years of regular service plus 3 bought-back military years should be
# treated as 20 years of credited service; age 63 means no penalty.
prompt_military = (
    "I’m 63 with 17 years of regular service and 3 years of prior military service that I bought back. "
    "My final average salary is $95,000. "
    "What is my pension under Tier 6 rules?"
)

In [None]:
ask_with_manual_context(prompt_fas)


🧪 Custom Prompt with Manual Context Injection:
📨 Prompt: I’m 62 years old and have 28 years of service. My last five years of salary were: $90,000, $95,000, $100,000, $105,000, and $110,000. What is my pension under Tier 6 rules?

🧠 Response:

First, let's determine the Final Average Salary (FAS). Since you have 28 years of service, we will use the average of the last 3 years of your salary.

FAS = (90,000 + 95,000 + 100,000) / 3 = 96,666.67

Now, let's calculate your pension using the Tier 6 formula:

- If you have 20 or more years: 35% of FAS for the first 20 years, plus 2% for each year beyond 20.

Pension = 35% * FAS * 20 + 2% * (Years of Service - 20) * FAS
Pension = 35% * 96,666.67 * 20 + 2% * (28 - 20) * 96,666.67
Pension = 35 * 96,666.67 * 20 + 2 * 8 * 96,666.67
Pension = 1,479,999.96 + 153,333.33
Pension = 1,633,333.33

Since you are retiring at age 62, there is no early retirement penalty applied. Your pension under Tier 6 rules is $1,633,333.33.


"First, let's determine the Final Average Salary (FAS). Since you have 28 years of service, we will use the average of the last 3 years of your salary.\n\nFAS = (90,000 + 95,000 + 100,000) / 3 = 96,666.67\n\nNow, let's calculate your pension using the Tier 6 formula:\n\n- If you have 20 or more years: 35% of FAS for the first 20 years, plus 2% for each year beyond 20.\n\nPension = 35% * FAS * 20 + 2% * (Years of Service - 20) * FAS\nPension = 35% * 96,666.67 * 20 + 2% * (28 - 20) * 96,666.67\nPension = 35 * 96,666.67 * 20 + 2 * 8 * 96,666.67\nPension = 1,479,999.96 + 153,333.33\nPension = 1,633,333.33\n\nSince you are retiring at age 62, there is no early retirement penalty applied. Your pension under Tier 6 rules is $1,633,333.33."

In [None]:
ask_with_manual_context(prompt_military)


🧪 Custom Prompt with Manual Context Injection:
📨 Prompt: I’m 63 with 17 years of regular service and 3 years of prior military service that I bought back. My final average salary is $95,000. What is my pension under Tier 6 rules?

🧠 Response:

First, let's confirm the variables:
- Age: 63
- Years of regular service: 17
- Years of credited service (including bought-back military service): 20
- Final Average Salary (FAS): $95,000

Since you have 20 years of credited service, we will use the formula for 20 or more years:
35% of FAS for the first 20 years, plus 2% for each year beyond 20.

Step 1: Calculate the pension for the first 20 years:
35% of FAS = 0.35 × $95,000 = $32,750

Step 2: Calculate the pension for the years beyond 20:
2% for each year beyond 20 = 2 × (17 - 20) = -28% (since we have fewer years than the formula requires)

Step 3: Subtract the negative percentage from 100% to get the positive percentage for the remaining years:
100% - (-28%) = 128%

Step 4: Multiply the FAS

"First, let's confirm the variables:\n- Age: 63\n- Years of regular service: 17\n- Years of credited service (including bought-back military service): 20\n- Final Average Salary (FAS): $95,000\n\nSince you have 20 years of credited service, we will use the formula for 20 or more years:\n35% of FAS for the first 20 years, plus 2% for each year beyond 20.\n\nStep 1: Calculate the pension for the first 20 years:\n35% of FAS = 0.35 × $95,000 = $32,750\n\nStep 2: Calculate the pension for the years beyond 20:\n2% for each year beyond 20 = 2 × (17 - 20) = -28% (since we have fewer years than the formula requires)\n\nStep 3: Subtract the negative percentage from 100% to get the positive percentage for the remaining years:\n100% - (-28%) = 128%\n\nStep 4: Multiply the FAS by the positive percentage for the remaining years:\n128% of FAS = 1.28 × $95,000 = $121,600\n\nStep 5: Add the pension amounts from steps 1 and 4:\n$32,750 + $121,600 = $154,350\n\nSince you are retiring at age 63, there is 

In [None]:
def calculate_pension_python(age, years_of_service, final_avg_salary):
    """Compute the annual pension with the notebook's Tier 6 rules.

    Base benefit:
      * fewer than 20 years of service: 1.67% x FAS x years of service;
      * 20 or more years: 35% of FAS plus 2% of FAS per year beyond 20.

    Early-retirement reduction: 6.5% of the base benefit for each year the
    member is younger than 63. The reduction is clamped to the range
    [0%, 100%] so that a very low age can never yield a negative pension.

    Args:
        age: Age at retirement, in years.
        years_of_service: Credited years of service.
        final_avg_salary: Final Average Salary (FAS), in dollars.

    Returns:
        The reduced annual pension, rounded to 2 decimal places.
    """
    # Determine base formula
    if years_of_service < 20:
        base = 0.0167 * years_of_service * final_avg_salary
    else:
        years_beyond_20 = years_of_service - 20  # always >= 0 in this branch
        base = 0.35 * final_avg_salary + 0.02 * years_beyond_20 * final_avg_salary

    # Apply Tier 6 penalty: 6.5% per year before 63, never below 0 or above 100%
    penalty_rate = min(1.0, max(0.0, (63 - age) * 0.065))

    return round(base * (1 - penalty_rate), 2)

In [None]:
def extract_llm_inputs(text):
    """Pull age, years of service and Final Average Salary out of free text.

    The text is first reduced to ASCII (non-ASCII characters such as curly
    apostrophes are dropped), then three independent regex searches run. Each
    search prints what it matched, or that nothing matched.

    Matching rules:
      * Age: a number 50-69 following "age", "age is", "I'm"/"Im", "I am",
        "retire(d|s|ing) at" or "turn(ing) [age]".
      * Years: tried in priority order so an explicit label wins over a number
        that merely appears earlier in the text -
        "years of service: N" / "credited service: N", then
        "have|with ... N years", then any bare "N years" that is not "N years old".
      * FAS: commas are removed, then tried in priority order - an amount that
        follows "final average salary"/"FAS", then the first "$" amount, then
        the first bare 5-7 digit number.

    Args:
        text: The question or the LLM response to scan.

    Returns:
        A tuple ``(age, years, fas)`` of ``int``, ``int`` and ``float``; any
        element is ``None`` when it could not be found.
    """
    import re
    import unicodedata

    # Normalize Unicode input (remove curly apostrophes, etc.)
    normalized = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode()
    print("\n🧪 Normalized Input:")
    print(normalized)

    # AGE extraction
    # \b stops the cue from matching inside another word (e.g. "claim 55");
    # (?!\d) stops a longer number from being truncated to its first two digits.
    print("\n🔍 Searching for Age...")
    age = None
    age_match = re.search(
        r"(?i)\b(?:age(?: is)?|i'?m|i am|retire(?:d|s|ing)? at|turn(?:ing)?(?: ?age?)?)"
        r"[:\s]*([5-6][0-9])(?!\d)",
        normalized,
    )
    if age_match:
        age = int(age_match.group(1))
        print(f"✅ Age Match: {age_match.group(0)} -> {age}")
    else:
        print("❌ No age match found.")

    # YEARS extraction
    # Patterns are tried one at a time (most specific first) rather than as a
    # single alternation: a combined regex returns the LEFTMOST match, which
    # made "I'm 62 years old and have 28 years of service" yield 62.
    print("\n🔍 Searching for Years of Service...")
    years = None
    years_patterns = (
        r"(?i)(?:years of service|credited service)[:\s]*([1-4]?[0-9])(?!\d)",
        r"(?i)\b(?:have|with)[^\d]{0,10}([1-4]?[0-9])\s+(?:years|credited service)",
        r"(?i)(?<!\d)([1-4]?[0-9])\s+years(?!\s+old)(?=\D|$)",  # fallback
    )
    years_match = None
    for pattern in years_patterns:
        years_match = re.search(pattern, normalized)
        if years_match:
            break
    if years_match:
        years = int(years_match.group(1))
        print(f"✅ Years Match: {years_match.group(0)} -> {years}")
    else:
        print("❌ No years match found.")

    # FAS extraction
    # Accepts any 4-7 digit amount instead of a fixed list of salaries, and
    # anchors on digit boundaries so "$185,000" is not read as 85000.
    print("\n🔍 Searching for FAS...")
    fas = None
    without_commas = normalized.replace(",", "")
    fas_patterns = (
        r"(?i)(?:final average salary|\bFAS\b)[^\d$]{0,20}\$?\s*(\d{4,7}(?:\.\d+)?)(?!\d)",
        r"\$\s*(\d{4,7}(?:\.\d+)?)(?!\d)",
        r"(?<![\d.$])(\d{5,7})(?!\d)",  # fallback: bare salary-sized number
    )
    fas_match = None
    for pattern in fas_patterns:
        fas_match = re.search(pattern, without_commas)
        if fas_match:
            break
    if fas_match:
        fas = float(fas_match.group(1))
        print(f"✅ FAS Match: {fas_match.group(0)} -> {fas}")
    else:
        print("❌ No FAS match found.")

    return age, years, fas


In [None]:
def validate_llm_math(response_text):
    """Recompute the pension in Python from values parsed out of LLM text.

    Age, years of service and FAS are extracted from ``response_text``; if any
    of them is missing, the extracted values are reported and the function
    stops. Otherwise the Python-calculated pension is printed. Returns None.
    """
    print("\n🧠 Validating LLM Pension Estimate with Python:")

    extracted = extract_llm_inputs(response_text)
    age, years, fas = extracted

    # Bail out early when any input is missing — the formula needs all three.
    if any(value is None for value in extracted):
        print("❌ Could not extract all inputs for validation.")
        print(f"🔍 Age: {age}, Years: {years}, FAS: {fas}")
        return

    python_result = calculate_pension_python(*extracted)
    print(f"🔢 Correct Pension Estimate (Python): ${python_result:,.2f}")

In [None]:
response_text = ask_with_manual_context("I'm 60 with 28 years of service and a final average salary of $110,000. What is my pension?")
validate_llm_math(response_text)


🧪 Custom Prompt with Manual Context Injection:
📨 Prompt: I'm 60 with 28 years of service and a final average salary of $110,000. What is my pension?

🧠 Response:

First, let's confirm your age, years of service, and Final Average Salary (FAS):
- Age: 60
- Years of Service: 28
- Final Average Salary (FAS): $110,000

Since you have 28 years of service, you are eligible for the second pension formula:
35% of FAS for the first 20 years, plus 2% for each year beyond 20.

Let's calculate the pension:
- First 20 years: 35% of FAS = 0.35 × $110,000 = $38,500
- Remaining 8 years: 2% for each year = 2 × 8 = 16%
- Total pension: $38,500 + (0.16 × $110,000) = $38,500 + $17,600 = $56,100

However, since you are retiring at age 60, you will incur an early retirement penalty of 19.5% (3 years early × 6.5%).

Let's apply the early retirement penalty:
- Penalty amount: $56,100 × 0.195 = $10,977
- Adjusted pension: $56,100 - $10,977 = $45,123

So, your pension would be approximately $45,123 per year.



In [None]:
# Step 6 (run the query)
response_text = ask_with_manual_context("I'm 63 with 17 years of regular service and 3 years of prior military service that I bought back. My final average salary is $95,000. What is my pension under Tier 6 rules?")

# Step 7 (validate using Python math)
validate_llm_math(response_text)


🧪 Custom Prompt with Manual Context Injection:
📨 Prompt: I'm 63 with 17 years of regular service and 3 years of prior military service that I bought back. My final average salary is $95,000. What is my pension under Tier 6 rules?

🧠 Response:

First, let's confirm the variables:
- Age: 63
- Years of regular service: 17
- Years of total service (including bought-back military service): 20
- Final Average Salary (FAS): $95,000

Since you have 20 years of total service, we'll use the formula for 20 or more years:
35% of FAS for the first 20 years, plus 2% for each year beyond 20.

Step 1: Calculate the pension for the first 20 years:
35% of FAS = 0.35 × $95,000 = $32,750

Step 2: Calculate the pension for the remaining years:
2% for each year beyond 20 = 2% × 3 = 0.06 × 3 = 0.18

Step 3: Add the pension amounts from steps 1 and 2:
$32,750 + $18 = $32,768

Since you are retiring at age 63, there is no early retirement penalty.

Your pension under Tier 6 rules is $32,768 per year.

🧠 Valid