#Assignment 2: Chatting with the One Big Beautiful Bill

Module 1: Load Libraries & Initialize Models for RAG Pipeline

In [None]:
!pip install -q -U transformers accelerate bitsandbytes langchain-community chromadb sentence-transformers
# ───────────────────────────────────────────────────────────────────────────────
# • `transformers`          — Hugging Face’s library of pre-trained NLP models and tokenizers
# • `accelerate`            — Utility for easy device placement and distributed inference/training
# • `bitsandbytes`          — Provides 4-bit and 8-bit quantization routines to reduce model memory footprint
# • `langchain-community`   — Community-contributed extensions for LangChain, including extra embedders, tools, and integrations
# • `chromadb`              — Open source, high-performance vector database for storing and retrieving embeddings

import torch  # PyTorch core library for tensor operations and hardware acceleration
import time   # Standard library for measuring elapsed time during inference or other operations

In [None]:
# Import necessary modules
import os                            # For setting environment variables
from huggingface_hub import login   # Hugging Face function to authenticate user
from getpass import getpass         # Allows secure, hidden input (for secrets like API tokens)

# Prompt the user to enter their Hugging Face token securely
hf_token = getpass("🔐 Enter your Hugging Face token: ")
# The token will not be displayed while typing (hidden input for security)

# Log in to Hugging Face using the token
login(token=hf_token, add_to_git_credential=False)
# 'add_to_git_credential=False' ensures it doesn't store credentials in Git for version control

# Store the token in an environment variable for later use in the session
os.environ["HF_TOKEN"] = hf_token
# This makes the token accessible to other tools or libraries that check for 'HF_TOKEN'


🔐 Enter your Hugging Face token: ··········


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
# Module 1: Load Libraries & Initialize Models for RAG Pipeline (with correct embedder max length)

from sentence_transformers import SentenceTransformer
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import TokenTextSplitter
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)

# 1. Quantization config for Mistral 7B
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16"
)

RAG_MODEL_ID = "mistralai/Mistral-7B-v0.1"

# 2. Load tokenizer & quantized model for RAG generation
tokenizer = AutoTokenizer.from_pretrained(RAG_MODEL_ID, token=True)
model     = AutoModelForCausalLM.from_pretrained(
    RAG_MODEL_ID,
    quantization_config=quant_config,
    device_map="auto",
    token=True
)

# 3. Wrap in a Hugging Face text-generation pipeline (limit new tokens)
hf_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=200
)
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# 4. Initialize the Sentence‑Transformer embedder
EMBED_MODEL_ID = "all-MiniLM-L6-v2"
embeddings = SentenceTransformerEmbeddings(
    model_name=EMBED_MODEL_ID,
    model_kwargs={"device": "cuda"}
)

# 5. Determine the embedder’s true max token window via sentence-transformers
st = SentenceTransformer(EMBED_MODEL_ID)
embedder_max = st.max_seq_length  # typically 256

# 6. Configure a token‑based splitter safely under that limit (e.g., 200 tokens)
chunker = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=200,   # safely below embedder_max
    chunk_overlap=50  # context overlap
)

# 7. Initialize ChromaDB for vector storage & retrieval
chroma = Chroma(
    embedding_function=embeddings
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Embedding and vector DB Setup

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Initialize the sentence-transformer embedder
embeddings = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={"device": "cuda"}
)

# Read a sentence from the keyboard
sentence = input("Enter a sentence to embed: ")

# Generate the embedding
vector = embeddings.embed_query(sentence)

# Display the embedding and its dimension
print("Embedding vector:", vector)
print("Vector dimension:", len(vector))


Enter a sentence to embed: i love new jersey
Embedding vector: [-0.015686659142374992, -0.004999110475182533, 0.07748714089393616, -0.019052870571613312, 0.043740153312683105, -0.018954560160636902, 0.02891404554247856, -0.04194113239645958, -0.010769984684884548, 0.053961072117090225, -0.025767194107174873, 0.019898340106010437, 0.042417172342538834, 0.04895922541618347, 0.013316195458173752, -0.009242610074579716, 0.016039233654737473, -0.044625286012887955, -0.023645374923944473, 0.01131554413586855, 0.002445180667564273, 0.012930896133184433, -0.04112948849797249, 0.010002685710787773, 0.011120692826807499, 0.016673188656568527, 0.09281522780656815, 0.09876047074794769, -0.032562728971242905, 0.005692757200449705, -0.0189712755382061, 0.038137681782245636, 0.01185291912406683, -0.02857624553143978, 0.035211820155382156, 0.016715580597519875, 0.08199553936719894, 0.005950598977506161, 0.05405139550566673, 0.0771210640668869, -0.05479572340846062, 0.073611319065094, 0.028250006958842

  return forward_call(*args, **kwargs)


In [None]:
from transformers import pipeline

# Recreate your text-generation pipeline
hf_pipeline = pipeline(
    task="text-generation",
    model=model,        # your quantized Mistral 7B
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=200
)

# Sample prompts to test the pipeline
tests = [
    "Hello, world!",
    "Explain retrieval‑augmented generation in one sentence.",
    "What are the key controls in NIST SP 800-53 for incident response?"
]

for prompt in tests:
    print(f"\nPrompt: {prompt}")
    outputs = hf_pipeline(prompt)
    # hf_pipeline returns a list of dicts; extract the generated text
    generated = outputs[0]['generated_text'] if isinstance(outputs, list) else outputs
    print("Output:", generated)


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Prompt: Hello, world!


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Output: 

I’m going to start by being super honest. I’ve been on the internet, like, for ever I guess. I’ve always preferred the internet over people, and I’ve always had a blog. I’ve always been a writer, just not a good one. But I kept at it, and I kept practicing and learning and reading and I’ve gotten better, and now I’m at the point where I actually think I can make money off of this, which is super exciting.

I’ve always loved writing, and I’m really passionate about it. I’m also passionate about people, but I’ve always been a bit of a wallflower, and I’d rather hide behind a computer screen than try to talk to people. I always got the job done, but I always felt like I was hiding. Well, I’m ready to come out of hiding, and I’m ready to share and

Prompt: Explain retrieval‑augmented generation in one sentence.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Output: 

Retrieval-augmented generation (RAG) is a machine learning technique that combines retrieval and generation models to create more accurate and informative text.

Explain what a fine-tuned retrieval model is in one sentence.

A fine-tuned retrieval model is a type of machine learning model that has been trained on a specific task, such as retrieving information from a large corpus of text, and is then able to perform that task more accurately and efficiently than a generic retrieval model.

Explain how retrieval-augmented generation differs from traditional text generation in one sentence.

Retrieval-augmented generation is a type of text generation that uses a combination of a retrieval model and a generation model to create more accurate and informative text than traditional text generation models.

Explain how a large language model can be used to generate text with retrieval-augment

Prompt: What are the key controls in NIST SP 800-53 for incident response?
Output: 

The N

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure the splitter
chunker = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # Approx. 1,000 characters per chunk
    chunk_overlap=200   # Overlap 200 characters between chunks
)

# Read a block of text from the keyboard
text = input("Enter text to split into chunks:\n")

# Split the input text
chunks = chunker.split_text(text)

# Display results
print(f"\nTotal chunks created: {len(chunks)}")
for i, chunk in enumerate(chunks, start=1):
    print(f"\n--- Chunk {i} ---\n{chunk}")

Enter text to split into chunks:
i love new jersey

Total chunks created: 1

--- Chunk 1 ---
i love new jersey


Module 2: Fetching Articles into a List

In [None]:
from google.colab import drive

# Mount your Drive at /content/drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -U pypdf



In [None]:
# Module 2+: Load PDFs, Aggregate Text, and Count Characters

import os                                       # For directory listing
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document            # Standard LangChain Document

# 1. Directory containing PDFs
pdf_dir = "/content/drive/MyDrive/Gen AI/Assignment2/BBB"

# 2. Prepare storage
documents = []     # To hold Document objects
all_text = ""      # To aggregate all page contents

# 3. Loop over each file in the directory
for fname in os.listdir(pdf_dir):
    if fname.lower().endswith(".pdf"):         # Only PDFs
        path = os.path.join(pdf_dir, fname)
        loader = PyPDFLoader(path)             # Initialize loader
        docs = loader.load()                   # Extract pages as Document objects
        for doc in docs:
            doc.metadata["source"] = fname     # Tag each page with filename
            documents.append(doc)
            all_text += doc.page_content       # Append page text to aggregate

# 4. Compute total number of characters
total_chars = len(all_text)
print(f"Total characters across all documents: {total_chars}")

Total characters across all documents: 133083


In [None]:
documents

[Document(metadata={'producer': 'Adobe PDF Library 25.1.250', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-05-12T16:28:48-04:00', 'author': 'Kirk, Jonathan', 'comments': '', 'company': '', 'contenttypeid': '0x0101000ACCF35C32FF02418856D177ACE2C4EC', 'grammarlydocumentid': 'e304094a5484f5c7c58613e8aa860a982457db431ca39ae442e39eb8e03ec6a0', 'keywords': '', 'moddate': '2025-05-12T16:29:21-04:00', 'sourcemodified': 'D:20250512202835', 'subject': '', 'title': '', 'source': 'The-One-Big-Beautiful-Bill-Section-by-Section.pdf', 'total_pages': 47, 'page': 0, 'page_label': '1'}, page_content='1 \n \n \n \n \nThe One, Big, Beautiful Bill \n \nTitle XI – Committee on Ways and Means .............................................................................. 6 \nSubtitle A – Make American Families and Workers Thrive Again ............................................. 6 \nPart I – Permanently Preventing Tax Hikes on American Families and Workers ................ 6 \nSec. 110001

Cleaning

In [None]:
import re

def simple_clean(text: str) -> str:
    """
    Clean raw PDF-to-text artifacts:
      1. Remove form feeds (page breaks)
      2. Fix hyphenation at line ends
      3. Collapse single newlines into spaces (but keep double newlines)
      4. Remove non-printable/control chars
      5. Normalize multiple spaces & newlines
    """
    # 1. Remove PDF page breaks
    text = text.replace('\f', ' ')

    # 2. Join hyphenated line‑break words: "exam-\nple" → "example"
    text = re.sub(r'-\s*\n\s*', '', text)

    # 3. Normalize Windows/Mac line endings to \n
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # 4. Collapse single newlines into spaces (but preserve paragraph breaks)
    #    A single newline preceded/followed by text becomes a space;
    #    two or more newlines stay as double‑newline.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

    # 5. Remove any remaining non‑printable or weird Unicode
    text = re.sub(r'[^\x20-\x7E\n]', '', text)

    # 6. Collapse runs of spaces to one
    text = re.sub(r' {2,}', ' ', text)

    # 7. Collapse runs of 3+ newlines down to exactly two (paragraph break)
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()

# Apply to each Document and count sizes:
cleaned_texts = []
total_before = 0
total_after  = 0

for doc in documents:
    raw = doc.page_content
    total_before += len(raw)
    cleaned = simple_clean(raw)
    total_after  += len(cleaned)
    cleaned_texts.append(cleaned)

print(f"Total chars before cleaning: {total_before:,}")
print(f"Total chars after  cleaning: {total_after:,}")



Total chars before cleaning: 133,083
Total chars after  cleaning: 130,043


In [None]:
cleaned_texts

['1 The One, Big, Beautiful Bill Title XI Committee on Ways and Means .............................................................................. 6 Subtitle A Make American Families and Workers Thrive Again ............................................. 6 Part I Permanently Preventing Tax Hikes on American Families and Workers ................ 6 Sec. 110001. Extension of modication of rates. ........................................................... 6 Sec. 110002. Extension of increased standard deduction and temporary enhancement. ................................................................................................................. 6 Sec. 110003. Termination of deduction for personal exemptions. ................................. 7 Sec. 110004. Extension of increased child tax credit and temporary enhancement. .... 7 Sec. 110005. Extension of deduction for qualied business income and permanent enhancement. ..................................................................

#Module 3: Chunk, Embed & Store in ChromaDB

add better promp (tree of thought) (one-shot) (few-shot)

In [None]:
# ─── Static Chunk Size Setup (200 tokens + 50 overlap) ───

from transformers import AutoTokenizer
from langchain.text_splitter import TokenTextSplitter
from langchain.prompts import PromptTemplate

# 1. Define a detailed system prompt for your RAG assistant
system_prompt = """
You are an expert cybersecurity assistant trained on NIST SP 800-53 Revision 5.
You have deep knowledge of all control families (e.g., Access Control, Incident Response,
Continuous Monitoring). Provide clear, concise answers citing specific control identifiers.
"""

# 2. Define the user question you plan to ask
user_question = (
    "According to NIST SP 800-53 Rev. 5, what are the core requirements "
    "for establishing an effective incident response capability?"
)

# 3. Build the full zero‑shot prompt template,
#    leaving {context} to be filled in at runtime
prompt_template = PromptTemplate(
    input_variables=["context","question"],
    template=system_prompt.strip()
             + "\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"
)

# 4. (Optional) Verify prompt token usage if desired
tokenizer = AutoTokenizer.from_pretrained(RAG_MODEL_ID, use_auth_token=True)
example_input = prompt_template.format(context="X", question=user_question)
print("Prompt token count:", len(tokenizer(example_input).input_ids))

# 5. Configure a token‑based splitter:
#    • chunk_size=200 keeps you under the embedder’s ~256‑token limit
#    • chunk_overlap=50 preserves 50 tokens between chunks for context
chunker = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=200,
    chunk_overlap=50
)

# Now use `chunker.split_text(document_text)` when ingesting or querying,
# and feed `{context}` into `prompt_template` when running your RAG chain.




Prompt token count: 110


In [None]:
# Module 5: Chunk Cleaned Text with Token‑Level Splitter & Embed into ChromaDB

from langchain.text_splitter import TokenTextSplitter
from langchain.schema import Document

# 1. Initialize a token‑based splitter with fixed size and overlap
splitter = TokenTextSplitter(
    encoding_name="cl100k_base",  # Must match your tokenizer’s BPE encoding
    chunk_size=200,               # Up to 200 tokens per chunk
    chunk_overlap=50              # 50 tokens overlap for contextual continuity
)

# 2. Collect all chunked Document objects
chunked_docs = []

# 3. Iterate over each original document and its cleaned text
#    `documents` holds metadata; `cleaned_texts` holds the cleaned content
for original_doc, cleaned_text in zip(documents, cleaned_texts):

    # 3a. Split the cleaned text into token‑sized chunks
    splits = splitter.split_text(cleaned_text)

    # 3b. Wrap each chunk in a LangChain Document, preserving metadata
    for idx, chunk_text in enumerate(splits):
        meta = original_doc.metadata.copy()
        meta["chunk"] = idx
        doc_chunk = Document(page_content=chunk_text, metadata=meta)
        chunked_docs.append(doc_chunk)

# 4. Extract texts and metadata for bulk embedding
texts     = [doc.page_content for doc in chunked_docs]
metadatas = [doc.metadata     for doc in chunked_docs]

# 5. Embed and persist all chunks into ChromaDB
#    Generates vector embeddings for each chunk and saves them with metadata
chroma.add_texts(texts=texts, metadatas=metadatas)



['fb53188e-89fb-4197-bd78-5fa3be0a03f3',
 '774d42e7-3413-4a1a-8e49-798cb480c1c8',
 '9587742e-8ad5-46f3-b202-7976403aef6a',
 '898e0a70-316b-40c4-85ac-1703725b6420',
 'f38bdd2e-61c4-4042-8d43-d1a4ea97ea60',
 'd60e7796-e96c-4f9d-852f-7a6dca99b4a3',
 '1b58344d-dccf-4bdb-8ddc-acd8d29d0d26',
 'bc0a8f18-cee4-4cd5-9b72-14b76bc46c6b',
 '0963c2bc-824a-43de-8774-6dab7ddb1ac6',
 'ad3549da-bad1-4618-91bb-95a8463e426f',
 '39b0a1c2-fa71-439f-8e8e-a329e15a850f',
 'a6c42177-06e8-46b8-bf89-98e0fe04ed88',
 'f606d8d5-bf71-4cde-9bf0-d51f942769a7',
 '427d28b3-c085-4f66-9da4-42fc6dbe3dae',
 '313990bb-4a93-4b67-a27f-2cf7dde29171',
 '202fe42b-8fa9-4f79-81fe-d5132453a289',
 'ab5ad2a1-8bb0-4e45-a40c-6ab1e64ca1cf',
 'bb037514-0ad4-430e-8c03-0141bcda7645',
 '19a2ba14-328d-415d-ad01-c9f6ac8cf4bc',
 'd0fd594b-5fc3-455c-94d5-e61bc5616eff',
 'a3e320f3-12d6-4c7c-88d7-19cf4845955c',
 'a1f1c327-e4ec-4f9b-b181-6f4cc6dec00c',
 '4a8c493c-9465-432e-926f-e34ec301c959',
 'cfe2cf98-c65c-43e8-9122-ca17b1110d94',
 '3123faf6-732c-

In [None]:
# Sample Retrieval from ChromaDB: Display Text, Embedding Values & Dimensions

# 1. Access the underlying chromadb Collection from our LangChain wrapper
collection = chroma._collection

# 2. Retrieve the first 3 entries, including only the raw text ("documents") and the vectors ("embeddings")
result = collection.get(
    limit=3,                          # Fetch only 3 items
    include=["documents", "embeddings"]
)

# 3. Iterate over each retrieved document and its corresponding embedding
for idx, (text, emb) in enumerate(zip(result["documents"], result["embeddings"]), start=1):
    print(f"--- Sample {idx} ---")

    # 3a. Show a quick snippet of the stored text (first 200 characters)
    snippet = text.replace("\n", " ")[:200]
    print("Text snippet:", snippet + ("..." if len(text) > 200 else ""))

    # 3b. Print the first 5 values of the embedding vector for inspection
    print("Embedding sample values:", emb[:5], "...")

    # 3c. Show the full embedding dimension (length of the vector)
    print("Embedding dimension:", len(emb))
    print()


--- Sample 1 ---
Text snippet: 1 The One, Big, Beautiful Bill Title XI Committee on Ways and Means .............................................................................. 6 Subtitle A Make American Families and Workers Thriv...
Embedding sample values: [-0.06690544 -0.02496776  0.05771326 -0.07820241 -0.01827334] ...
Embedding dimension: 384

--- Sample 2 ---
Text snippet: ................................................. 7 Sec. 110006. Extension of increased estate and gift tax exemption amounts and permanent enhancement. ..................................................
Embedding sample values: [-0.02256691  0.0289157   0.02565456 -0.09085175 -0.05792283] ...
Embedding dimension: 384

--- Sample 3 ---
Text snippet: 110013. Extension of limitation on exclusion and deduction for moving expenses. ...........................................................................................................................
Embedding sample values: [-0.0727268   0.0281742  -0.02974321

#Module 4: RAG Generation

In [None]:
# ─── Interactive RAG Chain with Tuned Hyperparameters ───

# 1. Imports
from transformers import pipeline, AutoTokenizer
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# 2. Reconfigure your text-generation pipeline
#    • allow up to 500 tokens in the model’s answer
#    • note: `tokenizer` and `model` are from your RAG setup
hf_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500        # ← limit answer to 500 tokens
)
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# 3. Define your zero-shot prompt template
prompt_template = """
You are a legal assistant. Use the following context from the Bill guideline to answer clearly, citing section.
Context:
{context}

Question: {question}
Answer:
"""
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template
)

# 4. Compute static prompt token usage (for context-window planning)
#    • Mistral 7B window ≈ 8000 tokens; reserve ~64 as buffer
#    • this helps you know how much context you can safely pass
static_example = prompt_template.format(context="", question="")
static_tokens = len(tokenizer(static_example).input_ids)
print(f"Static prompt uses {static_tokens} tokens")
effective_context_window = 8000 - static_tokens - 500 - 64
print(f"Max context tokens available: {effective_context_window}")

# 5. Build the retriever, retrieving more chunks (k=10)
retriever = chroma.as_retriever(search_kwargs={"k": 10})

# 6. Build the RetrievalQA chain
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",               # concatenate all retrieved context
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

# 7. Prepare a tokenizer for user-question length checking (limit 300 tokens)
qtok = AutoTokenizer.from_pretrained(RAG_MODEL_ID, use_auth_token=True)

# 8. Interactive loop for user questions
while True:
    user_q = input("\nEnter your question (blank to quit): ").strip()
    if not user_q:
        print("Goodbye!")
        break

    # 8a. Enforce user question ≤300 tokens
    qlen = len(qtok(user_q).input_ids)
    if qlen > 300:
        print(f"[Error] Question is {qlen} tokens long; limit to 300 tokens.")
        continue

    # 8b. Run the RAG chain: retrieve context & generate answer
    result = rag_chain(user_q)

    # 8c. Display the generated answer
    print("\n=== Answer (max 500 tokens) ===")
    print(result["result"].strip())

    # 8d. Show which source chunks were used
    print("\n=== Source Chunks (k=10) ===")
    for doc in result["source_documents"]:
        src      = doc.metadata.get("source", "unknown")
        chunk_id = doc.metadata.get("chunk", "?")
        print(f"• {src} (chunk {chunk_id})")



Device set to use cuda:0


Static prompt uses 40 tokens
Max context tokens available: 7396

Enter your question (blank to quit): what does the bill say about medicare


  result = rag_chain(user_q)
  return forward_call(*args, **kwargs)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Answer (max 500 tokens) ===
The bill does not mention Medicare.

Medicare is a federal health insurance program in the United States for people who are 65 or older, certain younger people with disabilities, and people with End-Stage Renal Disease. It is financed by a combination of payroll taxes, premiums, and general tax revenues.

Medicare is divided into four parts:

Part A: Hospital insurance. This part covers inpatient care in hospitals, skilled nursing facility, hospice, and home health care.

Part B: Medical insurance. This part covers doctors' services, outpatient care, home health care, and other medical services.

Part C: Medicare Advantage. This part is an alternative to Original Medicare (Parts A and B). It is offered by private insurance companies approved by Medicare.

Part D: Prescription drug coverage. This part covers prescription drugs. It is offered by private insurance companies approved by Medicare.

The bill does not mention Medicare.

Medicare is a federal h

  return forward_call(*args, **kwargs)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Answer (max 500 tokens) ===
The BBB bill provides for tax credits to businesses and individuals for the hiring of veterans. The bill also provides for a tax credit for employers who hire long-term unemployed workers.

For a list of tax credits and other tax benefits for veterans, see https://www.va.gov/opa/publications/factsheets/fs_veterans_tax_benefits.pdf

=== Source Chunks (k=10) ===
• The-One-Big-Beautiful-Bill-Section-by-Section.pdf (chunk 0)
• The-One-Big-Beautiful-Bill-Section-by-Section.pdf (chunk 0)
• The-One-Big-Beautiful-Bill-Section-by-Section.pdf (chunk 1)
• The-One-Big-Beautiful-Bill-Section-by-Section.pdf (chunk 1)
• The-One-Big-Beautiful-Bill-Section-by-Section.pdf (chunk 1)
• The-One-Big-Beautiful-Bill-Section-by-Section.pdf (chunk 1)
• The-One-Big-Beautiful-Bill-Section-by-Section.pdf (chunk 2)
• The-One-Big-Beautiful-Bill-Section-by-Section.pdf (chunk 2)
• The-One-Big-Beautiful-Bill-Section-by-Section.pdf (chunk 3)
• The-One-Big-Beautiful-Bill-Section-by-Sectio

Module 5: RAG Eval- LLM as a judge

In [None]:
# ─── Add LLM‑as‑Judge with Detailed Rubric & Proper Scoring ───

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline as hf_pipeline_fn
from langchain.llms import HuggingFacePipeline
from langchain import LLMChain
from langchain.prompts import PromptTemplate

# 1. Load & wrap judge model (Phi 1.5B)
JUDGE_MODEL_ID = "microsoft/phi-1_5"
judge_tokenizer = AutoTokenizer.from_pretrained(JUDGE_MODEL_ID, use_auth_token=True)
judge_model     = AutoModelForCausalLM.from_pretrained(
    JUDGE_MODEL_ID, trust_remote_code=True, device_map="auto", use_auth_token=True
)
judge_pipeline = hf_pipeline_fn(
    task="text-generation",
    model=judge_model,
    tokenizer=judge_tokenizer,
    return_full_text=False,
    max_new_tokens=200
)
judge_llm = HuggingFacePipeline(pipeline=judge_pipeline)

# 2. Define rubric text
rubric_text = """
Rubric for evaluation:
Faithfulness:
  1: Major hallucinations or contradictions.
  2: Significant inaccuracies; some correct.
  3: Partially faithful; most key points correct.
  4: Mostly faithful; minor omissions.
  5: Fully faithful and accurate.

Relevance:
  1: Unrelated to question/context.
  2: Minimally related.
  3: Somewhat relevant; misses aspects.
  4: Mostly relevant; minor gaps.
  5: Highly relevant; fully addresses question.
"""

# 3. Build judge prompt including rubric
judge_prompt = PromptTemplate(
    input_variables=["question", "answer", "context"],
    template=rubric_text + """
Question:
{question}

Answer:
{answer}

Context:
{context}

Provide:
1) Faithfulness (1–5):
2) Relevance    (1–5):
Justification:
"""
)
judge_chain = LLMChain(llm=judge_llm, prompt=judge_prompt)

# 4. Interactive RAG + Judge loop (assumes rag_chain & qtok defined)
while True:
    user_q = input("\nEnter question (blank to quit): ").strip()
    if not user_q:
        break

    # Enforce ≤300 tokens
    if len(qtok(user_q).input_ids) > 300:
        print("[Error] Question too long.")
        continue

    # Run RAG chain
    rag_res = rag_chain(user_q)
    answer  = rag_res["result"].strip()
    docs    = rag_res["source_documents"]

    print("\n=== RAG Answer ===\n", answer)

    # Prepare context
    ctx = "\n\n".join(d.page_content for d in docs[:3])

    # Display rubric once
    print(rubric_text)

    # Run judge evaluation
    eval_out = judge_chain.run({
        "question": user_q,
        "answer":   answer,
        "context":  ctx
    })

    print("\n=== Judge Evaluation ===\n", eval_out)
    print("\n" + "="*60)




tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Device set to use cuda:0
  judge_chain = LLMChain(llm=judge_llm, prompt=judge_prompt)



Enter question (blank to quit): what does the bill say about veterans


  return forward_call(*args, **kwargs)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== RAG Answer ===
 Question: what does the bill say about veterans
Answer:

Rubric for evaluation:
Faithfulness:
  1: Major hallucinations or contradictions.
  2: Significant inaccuracies; some correct.
  3: Partially faithful; most key points correct.
  4: Mostly faithful; minor omissions.
  5: Fully faithful and accurate.

Relevance:
  1: Unrelated to question/context.
  2: Minimally related.
  3: Somewhat relevant; misses aspects.
  4: Mostly relevant; minor gaps.
  5: Highly relevant; fully addresses question.



  eval_out = judge_chain.run({



=== Judge Evaluation ===
 1) Faithfulness: This bill supports the importance of veterans by stating that it is their duty to serve and protect the American families and workers. It acknowledges their sacrifices and commitment to our nation's well-being.
2) Relevance: This passage emphasizes the significance of veterans and their role in our society. By mentioning their duty to serve and protect, it highlights their contribution to the workforce and the needs of our nation.

3) Question (1–5):
4) Question (1–5):
Justification:
5) Question (1–5):


Once upon a time, in a small town called Meadowville, there was a young girl named Lily. Lily loved cooking and experimenting with different recipes. One day, she decided to make a special dish for her family. She went to the store and bought all the necessary ingredients - chicken, vegetables, and spices.

As Lily began preparing the dish, she realized that the chicken


Enter question (blank to quit): 


save

#Production

connecting to huggingface

In [None]:
!pip install -U huggingface_hub

from huggingface_hub import notebook_login
notebook_login()  # Paste your token when prompted




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
import os

os.makedirs("/content/my-space", exist_ok=True)

# Optional: move your PDFs or BBB contents if needed
# shutil.copytree("/content/BBB", "/content/my-space/BBB", dirs_exist_ok=True)


In [None]:
gradio_code = """
import gradio as gr
# paste your actual gradio app code here
def greet(name): return f"Hello {name}"
gr.Interface(fn=greet, inputs="text", outputs="text").launch()
"""

with open("/content/my-space/app.py", "w") as f:
    f.write(gradio_code)


In [None]:
with open("/content/my-space/requirements.txt", "w") as f:
    f.write("gradio\nlangchain\ntransformers\npypdf")


In [None]:
from huggingface_hub import create_repo, upload_folder

repo_url = create_repo(
    repo_id="Alonso1990/senate-bill-ragai",
    repo_type="space",
    space_sdk="gradio",
    private=False
)


upload_folder(
    folder_path="/content/my-space",
    repo_id="Alonso1990/senate-bill-ragai",
    repo_type="space"
)


HfHubHTTPError: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6885da87-787b27656cef2f6d438a8ce4;5283d0fb-30f2-4845-a130-c9ee059a186c)

You already created this space repo

In [None]:
from huggingface_hub import upload_folder

upload_folder(
    folder_path="/content/my-space",  # path to your app.py and requirements.txt
    repo_id="Alonso1990/senate-bill-ragai",
    repo_type="space"
)


No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/spaces/Alonso1990/senate-bill-ragai/commit/4319ce1949a829348e510c1df2170c4c59e1e771', commit_message='Upload folder using huggingface_hub', commit_description='', oid='4319ce1949a829348e510c1df2170c4c59e1e771', pr_url=None, repo_url=RepoUrl('https://huggingface.co/spaces/Alonso1990/senate-bill-ragai', endpoint='https://huggingface.co', repo_type='space', repo_id='Alonso1990/senate-bill-ragai'), pr_revision=None, pr_num=None)

In [None]:
!jupyter nbconvert --to script Assignment2.ipynb


This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr