In [1]:
import requests
import os
import json
from dotenv import load_dotenv
from pathlib import Path

API_KEY = os.getenv("CONGRESS_API_KEY")
BASE_URL = "https://api.congress.gov/v3"

if API_KEY:
    print(f"API Key loaded: {API_KEY[:4]}...{API_KEY[-4:]}")
else:
    print("Error: CONGRESS_API_KEY not found. Please check your .env file.")

API Key loaded: gHhd...UdsC


In [4]:
def get_recent_bills(limit=500):
    # NOTE: Reduced limit to 10 for testing full text fetch, as it requires 1 call per bill.
    if not API_KEY:
        print("Cannot fetch bills: API Key is missing.")
        return []

    bills = []
    offset = 0
    page_size = 250
    
    print(f"Fetching list of {limit} bills...")
    
    # 1. Get the list of bills first
    while len(bills) < limit:
        remaining = limit - len(bills)
        current_limit = min(remaining, page_size)
        
        url = f"{BASE_URL}/bill"
        params = {
            "api_key": API_KEY,
            "format": "json",
            "offset": offset,
            "limit": current_limit,
            "sort": "updateDate desc"
        }
        
        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()
            
            batch = data.get("bills", [])
            if not batch:
                break
                
            bills.extend(batch)
            offset += len(batch)
            
        except Exception as e:
            print(f"Error fetching bill list: {e}")
            break
    
    # 2. Fetch full text for each bill
    print(f"Fetching full text for {len(bills)} bills...")
    detailed_bills = []
    
    for bill in bills:
        congress = bill.get('congress')
        bill_type = bill.get('type').lower()
        bill_number = bill.get('number')
        
        # Construct text endpoint: /bill/{congress}/{type}/{number}/text
        text_url = f"{BASE_URL}/bill/{congress}/{bill_type}/{bill_number}/text"
        
        try:
            # We just want the latest text version
            text_response = requests.get(text_url, params={"api_key": API_KEY, "format": "json"})
            if text_response.status_code == 200:
                text_data = text_response.json()
                # The text is often in a list of versions. We'll take the last one (most recent).
                text_versions = text_data.get('textVersions', [])
                if text_versions:
                    latest_version = text_versions[-1]
                    # Sometimes the actual text is a URL to HTML/XML/PDF
                    # The API might not return the raw string directly in JSON.
                    # It usually gives a link to the formatted text.
                    # For this script, we'll store the metadata about the text availability.
                    bill['text_data'] = latest_version
                else:
                    bill['text_data'] = "No text versions found"
            else:
                bill['text_data'] = f"Error fetching text: {text_response.status_code}"
                
        except Exception as e:
            print(f"Failed to fetch text for bill {bill_number}: {e}")
            bill['text_data'] = str(e)
            
        detailed_bills.append(bill)
        print(f".", end="", flush=True) # Progress dot
            
    return detailed_bills

# Fetch 10 bills with text details
recent_bills = get_recent_bills(500)

print(f"\nSuccessfully retrieved {len(recent_bills)} bills with text metadata.")
print("Sample of first bill text data:")
print(json.dumps(recent_bills[0].get('text_data', {}), indent=2))

Fetching list of 500 bills...
Fetching full text for 500 bills...
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
Successfully retrieved 500 bills with text metadata.
Sample of first bill text data:
{
  "date": "2015-01-13T05:00:00Z",
  "formats": [
    {
      "type": "Formatted Text",
      "url": "https://www.congress.gov/114/bills/s174/BILLS-114s174is.htm"
    },
    {
      "type": "PDF",
      "url": "https://www.congress.gov/114/bills/s174/BILLS-114s174is.pdf"
    },
    {
      "type": "Formatted XML",
      "url": "https://www.congr

In [5]:
links = []
for bill in recent_bills:
    try:
        links.append(
            {
                "link": bill["text_data"]["formats"][2]["url"],
                "type": "xml"
            }
        )
    except Exception as e:
        try:
            links.append(
                {
                    "link": bill["text_data"]["formats"][1]["url"],
                    "type": "pdf"
                }
            )
        except Exception as e:
            try:
                links.append(
                    {
                        "link": bill["text_data"]["formats"][0]["url"],
                        "type": "normal"
                    }
                )
            except Exception as e:
                continue

In [6]:
import xml.etree.ElementTree as ET
import requests

def chunk_bill_xml(xml_string):
    """
    Semantically chunks a bill XML by section (top-level only).
    """
    chunks = []
    try:
        root = ET.fromstring(xml_string)
        
        def get_text(elem):
            return "".join(elem.itertext())

        # Iterate over all elements to find sections, ignoring namespaces
        for elem in root.iter():
            # STRICTLY match 'section' (ignoring namespace)
            if elem.tag.split('}')[-1] == 'section':
                chunk = {}
                enum_text = ""
                header_text = ""
                
                for child in elem:
                    if child.tag.endswith('enum'):
                        enum_text = get_text(child).strip()
                    elif child.tag.endswith('header'):
                        header_text = get_text(child).strip()
                
                full_text = get_text(elem)
                clean_text = " ".join(full_text.split())
                
                chunk['label'] = f"{enum_text} {header_text}".strip()
                chunk['text'] = clean_text
                
                if chunk['label'] or chunk['text']:
                    chunks.append(chunk)
            
    except Exception as e:
        print(f"Error parsing XML: {e}")
        
    return chunks

# Test with the example bill
url = "https://www.congress.gov/115/bills/s1568/BILLS-115s1568is.xml"
print(f"Fetching example bill from: {url}")
res = requests.get(url)
if res.status_code == 200:
    chunks = chunk_bill_xml(res.text)
    print(f"Found {len(chunks)} chunks.")
    for i, c in enumerate(chunks[:3]):
        print(f"--- Chunk {i+1}: {c['label']} ---")
        print(f"{c['text'][:200]}...\n")
else:
    print(f"Failed to fetch bill: {res.status_code}")

Fetching example bill from: https://www.congress.gov/115/bills/s1568/BILLS-115s1568is.xml
Found 7 chunks.
--- Chunk 1: 1. Short title ---
1.Short titleThis Act may be cited as the President John F. Kennedy Commemorative Coin Act....

--- Chunk 2: 2. Findings ---
2.FindingsThe Congress finds that— (1)John Fitzgerald Kennedy served in the United States Navy, earning the Navy and Marine Corps Medal and a Purple Heart for his actions following the sinking of PT–1...

--- Chunk 3: 3. Coin specifications ---
3.Coin specifications (a)$1 silver coinsThe Secretary of the Treasury (hereafter in this Act referred to as the Secretary) shall mint and issue not more than 500,000 $1 coins in commemoration of Presi...



In [7]:
import xml.etree.ElementTree as ET
import requests

def chunk_bill_xml(xml_string):
    """
    Semantically chunks a bill XML by section (top-level only).
    """
    chunks = []
    try:
        root = ET.fromstring(xml_string)
        
        def get_text(elem):
            return "".join(elem.itertext())

        # Iterate over all elements to find sections, ignoring namespaces
        for elem in root.iter():
            # STRICTLY match 'section' (ignoring namespace)
            if elem.tag.split('}')[-1] == 'section':
                chunk = {}
                enum_text = ""
                header_text = ""
                
                for child in elem:
                    if child.tag.endswith('enum'):
                        enum_text = get_text(child).strip()
                    elif child.tag.endswith('header'):
                        header_text = get_text(child).strip()
                
                full_text = get_text(elem)
                clean_text = " ".join(full_text.split())
                
                chunk['label'] = f"{enum_text} {header_text}".strip()
                chunk['text'] = clean_text
                
                if chunk['label'] or chunk['text']:
                    chunks.append(chunk)
            
    except Exception as e:
        print(f"Error parsing XML: {e}")
        
    return chunks

In [8]:
url = "https://www.congress.gov/115/bills/s1568/BILLS-115s1568is.xml"
print(f"Fetching example bill from: {url}")
res = requests.get(url)
if res.status_code == 200:
    chunks = chunk_bill_xml(res.text)
    print(f"Found {len(chunks)} chunks.")
    for i, c in enumerate(chunks[:3]):
        print(f"--- Chunk {i+1}: {c['label']} ---")
        print(f"{c['text'][:200]}...\n")
else:
    print(f"Failed to fetch bill: {res.status_code}")

Fetching example bill from: https://www.congress.gov/115/bills/s1568/BILLS-115s1568is.xml
Found 7 chunks.
--- Chunk 1: 1. Short title ---
1.Short titleThis Act may be cited as the President John F. Kennedy Commemorative Coin Act....

--- Chunk 2: 2. Findings ---
2.FindingsThe Congress finds that— (1)John Fitzgerald Kennedy served in the United States Navy, earning the Navy and Marine Corps Medal and a Purple Heart for his actions following the sinking of PT–1...

--- Chunk 3: 3. Coin specifications ---
3.Coin specifications (a)$1 silver coinsThe Secretary of the Treasury (hereafter in this Act referred to as the Secretary) shall mint and issue not more than 500,000 $1 coins in commemoration of Presi...



In [9]:
chunks

[{'label': '1. Short title',
  'text': '1.Short titleThis Act may be cited as the President John F. Kennedy Commemorative Coin Act.'},
 {'label': '2. Findings',
  'text': '2.FindingsThe Congress finds that— (1)John Fitzgerald Kennedy served in the United States Navy, earning the Navy and Marine Corps Medal and a Purple Heart for his actions following the sinking of PT–109 during World War II; (2)John Fitzgerald Kennedy served honorably in the United States House of Representatives from 1947 to 1953; (3)John Fitzgerald Kennedy served honorably in the United States Senate from 1953 to 1960; (4)in 1960, at 43 years of age, John Fitzgerald Kennedy became the youngest person ever elected President of the United States; (5)in his inaugural address, President Kennedy challenged all people of the United States to ask not what your country can do for you—ask what you can do for your country; (6)President Kennedy’s call to service laid the foundation for the development of the Peace Corps; (7)Pr

In [10]:
all_chunks = []
for bill in recent_bills:
    try:
        res = requests.get(bill["text_data"]["formats"][2]["url"])
    except Exception as e:
        print(f"Failed to fetch bill: {e}")
        continue

    if res.status_code == 200:
        chunks = chunk_bill_xml(res.text)
        for chunk in chunks:
            chunk_0 = bill.copy()
            chunk_0["chunk_text"] = chunk
            all_chunks.append(chunk_0)

    else:
        print(f"Failed to fetch bill: {res.status_code}")

Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index out of range
Failed to fetch bill: list index o

In [11]:
all_chunks[0]

{'congress': 114,
 'latestAction': {'actionDate': '2015-01-13',
  'text': 'Read twice and referred to the Committee on Finance.'},
 'number': '174',
 'originChamber': 'Senate',
 'originChamberCode': 'S',
 'title': 'Stop Tax Haven Abuse Act',
 'type': 'S',
 'updateDate': '2025-12-02',
 'updateDateIncludingText': '2025-12-02',
 'url': 'https://api.congress.gov/v3/bill/114/s/174?format=json',
 'text_data': {'date': '2015-01-13T05:00:00Z',
  'formats': [{'type': 'Formatted Text',
    'url': 'https://www.congress.gov/114/bills/s174/BILLS-114s174is.htm'},
   {'type': 'PDF',
    'url': 'https://www.congress.gov/114/bills/s174/BILLS-114s174is.pdf'},
   {'type': 'Formatted XML',
    'url': 'https://www.congress.gov/114/bills/s174/BILLS-114s174is.xml'}],
  'type': 'Introduced in Senate'},
 'chunk_text': {'label': '1. Short title, etc',
  'text': '1.Short title, etc (a)Short titleThis Act may be cited as the Stop Tax Haven Abuse Act. (b)Amendment of 1986 codeExcept as otherwise expressly provided

In [12]:
len(all_chunks)

7645

In [13]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
sentences = ['search_document: TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten']
embeddings = model.encode(sentences)
print(embeddings)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_hf_nomic_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

[[ 4.94926661e-01  1.28402889e+00 -3.71534324e+00 -1.16852641e+00
   6.62463009e-01  2.61643618e-01  6.62997887e-02  1.03466880e+00
  -1.22910583e+00  4.59999174e-01 -3.32437396e-01  1.38281450e-01
   2.68823838e+00 -1.51774183e-01 -6.04068518e-01 -1.23871028e-01
  -6.62412822e-01 -2.17005581e-01 -4.78512317e-01  3.94661278e-01
  -3.13374728e-01 -9.35485363e-01 -7.79419541e-01 -1.13346350e+00
   1.44247937e+00  6.74199536e-02  4.32939008e-02  4.50967610e-01
   4.86702621e-01 -1.81662440e-01  1.43407750e+00 -1.32162893e+00
   2.77271848e-02 -1.29948512e-01 -9.96836483e-01  2.42035061e-01
  -2.25668743e-01  4.15005565e-01 -2.51880258e-01  1.05668278e-02
  -4.50851202e-01 -6.41959548e-01 -1.16626942e+00 -1.05262578e+00
   8.80191982e-01  3.09344847e-02  5.30076563e-01 -2.81585932e-01
  -2.57620424e-01 -6.29229724e-01 -1.80680662e-01  6.54555112e-02
   6.56127810e-01  1.31376058e-01  1.06702852e+00  3.11316758e-01
  -5.14815927e-01  9.15930152e-01  1.93530309e+00 -3.11154187e-01
   1.72625

In [15]:
for chunk in all_chunks:
    chunk["embedding"] = model.encode(f"search_document: {chunk["chunk_text"]["text"]}")

In [16]:
all_chunks[0]

{'congress': 114,
 'latestAction': {'actionDate': '2015-01-13',
  'text': 'Read twice and referred to the Committee on Finance.'},
 'number': '174',
 'originChamber': 'Senate',
 'originChamberCode': 'S',
 'title': 'Stop Tax Haven Abuse Act',
 'type': 'S',
 'updateDate': '2025-12-02',
 'updateDateIncludingText': '2025-12-02',
 'url': 'https://api.congress.gov/v3/bill/114/s/174?format=json',
 'text_data': {'date': '2015-01-13T05:00:00Z',
  'formats': [{'type': 'Formatted Text',
    'url': 'https://www.congress.gov/114/bills/s174/BILLS-114s174is.htm'},
   {'type': 'PDF',
    'url': 'https://www.congress.gov/114/bills/s174/BILLS-114s174is.pdf'},
   {'type': 'Formatted XML',
    'url': 'https://www.congress.gov/114/bills/s174/BILLS-114s174is.xml'}],
  'type': 'Introduced in Senate'},
 'chunk_text': {'label': '1. Short title, etc',
  'text': '1.Short title, etc (a)Short titleThis Act may be cited as the Stop Tax Haven Abuse Act. (b)Amendment of 1986 codeExcept as otherwise expressly provided

In [18]:
for chunk in all_chunks:
    chunk['embedding'] = chunk['embedding'].tolist()

In [None]:
with open("chunks.json", 'w') as json_file:
    json.dump(all_chunks, json_file, indent=2)

In [2]:
with open("chunks.json", 'r') as f:
    all_chunks = json.load(f)

In [3]:
all_chunks[0]

{'congress': 119,
 'latestAction': {'actionDate': '2025-12-01',
  'actionTime': '20:46:34',
  'text': 'Rules Committee Resolution H. Res. 916 Reported to House. Rule provides for consideration of H.R. 4312, H.R. 1005, H.R. 1049, H.R. 1069, H.R. 2965 and H.R. 4305. The resolution provides for consideration of H.R. 4312, H.R. 1005, H.R. 1049, H.R. 1069, H.R. 2965, and H.R. 4305 under a closed rule with one hour of general debate and one motion to recommit on each bill.'},
 'number': '4312',
 'originChamber': 'House',
 'originChamberCode': 'H',
 'title': 'SCORE Act',
 'type': 'HR',
 'updateDate': '2025-12-04',
 'updateDateIncludingText': '2025-12-04',
 'url': 'https://api.congress.gov/v3/bill/119/hr/4312?format=json',
 'text_data': {'date': '2025-07-10T04:00:00Z',
  'formats': [{'type': 'Formatted Text',
    'url': 'https://www.congress.gov/119/bills/hr4312/BILLS-119hr4312ih.htm'},
   {'type': 'PDF',
    'url': 'https://www.congress.gov/119/bills/hr4312/BILLS-119hr4312ih.pdf'},
   {'type'