In [None]:
!pip install pandas
!pip install tqdm
!pip install tenacity

In [1]:
from cdg_client import CDGClient

# Read the API key from a local file so it never appears in the notebook.
# Surrounding whitespace is stripped: text editors normally terminate the
# file with a newline, which would otherwise become part of the key and be
# sent along with every request.
with open("api_key.txt", "r", encoding="utf-8") as f:
    api_key = f.read().strip()

# Shared client used by every request cell below.
client = CDGClient(api_key)

In [2]:
# Smoke test: request H.R. 3684 of the 117th Congress and show its title.
example_bill = client.get("bill/117/hr/3684")

# Element 0 of the client's return value is the decoded response body;
# the bill record sits under its "bill" key.
print(
    example_bill[0]
    .get("bill")
    .get("title")
)

Infrastructure Investment and Jobs Act


In [27]:
import json
import logging
import os
import sys
from time import sleep

import requests
from requests.exceptions import HTTPError
from tenacity import retry, wait_fixed, retry_if_exception_type, before_sleep_log
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

In [30]:
# Root logger: emit everything from DEBUG upwards on stderr.
logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)

# urllib3 is very chatty at DEBUG level; only let its warnings through.
logging.getLogger("urllib3").setLevel(logging.WARNING)

# Logger used by the retry helpers in this notebook.
logger = logging.getLogger(__name__)

# custom logging function that logs retries
def retry_log(logger, level):
    """Build a tenacity hook that logs every upcoming retry.

    The returned callable accepts a tenacity ``RetryCallState`` and can be
    used either as a ``before`` hook or as a ``before_sleep`` hook.

    Args:
        logger: Object with a ``log(level, message)`` method, e.g. a
            ``logging.Logger``.
        level: Logging level passed through to ``logger.log``.

    Returns:
        A one-argument callable suitable for tenacity's hook parameters.
    """
    def wrapper(retry_state):
        # tenacity calls ``before_sleep`` right after a failed attempt, while
        # ``attempt_number`` still refers to that failed attempt (1 for the
        # first failure) and ``outcome`` holds its result. In a ``before``
        # hook ``outcome`` is None and ``attempt_number`` already refers to
        # the attempt that is about to start.
        just_failed = getattr(retry_state, "outcome", None) is not None
        upcoming_attempt = retry_state.attempt_number + (1 if just_failed else 0)

        # Attempt 1 is the initial call, not a retry, so it is never logged.
        if upcoming_attempt > 1:
            logger.log(level, f"Retrying {retry_state.fn.__name__} (attempt {upcoming_attempt})")
    return wrapper

# The API is rate limited to 5000 calls per hour, so when a request fails
# with an HTTPError we pause for 5 minutes (300 s) and then try again.
@retry(
    retry=retry_if_exception_type(HTTPError),
    wait=wait_fixed(300),
    before_sleep=retry_log(logger, logging.DEBUG),
)
def getBill(congress, chamber, number):
    """Request the record of a single bill.

    Args:
        congress: Congress number used in the endpoint path (e.g. 117).
        chamber: Bill type segment of the endpoint path (e.g. "hr").
        number: Bill number within that congress and bill type.

    Returns:
        Whatever ``client.get`` returns for ``bill/<congress>/<chamber>/<number>``;
        callers in this notebook read the response body from index 0.

    Note:
        No stop condition is configured, so the call keeps retrying for as
        long as ``HTTPError`` is raised. Other exceptions propagate at once.
        NOTE(review): this also retries permanent HTTP errors - confirm that
        is intended.
    """
    endpoint = f"bill/{congress}/{chamber}/{number}"
    return client.get(endpoint)

# Same retry policy as for bill records: on HTTPError wait 300 s, then retry.
@retry(
    retry=retry_if_exception_type(HTTPError),
    wait=wait_fixed(300),
    before_sleep=retry_log(logger, logging.DEBUG),
)
def getBillText(congress, chamber, number):
    """Request the list of published text versions of a single bill.

    Args:
        congress: Congress number used in the endpoint path (e.g. 117).
        chamber: Bill type segment of the endpoint path (e.g. "hr").
        number: Bill number within that congress and bill type.

    Returns:
        Whatever ``client.get`` returns for
        ``bill/<congress>/<chamber>/<number>/text``; callers in this notebook
        read the response body (with its "textVersions" list) from index 0.

    Note:
        No stop condition is configured, so the call keeps retrying for as
        long as ``HTTPError`` is raised. Other exceptions propagate at once.
    """
    endpoint = f"bill/{congress}/{chamber}/{number}/text"
    return client.get(endpoint)

In [5]:
# 9709 bills were introduced by the House of the 117th congress
NUM_BILLS = 9709
# First bill number to download; raise it to resume an interrupted run.
start = 1

# Create the output directory up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("BillData/JSONFiles", exist_ok=True)

# logging_redirect_tqdm routes log records (e.g. retry messages) through
# tqdm so they do not corrupt the progress bar.
with logging_redirect_tqdm():
    for i in tqdm(range(start, NUM_BILLS + 1)):
        bill = getBill(117, "hr", i)

        # One file per bill, named by its zero-padded number (00001.json, ...).
        with open(f"BillData/JSONFiles/{i:05d}.json", "w", encoding="utf-8") as outfile:
            # Index 0 is the response body; only the "bill" record is stored.
            json.dump(bill[0].get("bill"), outfile)

 30%|██▉       | 2872/9709 [15:00<35:30,  3.21it/s]  

In [39]:
# Number of House bills in the 117th congress and the first bill number to
# process; raise `start` to resume an interrupted run.
NUM_BILLS = 9709
start = 1

# Create the output directory up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("BillData/BillText", exist_ok=True)

with logging_redirect_tqdm():
    for i in tqdm(range(start, NUM_BILLS + 1)):
        sources = getBillText(117, "hr", i)

        # An empty (or missing) "textVersions" list means that no text has
        # been published for this bill. Messages go through tqdm.write so
        # they do not break up the progress bar.
        text_versions = sources[0].get("textVersions") or []
        if not text_versions:
            tqdm.write(f"Failed to retrieve the content. No text is available for H.R. {i}.")
            continue

        # Only the first listed text version is downloaded.
        formats = text_versions[0].get("formats") or []

        url = next((item["url"] for item in formats if item.get("type") == "Formatted Text"), None)
        if url is None:
            # Some versions may only be offered in other formats; without
            # this guard requests.get(None) would raise and abort the run.
            tqdm.write(f"Failed to retrieve the content. No formatted text is available for H.R. {i}.")
            continue

        # The timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, timeout=60)
        if response.status_code == 200:
            with open(f"BillData/BillText/{i:05d}.htm", "w", encoding="utf-8") as file:
                file.write(response.text)
        else:
            tqdm.write(f"Failed to retrieve the content. Status code: {response.status_code}")

        # wait before sending more https requests
        sleep(2)

  0%|          | 2/9709 [00:02<3:03:45,  1.14s/it]

Failed to retrieve the content. No text is available for H.R. 2.


  0%|          | 9/9709 [00:16<4:18:02,  1.60s/it]

Failed to retrieve the content. No text is available for H.R. 9.


  0%|          | 10/9709 [00:16<3:09:02,  1.17s/it]

Failed to retrieve the content. No text is available for H.R. 10.


  0%|          | 12/9709 [00:17<1:49:07,  1.48it/s]

Failed to retrieve the content. No text is available for H.R. 11.
Failed to retrieve the content. No text is available for H.R. 12.


  0%|          | 13/9709 [00:17<1:26:11,  1.87it/s]

Failed to retrieve the content. No text is available for H.R. 13.


  0%|          | 14/9709 [00:17<1:10:01,  2.31it/s]

Failed to retrieve the content. No text is available for H.R. 14.


  0%|          | 16/9709 [00:18<51:32,  3.13it/s]  

Failed to retrieve the content. No text is available for H.R. 15.
Failed to retrieve the content. No text is available for H.R. 16.


  0%|          | 17/9709 [00:18<46:26,  3.48it/s]

Failed to retrieve the content. No text is available for H.R. 17.


  0%|          | 20/9709 [00:23<2:38:19,  1.02it/s]

Failed to retrieve the content. No text is available for H.R. 20.


  6%|▌         | 548/9709 [22:05<6:06:06,  2.40s/it] 

In [34]:
# Inspect the raw text response for H.R. 2, one of the bills reported above
# as having no text available (element 0 is the response body).
getBillText(congress=117, chamber="hr", number=2)[0]

{'pagination': {'count': 0},
 'request': {'billNumber': '2',
  'billType': 'hr',
  'billUrl': 'https://api.congress.gov/v3/bill/117/hr/2?format=json',
  'congress': '117',
  'contentType': 'application/json',
  'format': 'json'},
 'textVersions': []}