In [1]:
import urllib
import urllib.request
import pandas as pd
import xml.etree.ElementTree as ET
from cdg_client import CDGClient

In [2]:
all_bills = pd.DataFrame(columns=['Unique ID', 'Bill ID', 'Year', 'Congress', 'Link', 'Title', 'Subjects', 'Committees'])

# linked_set is used to generate URLs from bill ids
linked_set = {
    'hres': 'house-resolution',
    'hr': 'house-bill',
    'hjres': 'house-joint-resolution',
    'hconres': 'house-concurrent-resolution',
    'sres': 'senate-resolution',
    's': 'senate-bill',
    'sjres': 'senate-joint-resolution',
    'sconres': 'senate-concurrent-resolution'
}

In [3]:
xmls = []

def string_i(i):
    if (i < 10): i = '00' + str(i)
    elif (i < 100): i = '0' + str(i)
    else: i = str(i)
    return (i)

# biden administration (2021-2024)
for i in range(2, 518): xmls.append('/2024/roll' + string_i(i))
for i in range(21, 725): xmls.append('/2023/roll' + string_i(i))
for i in range(2, 550): xmls.append('/2022/roll' + string_i(i))
for i in range(3, 450): xmls.append('/2021/roll' + string_i(i))

# trump administration (2017-2020)
for i in range(2, 254): xmls.append('/2020/roll' + string_i(i))
for i in range(3, 702): xmls.append('/2019/roll' + string_i(i))
for i in range(2, 501): xmls.append('/2018/roll' + string_i(i))
for i in range(3, 711): xmls.append('/2017/roll' + string_i(i))

In [4]:
# Congress API
CONGRESS_API_KEY = "3kOBbSdMeVapzNNPZsaX0n8OYFquP8y3joe1Ush2"
BILL_PATH = "bill"
parse_xml = lambda data: ET.fromstring(data)
client = CDGClient(CONGRESS_API_KEY, response_format="xml")
print(f"Contacting Congress.gov at {client.base_url} ...")

def get_bill(type, congress, bill_hr, bill_num):
    endpoint = f"{BILL_PATH}/{congress}/{bill_hr}/{bill_num}/{type}"
    data, _ = client.get(endpoint)
    # get bill subjects
    if (type == 'subjects'):
        out = []
        strs = str(data).split('\\n')
        for i in range(0, len(strs)):
            if (strs[i].__contains__('<name>')):
                out.append(strs[i + 1].lstrip())
        return(out)
    # get bill committees
    elif (type == 'committees'):
        out = []
        strs = str(data).split('\\n')
        for i in range(0, len(strs)):
            if (strs[i].__contains__('<name>')):
                x = strs[i + 1].lstrip()
                if (x.__contains__('Committee')): out.append(x)
        return(out)
    # get the bill title
    elif (type == 'titles'):
        out = str(data).split('\\n')[11].lstrip()
        return (out)
    return (data)

Contacting Congress.gov at https://api.congress.gov/v3/ ...


In [17]:
last_bill = 'null'

for i in range(0, len(xmls)): # for every roll call
    xmlFile = xmls[i]
    year = xmlFile[1:5] #key_field
    print('Cataloging Roll ' + str(i + 1) + ' of ' + str(len(xmls)) + '...')
    url_resp = urllib.request.urlopen('https://clerk.house.gov/evs' + xmlFile + '.xml') # access the roll call data for a specific roll number
    strs = str(url_resp.read()).split('\\r\\n')

    bill_tag = strs[10][11:-12]
    congress = strs[6][10:-11] #key_field
    bill_str = ''.join([c for c in bill_tag if c.isalpha()]).lower()
    bill_num = ''.join([c for c in bill_tag if c.isnumeric()])
    bill_id = bill_str + ' ' + bill_num #key_field
    unique_id = bill_id + ' (' + year + ')' #key_field
    if ((unique_id != last_bill) and ('adjourn' not in unique_id) and ('quorum' not in unique_id) and ('election' not in unique_id) and ('motion' not in unique_id) and ('journal' not in unique_id)):
        last_bill = unique_id
        print('working on ' + unique_id)
        title = get_bill('titles', congress, bill_str, bill_num) #key_field
        subjects = get_bill('subjects', congress, bill_str, bill_num) #key_field
        committees = get_bill('committees', congress, bill_str, bill_num) #key_field
        print(title)
        print(subjects)
        print(committees)
        link = "https://www.congress.gov/bill/" + congress + "th-congress/" + linked_set[bill_str] + "/" + bill_num #key_field
        all_bills.loc[len(all_bills.index)] = [unique_id, bill_id, year, congress, link, title, subjects, committees]

Cataloging Roll 1 of 4373...
working on hres 947 (2024)
Providing for consideration of the bill (H.R. 788) to limit donations made pursuant to settlement agreements to which the United States is a party, and for other purposes; providing for consideration of the joint resolution (H.J. Res. 98) providing for congressional disapproval under chapter 8 of title 5, United States Code, of the rule submitted by the National Labor Relations Board relating to "Standard for Determining Joint Employer Status\'\'; and providing for consideration of the joint resolution (S.J. Res. 38) providing for congressional disapproval under chapter 8 of title 5, United States Code, of the rule submitted by the Federal Highway Administration relating to "Waiver of Buy America Requirements for Electric Vehicle Chargers\'\'.
['House of Representatives', 'Legislative rules and procedure', 'Congress']
['Rules Committee']
Cataloging Roll 2 of 4373...
Cataloging Roll 3 of 4373...
Cataloging Roll 4 of 4373...
Catalog

HTTPError: HTTP Error 503: Service Unavailable

In [None]:
all_bills.to_csv('all_bills.csv') # export bill data

In [None]:
print(all_bills.head()) # take a peek at the bill data