In [2]:
%pip install requests keyring
from cdg_client import CDGClient
import xml.etree.ElementTree as ET
import xmltodict
import pandas as pd
import numpy as np

Note: you may need to restart the kernel to use updated packages.


In [105]:
API_KEY_PATH = '../../../api_key.txt'       # put your api_key path here
                                            # it shouldn't go to github
LIMIT = 250                                 # bills requested per listing call
START_DATE = '2023-01-01T00:00:00Z'         # sent as the fromDateTime filter
END_DATE = '2025-10-11T00:00:00Z'           # sent as the toDateTime filter


def parse_xml(data):
    """Parse an XML payload into an ElementTree element."""
    return ET.fromstring(data)


# get the first LIMIT bills
with open(API_KEY_PATH) as file:
    # readline() keeps the trailing newline; strip it so it is not passed on as part of the key
    api_key = file.readline().strip()
client = CDGClient(api_key, response_format='xml')

data, _ = client.get(f'bill?format=xml&limit={LIMIT}&fromDateTime={START_DATE}'
                     f'&toDateTime={END_DATE}')
bills_list = xmltodict.parse(data)['api-root']['bills']['bill']
bills_df = pd.DataFrame(bills_list)  # store bills into a data frame

# needs to be cleaned up
def format_data(data):
    """Normalise a raw bill-listing frame in place and add the detail columns.

    The frame is modified in place and also returned. Calling the function
    again on an already-formatted frame is safe: columns that were already
    dropped are skipped and placeholder columns that already exist (and may
    have been filled in since) are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Bill listing with at least 'congress', 'number', 'type', 'url' and
        'title'. On the first call it is expected to also carry
        'originChamberCode', which is copied into 'chamber'.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``.
    """
    int_cols = ['congress', 'number']
    str_cols = ['type', 'url', 'title']
    drop_cols = ['updateDateIncludingText', 'updateDate', 'latestAction',
                 'originChamberCode', 'originChamber']

    data[int_cols] = data[int_cols].astype(int)
    data[str_cols] = data[str_cols].astype('string')
    if 'originChamberCode' in data.columns:
        data['chamber'] = data['originChamberCode'].astype('string')  # just store H or S
    # errors='ignore' lets a repeat call skip columns that are already gone
    data.drop(columns=drop_cols, inplace=True, errors='ignore')

    # Placeholder columns, in output order; they are filled in later per bill.
    placeholders = {
        'introduced': pd.NaT,
        'sponsor': '',
        'sponsor_party': '',
        'policy_area': '',
        'passed_senate': False,
        'passed_house': False,
        'to_president': False,
        'became_law': False,
    }
    for column, default in placeholders.items():
        if column not in data.columns:  # never overwrite values that were already filled in
            data[column] = default
    return data
# format_data modifies bills_df in place; the returned frame is displayed as this cell's output
format_data(bills_df)

Unnamed: 0,congress,type,number,url,title,chamber,introduced,sponsor,sponsor_party,policy_area,passed_senate,passed_house,to_president,became_law
0,119,S,2403,https://api.congress.gov/v3/bill/119/s/2403?fo...,Retire through Ownership Act,S,NaT,,,,False,False,False,False
1,119,S,1728,https://api.congress.gov/v3/bill/119/s/1728?fo...,Employee Ownership Representation Act of 2025,S,NaT,,,,False,False,False,False
2,119,S,1440,https://api.congress.gov/v3/bill/119/s/1440?fo...,Uniformed Services Leave Parity Act,S,NaT,,,,False,False,False,False
3,119,S,2283,https://api.congress.gov/v3/bill/119/s/2283?fo...,A bill to designate the facility of the United...,S,NaT,,,,False,False,False,False
4,119,SRES,337,https://api.congress.gov/v3/bill/119/sres/337?...,A resolution recognizing the 250th anniversary...,S,NaT,,,,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,119,SRES,421,https://api.congress.gov/v3/bill/119/sres/421?...,A resolution urging the executive branch and l...,S,NaT,,,,False,False,False,False
246,119,S,2936,https://api.congress.gov/v3/bill/119/s/2936?fo...,Stop ANTIFA Act of 2025,S,NaT,,,,False,False,False,False
247,119,S,2937,https://api.congress.gov/v3/bill/119/s/2937?fo...,AI LEAD Act,S,NaT,,,,False,False,False,False
248,119,SRES,419,https://api.congress.gov/v3/bill/119/sres/419?...,A resolution expressing support for the design...,S,NaT,,,,False,False,False,False


In [106]:
# need to do an api call for each bill to get more details
def _as_list(node):
    """Return ``node`` as a list: [] for None, [node] for a single mapping, ``node`` itself for a list."""
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def fill_tables(bills, actions):
    """Fetch per-bill details and actions, filling ``bills`` and extending ``actions``.

    For every row of ``bills`` two requests are made through the module-level
    ``client``: one for the bill and one for its actions. The bill's
    introduction date, sponsor, sponsor party, policy area and progress flags
    are written into ``bills``; the bill's actions are appended to ``actions``.

    Parameters
    ----------
    bills : pandas.DataFrame
        Frame produced by ``format_data`` (needs 'url' and the placeholder
        columns). Modified in place.
    actions : pandas.DataFrame
        Previously collected actions. Not modified; a new frame is returned.

    Returns
    -------
    tuple
        ``(bills, actions)`` - the same ``bills`` object and the combined
        actions table.

    Raises
    ------
    Whatever ``client.get`` / ``xmltodict.parse`` raise; rows handled before
    the failure keep their filled values, ``actions`` is left untouched.
    """
    # Stored URLs are absolute; client.get is called with the part after the API root.
    root_len = len('https://api.congress.gov/v3/')
    action_cols = ['actionDate', 'text', 'type', 'actionCode']
    detail_cols = ['sponsor', 'sponsor_party', 'policy_area']
    new_action_frames = []  # one frame per bill, concatenated once after the loop

    # Iterate over index labels so reads and writes address the same row.
    for row in bills.index:
        bill_path = bills.at[row, 'url'][root_len:]
        # 'bill/119/s/2403?format=xml' -> '119-s-2403'
        bill_id = bill_path.split('?', 1)[0][len('bill/'):].replace('/', '-')
        bill_data, _ = client.get(bill_path)
        bill_dict = xmltodict.parse(bill_data)['api-root']['bill']

        # xmltodict gives a mapping for one <item> and a list for several;
        # use the first sponsor and tolerate a bill without one.
        sponsors = _as_list((bill_dict.get('sponsors') or {}).get('item'))
        sponsor = sponsors[0] if sponsors else {}

        # handle bill actions
        # NOTE(review): only the single response returned for the actions URL is read;
        # if that endpoint paginates, long action histories are truncated - confirm.
        actions_path = bill_dict['actions']['url'][root_len:]
        action_data, _ = client.get(actions_path)
        actions_root = xmltodict.parse(action_data)['api-root']
        actions_list = _as_list((actions_root.get('actions') or {}).get('item'))
        if actions_list:
            # reindex (rather than [[...]]) so a column absent from every action becomes NA
            frame = (pd.DataFrame(actions_list)
                     .reindex(columns=action_cols)
                     .rename(columns={'actionDate': 'date', 'actionCode': 'code'}))
            frame['bill_id'] = bill_id
            text_cols = ['text', 'type', 'code', 'bill_id']
            frame[text_cols] = frame[text_cols].astype('string')
            new_action_frames.append(frame)

        bills.loc[row, 'introduced'] = pd.to_datetime(bill_dict['introducedDate'])
        bills.loc[row, 'sponsor'] = sponsor.get('fullName') or ''
        bills.loc[row, 'sponsor_party'] = sponsor.get('party') or ''
        bills.loc[row, 'policy_area'] = (bill_dict.get('policyArea') or {}).get('name', '')

        # determine how far bill made it
        passed_senate = passed_house = to_president = became_law = False
        for action in actions_list:
            text = action.get('text') or ''
            if 'Passed Senate' in text or 'Passed/agreed to in Senate' in text:
                passed_senate = True
            if 'Passed House' in text or 'Passed/agreed to in House' in text:
                passed_house = True
            if 'Presented to President' in text:
                to_president = True
            if 'Became Public Law' in text or action.get('type') == 'BecameLaw':
                became_law = True
                break
        # Later stages imply the earlier ones, whatever order the actions arrive in.
        if became_law:
            to_president = True
        if to_president:
            passed_senate = passed_house = True

        stages = {'passed_senate': passed_senate, 'passed_house': passed_house,
                  'to_president': to_president, 'became_law': became_law}
        for column, reached in stages.items():
            if reached:  # flags are only ever raised, never cleared
                bills.loc[row, column] = True

    bills[detail_cols] = bills[detail_cols].astype('string')
    if new_action_frames:
        previous = [] if actions.empty else [actions]
        actions = pd.concat(previous + new_action_frames, ignore_index=True)
    return bills, actions

In [107]:
# Start from an empty actions table and enrich the first page of bills.
# fill_tables returns (bills, actions): `b` is the filled bills frame, `a` the collected actions.
actions_df = pd.DataFrame(columns=['bill_id', 'date', 'text', 'type', 'code'])
b, a = fill_tables(bills_df, actions_df)

In [108]:
# Save the enriched first page and its actions as seed CSVs (row index not written).
b.to_csv('congress_bills_seed.csv', index=False)
a.to_csv('bill_actions_seed.csv', index=False)

In [111]:
import time

In [None]:
# now keep adding to the tables until reaching the start date (or hit max api requests)
MAX_CONSECUTIVE_FAILURES = 3   # give up instead of sleeping forever on a persistent error
RETRY_WAIT_SECONDS = 3600      # wait one hour before retrying a failed page

# Resume from the working files; on the first run fall back to the seed files.
try:
    bills, actions = pd.read_csv('congress_bills.csv'), pd.read_csv('bill_actions.csv')
except FileNotFoundError:
    bills, actions = (pd.read_csv('congress_bills_seed.csv'),
                      pd.read_csv('bill_actions_seed.csv'))

offset = len(bills)   # bills already stored == how many listing entries to skip
n = 0                 # pages completed in this run
failures = 0          # consecutive failed attempts
pending = None        # formatted page still waiting for fill_tables; None -> fetch the next page

while True:
    try:
        if pending is None:
            data, _ = client.get(f'bill?format=xml&limit={LIMIT}&fromDateTime={START_DATE}'
                                 f'&toDateTime={END_DATE}&offset={offset}')
            bills_list = (xmltodict.parse(data)['api-root'].get('bills') or {}).get('bill') or []
            # xmltodict returns a bare mapping (not a list) when the page holds a single bill
            if not isinstance(bills_list, list):
                bills_list = [bills_list]
            if not bills_list:
                break  # nothing left in the date range
            bills_df = format_data(pd.DataFrame(bills_list))
            pending = bills_df

        # `actions` is not modified by fill_tables, so a retry starts from a clean table
        b, a = fill_tables(pending, actions)
        merged = pd.concat([bills, b], ignore_index=True)
        # update the csv's before committing the in-memory state
        merged.to_csv('congress_bills.csv', index=False)
        a.to_csv('bill_actions.csv', index=False)
        bills, actions = merged, a
        print('updated')

        # go again
        offset = len(bills)
        n += 1
        failures = 0
        reached_end = len(pending) < LIMIT  # a short page is the last one
        pending = None
        if reached_end:
            break
    except Exception as err:  # most likely the API request limit; KeyboardInterrupt is not caught
        failures += 1
        if failures >= MAX_CONSECUTIVE_FAILURES:
            raise
        print(f'page at offset {offset} failed ({err!r}); waiting 1 hour before retrying')
        time.sleep(RETRY_WAIT_SECONDS)
    

hit limit: waiting 1 hour


In [None]:
# Reload the accumulated bills table from disk.
bills = pd.read_csv('congress_bills.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   congress                250 non-null    int64 
 1   type                    250 non-null    object
 2   number                  250 non-null    int64 
 3   url                     250 non-null    object
 4   title                   250 non-null    object
 5   chamber                 250 non-null    object
 6   introduced              250 non-null    object
 7   sponsor                 250 non-null    object
 8   sponsor_party           250 non-null    object
 9   policy_area             44 non-null     object
 10  passed_senate           250 non-null    bool  
 11  passed_house            250 non-null    bool  
 12  to_presidentbecame_law  250 non-null    bool  
dtypes: bool(3), int64(2), object(8)
memory usage: 20.4+ KB
