In [1]:
%pip install requests keyring
from cdg_client import CDGClient
import xml.etree.ElementTree as ET
import xmltodict
import pandas as pd
import numpy as np

Note: you may need to restart the kernel to use updated packages.


In [2]:
API_KEY_PATH = '../../../api_key.txt'       # put your api_key path here
                                            # it shouldn't go to github
LIMIT = 250 
START_DATE = '2023-01-01T00:00:00Z'
END_DATE = '2025-10-11T00:00:00Z'
parse_xml = lambda data: ET.fromstring(data)

# get initial 250 bills
with open(API_KEY_PATH) as file:
    api_key = file.readline()
client = CDGClient(api_key, response_format='xml')

data, _ = client.get(f'bill?format=xml&limit={LIMIT}&fromDateTime={START_DATE}' \
                     f'&toDateTime={END_DATE}')
bills_list = xmltodict.parse(data)['api-root']['bills']['bill']
bills_df = pd.DataFrame(bills_list) # store bills into a data frame

# needs to be cleaned up
def format_data(data):
    int_cols = ['congress', 'number']
    str_cols = ['type', 'originChamberCode', 'url', 'title']
    drop_cols = ['updateDateIncludingText', 'updateDate', 'latestAction', 
                'originChamberCode', 'originChamber']
    data[int_cols] = data[int_cols].astype(int)
    data[str_cols] = data[str_cols].astype('string')
    data['chamber'] = data['originChamberCode'] # just store H or S 
    data.drop(columns=drop_cols, inplace=True)
    data['introduced'] = pd.NaT
    data[['sponsor', 'sponsor_party', 'policy_area']] = ''
    data[['passed_senate', 'passed_house', 'to_president', 'became_law']] = False
    return data

In [5]:

# need to do an api call for each bill to get more details
def fill_tables(bills, actions):
    for i in range(len(bills)):
        bill_url = bills.iloc[i]['url'][28:]
        bill_id = bill_url[5:-11].replace('/','-')
        bill_data, _ = client.get(bill_url)
        bill_dict = xmltodict.parse(bill_data)['api-root']['bill']
        bill_sponsors = bill_dict['sponsors']['item']

        
        
        bills.loc[i, 'introduced'] = pd.to_datetime(bill_dict['introducedDate'])
        bills.loc[i, 'sponsor'] = bill_sponsors['fullName']
        bills.loc[i, 'sponsor_party'] = bill_sponsors['party']
        bills.loc[i, 'policy_area'] = bill_dict.get('policyArea',{}).get('name','')
    
        # determine how far bill made it
        try:
            # handle bill actions
            actions_url = bill_dict['actions']['url'][28:]
            action_data, _ = client.get(actions_url)
            actions_dict = xmltodict.parse(action_data)['api-root']
            actions_list = actions_dict['actions']['item']
            actions_df = pd.DataFrame(actions_list)[['actionDate', 'text', 
                                                    'type', 'actionCode']]
            actions_df.rename(columns={'actionDate':'date', 'actionCode':'code'}, inplace=True)
            actions_df['bill_id'] = bill_id
            actions_df[['text', 'type', 'code']] = actions_df[['text', 'type', 'code']].astype('string')
            actions_df.bill_id = actions_df.bill_id.astype('string')
            actions = pd.concat([actions, actions_df], ignore_index=True) if not actions.empty else actions_df
            for action in actions_list:
                if ('Passed Senate' in action['text'] or 
                    'Passed/agreed to in Senate' in action['text']):
                    bills.loc[i, 'passed_senate'] = True
                if ('Passed House' in action['text'] or 
                    'Passed/agreed to in House' in action['text']):
                    bills.loc[i, 'passed_house'] = True
                if 'Presented to President' in action['text']:
                    bills.loc[i, 'to_president'] = True
                    bills.loc[i, 'passed_senate'] = True
                    bills.loc[i, 'passed_house'] = True
                if ('Became Public Law' in action['text'] or 
                    action['type'] == 'BecameLaw'):
                    bills.loc[i, 'became_law'] = True
                    bills.loc[i, 'to_president'] = True
                    bills.loc[i, 'passed_senate'] = True
                    bills.loc[i, 'passed_house'] = True
                break
        except:
            print('couldnt get all actions')
            continue
        bills[['sponsor', 'sponsor_party', 'policy_area']] = bills[['sponsor', 'sponsor_party', 'policy_area']].astype('string')
    return bills, actions

In [9]:
actions_df = pd.DataFrame(columns=['bill_id', 'date', 'text', 'type', 'code'])
b, a = fill_tables(bills_df, actions_df)

KeyboardInterrupt: 

In [None]:
b.to_csv('congress_bills_seed.csv', index=False)
a.to_csv('bill_actions_seed.csv', index=False)

In [4]:
import time

In [None]:
# now keep adding to the tables until reaching the start date (or hit max api requests)

bills, actions = pd.read_csv('congress_bills.csv'), pd.read_csv('bill_actions.csv')
offset = len(bills)

data, _ = client.get(f'bill?format=xml&limit={LIMIT}&fromDateTime={START_DATE}' \
                    f'&toDateTime={END_DATE}&offset={offset}')
bills_list = xmltodict.parse(data)['api-root']['bills']['bill']
bills_df = pd.DataFrame(bills_list)
n=0


while True:

    bills_df = format_data(bills_df)
    b, a = (fill_tables(bills_df, actions) if n==0 
            else fill_tables(bills_df, pd.read_csv('bill_actions.csv')))
    bills = pd.concat([bills, b], ignore_index=True)
    # update the csv's
    bills.to_csv('congress_bills.csv', index=False)
    a.to_csv('bill_actions.csv', index=False)
    print('updated')
    # go again
    offset = len(bills)
    try:
        data, _ = client.get(f'bill?format=xml&limit={LIMIT}&fromDateTime={START_DATE}'\
                        f'&toDateTime={END_DATE}&offset={offset}')
        bills_list = xmltodict.parse(data)['api-root']['bills']['bill']
        bills_df = pd.DataFrame(bills_list)
        n+=1
    except:
        print('encountered error: waiting 1min')
        time.sleep(60) 

updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
couldnt get all actions
couldnt get all actions
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
couldnt get all actions
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
updated
couldnt get all actions
updated
updated
updated
updated
updated
updated
couldnt get all actions
updated


In [24]:
df = pd.read_csv('congress_bills.csv')
df.passed_senate = df.passed_senate | df.became_law
df.passed_house = df.passed_house | df.became_law
df.to_president = df.to_president | df.became_law
df.passed_senate = df.passed_senate | df.to_president
df.passed_house = df.passed_house | df.to_president

In [25]:
df.to_csv('congress_bills.csv', index=False)

In [28]:
# percent of proposed bills which make no progress to becoming law
100 *len(df[~(df.passed_senate | df.passed_house | df.to_president | df.became_law)])/len(df)

98.21739130434783