In [None]:
!pip install pandas
!pip install tqdm
!pip install tenacity
!pip install beautifulsoup4
!pip install nltk

## Gathering the Data
We'll be using the [congress.gov API](https://github.com/LibraryOfCongress/api.congress.gov) to request all of the data on bills introduced in the House of Representatives during the 117th Congress. 

In [1]:
from cdg_client import CDGClient

with open("congress_api.env", "r") as f:
    api_key = f.read()

client = CDGClient(api_key)

In [2]:
# Get information on HR 3684
example_bill = client.get("bill/117/hr/3684")
print(example_bill[0].get("bill").get("title"))

Infrastructure Investment and Jobs Act


In [40]:
import json
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
from requests.exceptions import HTTPError
from tenacity import retry, wait_fixed, retry_if_exception_type, before_sleep_log
import logging
import sys
import requests
from time import sleep

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
logger = logging.getLogger(__name__)

# suppress debug logs from urllib3
logging.getLogger("urllib3").setLevel(logging.WARNING)

# custom logging function that logs retries
def retry_log(logger, level):
    def wrapper(retry_state):
        if retry_state.attempt_number > 1:
            logger.log(level, f"Retrying {retry_state.fn.__name__} (attempt {retry_state.attempt_number})")
    return wrapper

# wait 5 minutes and try again if an exception is raised (rate limited to 5000 API calls per hour)
@retry(wait=wait_fixed(300), retry=retry_if_exception_type(HTTPError), before_sleep=retry_log(logger, logging.DEBUG))
def getBill(congress, chamber, number):
    return client.get(f"bill/{congress}/{chamber}/{number}")

@retry(wait=wait_fixed(300), retry=retry_if_exception_type(HTTPError), before_sleep=retry_log(logger, logging.DEBUG))
def getBillText(congress, chamber, number):
    return client.get(f"bill/{congress}/{chamber}/{number}/text")

@retry(wait=wait_fixed(300), retry=retry_if_exception_type(HTTPError), before_sleep=retry_log(logger, logging.DEBUG))
def getBillSummary(congress, chamber, number):
    return client.get(f"bill/{congress}/{chamber}/{number}/summaries")

In [5]:
NUM_BILLS = 9709# 9709 bills were introduced by the House of the 117th congress

start = 1

with logging_redirect_tqdm():
    for i in tqdm(range(start, NUM_BILLS + 1)):
        bill = getBill(117, "hr", i)    

        with open(f"BillData/JSONFiles/{str(i).zfill(5)}.json", "w") as outfile: 
            json.dump(bill[0].get("bill"), outfile)

 30%|██▉       | 2872/9709 [15:00<35:30,  3.21it/s]  

In [39]:
NUM_BILLS = 9709
start = 1

with logging_redirect_tqdm():
    for i in tqdm(range(start, NUM_BILLS + 1)):
        sources = getBillText(117, "hr", i)

        try:
            formats = sources[0].get("textVersions")[0].get("formats")
        except IndexError:
            print(f"Failed to retrieve the content. No text is available for H.R. {i}.")
            continue

        url = next((item["url"] for item in formats if item["type"] == "Formatted Text"), None)

        response = requests.get(url)
        if response.status_code == 200:
            with open(f"BillData/BillText/{str(i).zfill(5)}.htm", "w", encoding="utf-8") as file:
                file.write(response.text)
        else:
            print(f"Failed to retrieve the content. Status code: {response.status_code}")
        
        # wait before sending more https requests
        sleep(2)

  0%|          | 2/9709 [00:02<3:03:45,  1.14s/it]

Failed to retrieve the content. No text is available for H.R. 2.


  0%|          | 9/9709 [00:16<4:18:02,  1.60s/it]

Failed to retrieve the content. No text is available for H.R. 9.


  0%|          | 10/9709 [00:16<3:09:02,  1.17s/it]

Failed to retrieve the content. No text is available for H.R. 10.


  0%|          | 12/9709 [00:17<1:49:07,  1.48it/s]

Failed to retrieve the content. No text is available for H.R. 11.
Failed to retrieve the content. No text is available for H.R. 12.


  0%|          | 13/9709 [00:17<1:26:11,  1.87it/s]

Failed to retrieve the content. No text is available for H.R. 13.


  0%|          | 14/9709 [00:17<1:10:01,  2.31it/s]

Failed to retrieve the content. No text is available for H.R. 14.


  0%|          | 16/9709 [00:18<51:32,  3.13it/s]  

Failed to retrieve the content. No text is available for H.R. 15.
Failed to retrieve the content. No text is available for H.R. 16.


  0%|          | 17/9709 [00:18<46:26,  3.48it/s]

Failed to retrieve the content. No text is available for H.R. 17.


  0%|          | 20/9709 [00:23<2:38:19,  1.02it/s]

Failed to retrieve the content. No text is available for H.R. 20.


100%|██████████| 9709/9709 [6:39:28<00:00,  2.47s/it]  


In [6]:
NUM_BILLS = 9709
start = 4944

with logging_redirect_tqdm():
    for i in tqdm(range(start, NUM_BILLS + 1)):
        response = getBillSummary(117, "hr", i)[0].get("summaries")

        if len(response) == 0:
            logging.debug(f"Failed to retrieve the content. No summary is available for H.R. {i}")
            continue

        summary = response[0].get("text")
        
        with open(f"BillData/BillSummaries/{str(i).zfill(5)}.htm", "w", encoding="utf-8") as file:
            file.write(summary)
        
        sleep(1.4)

100%|██████████| 4766/4766 [2:10:04<00:00,  1.64s/it]  


In [4]:
getBillSummary(117, "hr", 4944)[0].get("summaries")[0]

{'actionDate': '2021-08-06',
 'actionDesc': 'Introduced in House',
 'text': " <p><strong>Helping Kids Cope Act of 2021</strong></p> <p>This bill provides funding through FY2026 to the Health Resources and Services Administration (HRSA) for grants to enhance access to and provider training in pediatric behavioral health care.</p> <p>Specifically, HRSA must award grants to pediatricians, children's hospitals, and other providers to support pediatric behavioral health integration and coordination within communities. Grant-funded activities may include hiring community navigators to assist families in accessing appropriate services, incorporating behavioral health services in pediatric practices, and delivering services via telehealth.</p> <p>In addition, HRSA must award grants to children's hospitals to expand training for providers in the pediatric behavioral health workforce.</p>",
 'updateDate': '2022-02-03T22:40:03Z',
 'versionCode': '00'}

## Assembling the dataset

In [106]:
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import re
from pathlib import Path

In [17]:
def clean_htm(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file)

        # remove the title
        soup.b.decompose()

        return ' '.join(soup.stripped_strings)

# example usage
clean_htm("BillData/BillSummaries/02132.htm")

'This bill eliminates the exemption of waste associated with the exploration, development, or production of crude oil, natural gas, or geothermal energy from regulations governing the disposal of hazardous waste. Within a year of enactment, the Environmental Protection Agency (EPA) must determine whether such waste meets the criteria for hazardous waste, promulgate regulations concerning the disposal of such waste if the EPA determines it is hazardous, and revise regulations applicable to solid waste management and disposal facilities (e.g., landfills) to address such waste that is nonhazardous.'

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

In [34]:
# get pretrained GloVe model - Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download)
!wget https://huggingface.co/stanfordnlp/glove/resolve/main/glove.42B.300d.zip
!tar -xf glove.42B.300d.zip

--2024-07-21 11:48:26--  https://huggingface.co/stanfordnlp/glove/resolve/main/glove.42B.300d.zip
Resolving huggingface.co... 18.160.102.86, 18.160.102.96, 18.160.102.4, ...
Connecting to huggingface.co|18.160.102.86|:443... connected.
OpenSSL: error:140773E8:SSL routines:SSL23_GET_SERVER_HELLO:reason(1000)
Unable to establish SSL connection.
tar: Error opening archive: Failed to open 'glove.42B.300d.zip'


In [138]:
def get_clean_bill_summary(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file)

        # remove the title
        soup.b.decompose()

        # convert to string and lowercase
        summary = " ".join(soup.stripped_strings)
        summary = summary.lower()

        # drop all punctuation
        summary = re.sub(r"[^\w\s]", "", summary)

    return summary

def get_clean_bill_title(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

        title = data["title"].lower()
        title = re.sub(r"[^\w\s]", "", title)

    return title

def tokenize_text(text):
    # tokenize text
    tokens = nltk.word_tokenize(text.lower())

    # get rid of stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    return tokens

# example usage
tokenize_text(get_clean_bill_title("BillData/JSONFiles/02132.json"))


['closing',
 'loopholes',
 'ending',
 'arbitrary',
 'needless',
 'evasion',
 'regulations',
 'act',
 '2021']

In [133]:
with open("BillData/JSONFiles/07776.json", "r", encoding="utf-8") as file:
    data = json.load(file)
    print(data)

{'actions': {'count': 51, 'url': 'https://api.congress.gov/v3/bill/117/hr/7776/actions?format=json'}, 'amendments': {'count': 21, 'url': 'https://api.congress.gov/v3/bill/117/hr/7776/amendments?format=json'}, 'cboCostEstimates': [{'description': 'As Posted on the Website of the House Committee on Rules on June 6, 2022\n', 'pubDate': '2022-06-08T20:37:00Z', 'title': 'CBO’s Estimate of the Statutory Pay-As-You-Go Effects of H.R. 7776, the Water Resources Development Act of 2022', 'url': 'https://www.cbo.gov/publication/58193'}], 'committeeReports': [{'citation': 'H. Rept. 117-347', 'url': 'https://api.congress.gov/v3/committee-report/117/HRPT/347?format=json'}], 'committees': {'count': 1, 'url': 'https://api.congress.gov/v3/bill/117/hr/7776/committees?format=json'}, 'congress': 117, 'constitutionalAuthorityStatementText': '<pre>\n[Congressional Record Volume 168, Number 83 (Monday, May 16, 2022)]\n[House]\nFrom the Congressional Record Online through the Government Publishing Office [<a 

In [140]:


def get_sponsor_party(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
        party = data["sponsors"][0]["party"]

    return party

def get_sponsor_state(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
        state = data["sponsors"][0]["state"]

    return state

def get_sponsor_district(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
        district = data["sponsors"][0]["district"]

    return district

def get_policy_area(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
        policy_area = data["policyArea"]["name"]

    return policy_area

def get_subjects_count(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
        count = data["subjects"]["count"]

    return count

def get_introduction_date(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
        date = data["introducedDate"]

    return date 

def get_latest_action(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
        latest_action = data["latestAction"]["text"]

    return latest_action 


get_latest_action("BillData/JSONFiles/02324.json")

'Became Public Law No: 117-153.'

In [115]:
features = ["bill_id", 
            "title", 
            "summary", 
            "sponsor_party", 
            "sponsor_state",
            "sponsor_district",
            "policy_area", 
            "subjects_count", 
            "introduced_date",
            "latest_action"]

dataset = pd.DataFrame(columns = features)

#row = pd.DataFrame([[i for i in range(10)]], columns = features)
#for i in range(10):
#    dataset = pd.concat([dataset, row], ignore_index=True)
#dataset

In [141]:
# iterate through htm files in the BillSummaries directory
for file in Path("BillData/BillSummaries").glob('*.htm'):
    bill_number = str(file).split("\\")[-1][:-4]
    
    summary = get_clean_bill_summary(file)

    with open(f"BillData/JSONFiles/{bill_number}.json", "r", encoding="utf-8") as file:
        
    
    # add the row to the dataset
    new_row = pd.DataFrame([[int(bill_number,
                                 title,
                                 summary,
                                 sponsor_party,
                                 sponsor_state,
                                 sponsor_district,
                                 policy_area,
                                 subjects_count,
                                 introduced_date,
                                 latest_action)]], columns = features)
    
    dataset = pd.concat([dataset, new_row])
    break

SyntaxError: invalid syntax (2218664886.py, line 7)