## Gathering the Data
We'll be using the [congress.gov API](https://github.com/LibraryOfCongress/api.congress.gov) to request all of the data on bills introduced in the House of Representatives during the 117th Congress. 

In [None]:
from cdg_client import CDGClient

# Read the congress.gov API key from a local file.
# The contents are stripped because a trailing newline or stray whitespace in
# the file would otherwise be passed to the client as part of the key.
with open("congress_api.env", "r", encoding="utf-8") as f:
    api_key = f.read().strip()

# client used by every request in the cells below
client = CDGClient(api_key)

In [None]:
# Get information on HR 3684
# (sanity check that the client works: request one bill and print its title)
example_bill = client.get("bill/117/hr/3684")
# the first element of the response holds a dict whose "bill" entry contains the title
print(example_bill[0].get("bill").get("title"))

In [None]:
import json
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
from requests.exceptions import HTTPError
from tenacity import retry, wait_fixed, retry_if_exception_type, before_sleep_log
import logging
import sys
import requests
from time import sleep

In [None]:
# Send log records to stderr and let DEBUG-level messages through.
logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)
logger = logging.getLogger(__name__)

# Raise urllib3's threshold to WARNING so its debug logs are suppressed.
logging.getLogger("urllib3").setLevel(logging.WARNING)

def retry_log(logger, level):
    """Build a callback that logs retries of a function.

    The returned callable is meant to be handed to tenacity's ``retry``
    (this notebook passes it as ``before_sleep``).

    Args:
        logger: Object whose ``log(level, message)`` method receives the message.
        level: Logging level forwarded to ``logger.log``.

    Returns:
        A callable taking a retry state. It logs the wrapped function's name
        and the attempt number, but only when ``attempt_number`` is greater
        than 1; otherwise it does nothing.
    """
    def wrapper(retry_state):
        attempt = retry_state.attempt_number
        if attempt <= 1:
            return
        logger.log(level, f"Retrying {retry_state.fn.__name__} (attempt {attempt})")
    return wrapper

@retry(
    wait=wait_fixed(300),
    retry=retry_if_exception_type(HTTPError),
    before_sleep=retry_log(logger, logging.DEBUG),
)
def getBill(congress, chamber, number):
    """Request a single bill from the API through the notebook's ``client``.

    The request path is ``bill/{congress}/{chamber}/{number}``.
    When the call raises ``HTTPError`` it is retried after a fixed wait of 300;
    no stop condition is passed to ``retry``.

    Returns:
        Whatever ``client.get`` returns for that path.
    """
    endpoint = f"bill/{congress}/{chamber}/{number}"
    return client.get(endpoint)

@retry(
    wait=wait_fixed(300),
    retry=retry_if_exception_type(HTTPError),
    before_sleep=retry_log(logger, logging.DEBUG),
)
def getBillText(congress, chamber, number):
    """Request the text listing of a bill through the notebook's ``client``.

    The request path is ``bill/{congress}/{chamber}/{number}/text``.
    When the call raises ``HTTPError`` it is retried after a fixed wait of 300;
    no stop condition is passed to ``retry``.

    Returns:
        Whatever ``client.get`` returns for that path.
    """
    endpoint = f"bill/{congress}/{chamber}/{number}/text"
    return client.get(endpoint)

@retry(
    wait=wait_fixed(300),
    retry=retry_if_exception_type(HTTPError),
    before_sleep=retry_log(logger, logging.DEBUG),
)
def getBillSummary(congress, chamber, number):
    """Request the summaries of a bill through the notebook's ``client``.

    The request path is ``bill/{congress}/{chamber}/{number}/summaries``.
    When the call raises ``HTTPError`` it is retried after a fixed wait of 300;
    no stop condition is passed to ``retry``.

    Returns:
        Whatever ``client.get`` returns for that path.
    """
    endpoint = f"bill/{congress}/{chamber}/{number}/summaries"
    return client.get(endpoint)

@retry(
    wait=wait_fixed(300),
    retry=retry_if_exception_type(HTTPError),
    before_sleep=retry_log(logger, logging.DEBUG),
)
def getBillActions(congress, chamber, number):
    """Request the actions of a bill through the notebook's ``client``.

    The request path is ``bill/{congress}/{chamber}/{number}/actions``.
    When the call raises ``HTTPError`` it is retried after a fixed wait of 300;
    no stop condition is passed to ``retry``.

    Returns:
        Whatever ``client.get`` returns for that path.
    """
    endpoint = f"bill/{congress}/{chamber}/{number}/actions"
    return client.get(endpoint)

In [None]:
# 9709 bills were introduced by the House of the 117th congress
NUM_BILLS = 9709

# number of the first bill to request
start = 1

# Save the "bill" record of every H.R. bill from `start` to NUM_BILLS as its own JSON file.
with logging_redirect_tqdm():
    for bill_number in tqdm(range(start, NUM_BILLS + 1)):
        payload = getBill(117, "hr", bill_number)

        # output files are named after the bill number, zero-padded to five digits
        destination = f"BillData/JSONFiles/{bill_number:05d}.json"
        with open(destination, "w") as outfile:
            json.dump(payload[0].get("bill"), outfile)

In [None]:
NUM_BILLS = 9709
start = 1

# Download the "Formatted Text" version of every H.R. bill from `start` to
# NUM_BILLS and save it as BillData/BillText/<zero-padded number>.htm.
# Messages go through `logger` rather than print() because
# logging_redirect_tqdm() only redirects logging output around the progress bar.
with logging_redirect_tqdm():
    for i in tqdm(range(start, NUM_BILLS + 1)):
        sources = getBillText(117, "hr", i)

        # covers both an empty "textVersions" list and a missing key (None)
        text_versions = sources[0].get("textVersions")
        if not text_versions:
            logger.warning(f"Failed to retrieve the content. No text is available for H.R. {i}.")
            continue

        # only the first listed text version is used
        formats = text_versions[0].get("formats") or []
        url = next((item["url"] for item in formats if item["type"] == "Formatted Text"), None)

        # without this guard requests.get(None) would raise and abort the whole loop
        if url is None:
            logger.warning(f"Failed to retrieve the content. No formatted text is available for H.R. {i}.")
            continue

        response = requests.get(url)
        if response.status_code == 200:
            with open(f"BillData/BillText/{str(i).zfill(5)}.htm", "w", encoding="utf-8") as file:
                file.write(response.text)
        else:
            logger.warning(f"Failed to retrieve the content. Status code: {response.status_code}")

        # wait before sending more https requests
        sleep(2)

In [None]:
NUM_BILLS = 9709
start = 4944

# Save the text of the first listed summary of every H.R. bill from `start` to
# NUM_BILLS as BillData/BillSummaries/<zero-padded number>.htm.
with logging_redirect_tqdm():
    for i in tqdm(range(start, NUM_BILLS + 1)):
        summaries = getBillSummary(117, "hr", i)[0].get("summaries")

        # `not summaries` covers an empty list as well as a missing "summaries"
        # key (None), which len() would reject with a TypeError
        if not summaries:
            logger.debug(f"Failed to retrieve the content. No summary is available for H.R. {i}")
            continue

        summary = summaries[0].get("text")

        with open(f"BillData/BillSummaries/{str(i).zfill(5)}.htm", "w", encoding="utf-8") as file:
            file.write(summary)

        # pause between API requests
        sleep(1.4)

In [None]:
NUM_BILLS = 9709
start = 246

# Save the full actions response of every H.R. bill from `start` to NUM_BILLS as JSON.
with logging_redirect_tqdm():
    for bill_number in tqdm(range(start, NUM_BILLS + 1)):
        actions = getBillActions(117, "hr", bill_number)

        # output files are named after the bill number, zero-padded to five digits
        destination = f"BillData/BillActions/{bill_number:05d}.json"
        with open(destination, "w") as outfile:
            json.dump(actions, outfile)

        # pause between API requests
        sleep(0.5)

## Assembling the dataset

In [None]:
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import re
from pathlib import Path

In [None]:
# tokenizer data needed by nltk.word_tokenize (used in tokenize_text)
nltk.download('punkt')
# stop word lists needed by stopwords.words('english') (used in tokenize_text)
nltk.download('stopwords')

In [None]:
# get pretrained GloVe model - Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download)
!wget https://huggingface.co/stanfordnlp/glove/resolve/main/glove.42B.300d.zip
!tar -xf glove.42B.300d.zip

In [None]:
def get_clean_bill_summary(file_path):
    """Return a saved bill summary as lowercase text without punctuation.

    Args:
        file_path: Path of an HTML summary file; it is read as UTF-8.

    Returns:
        The text remaining after every ``<b>`` and ``<strong>`` element is
        removed, with the stripped text fragments joined by single spaces,
        lowercased, and every character that is neither a word character nor
        whitespace dropped.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # <b> and <strong> elements contain the bill title, which is not wanted here
    for tag_name in ('b', 'strong'):
        for element in soup.find_all(tag_name):
            element.decompose()

    text = " ".join(soup.stripped_strings).lower()

    # keep only word characters and whitespace
    return re.sub(r"[^\w\s]", "", text)

def get_clean_bill_title(file_path):
    """Return a bill's title as lowercase text without punctuation.

    Args:
        file_path: Path of a JSON file (read as UTF-8) whose top-level object
            has a ``"title"`` entry.

    Returns:
        The title lowercased, with every character that is neither a word
        character nor whitespace removed.

    Raises:
        KeyError: If the JSON object has no ``"title"`` entry.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        bill = json.load(file)

    lowered = bill["title"].lower()
    return re.sub(r"[^\w\s]", "", lowered)

def tokenize_text(text):
    """Split text into lowercase tokens and drop English stop words.

    Args:
        text: The string to tokenize; it is lowercased before tokenizing.

    Returns:
        A list of the tokens produced by ``nltk.word_tokenize`` that are not
        in NLTK's English stop word list, in their original order.
    """
    words = nltk.word_tokenize(text.lower())

    # a set gives constant-time membership tests while filtering
    english_stop_words = set(stopwords.words('english'))

    return [word for word in words if word not in english_stop_words]

# example usage: clean the saved summary file 00033.htm
get_clean_bill_summary("BillData/BillSummaries/00033.htm")


In [None]:
def did_bill_pass(file_path):
    """Flag whether a bill passed the House, based on its saved actions file.

    The check is a plain substring test: the file is read as UTF-8 text and
    the result is 1 when the word "Senate" occurs anywhere in it.
    NOTE(review): this presumably relies on only bills that left the House
    having actions that mention the Senate - confirm against the actions data.

    Args:
        file_path: Path of the bill's actions file.

    Returns:
        1 if the file contents contain "Senate", otherwise 0.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        contents = file.read()

    return 1 if "Senate" in contents else 0

# example usage: check the saved actions file 07776.json
did_bill_pass("BillData/BillActions/07776.json")

In [None]:
# testing to see how many bills are missing the "policyArea" attribute

total = 0
count = 0

for summary_path in tqdm(Path("BillData/BillSummaries").glob('*.htm')):
    # Path.stem is the file name without directory and extension; unlike
    # splitting on "\\" it works with any path separator
    bill_number = summary_path.stem

    with open(f"BillData/JSONFiles/{bill_number}.json", "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    # a bill counts as missing when either "policyArea" or its "name" is absent
    if "name" not in data.get("policyArea", {}):
        count += 1

    total += 1

# guard against dividing by zero when no summary files were found
if total:
    print(f"{count}/{total}={count/total}")
else:
    print("No summary files found in BillData/BillSummaries")
            

In [None]:
features = ["bill_id",
            "title",
            "summary",
            "sponsor_party",
            "sponsor_state",
            "sponsor_district",
            "policy_area",
            "subjects_count",
            "introduced_date",
            "passed_house"]

# Rows are collected in a list and turned into a DataFrame once at the end.
# Concatenating inside the loop copies the whole frame on every iteration and
# leaves every row with index 0; building it in one go gives a default 0..n-1 index.
rows = []

# iterate through htm files in the BillSummaries directory
for summary_path in tqdm(Path("BillData/BillSummaries").glob('*.htm')):
    # Path.stem is the file name without directory and extension; unlike
    # splitting on "\\" it works with any path separator
    bill_number = summary_path.stem

    summary = tokenize_text(get_clean_bill_summary(summary_path))

    # remove word "bill" if it is first word
    # (the truthiness check avoids an IndexError when no tokens are left)
    if summary and summary[0] == "bill":
        summary.pop(0)

    with open(f"BillData/JSONFiles/{bill_number}.json", "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    title = data["title"].lower()
    title = re.sub(r"[^\w\s]", "", title)
    title = tokenize_text(title)

    # only the first listed sponsor is used
    sponsor = data["sponsors"][0]
    sponsor_party = sponsor["party"]
    sponsor_state = sponsor["state"]

    introduced_date = data["introducedDate"]

    passed_house = did_bill_pass(f"BillData/BillActions/{bill_number}.json")

    # 5 bills are missing subject count
    try:
        subjects_count = data["subjects"]["count"]
    except (KeyError, TypeError):
        subjects_count = 1

    # 29 bills are missing the policyArea attribute
    try:
        policy_area = data["policyArea"]["name"]
    except KeyError:
        policy_area = "unknown"

    # some regions only have one district
    # if the bill is missing a sponsor district, it's likely the bill is from such a region
    sponsor_district = sponsor.get("district", 1)

    # add the row to the dataset (values in the same order as `features`)
    rows.append([int(bill_number),
                 title,
                 summary,
                 sponsor_party,
                 sponsor_state,
                 sponsor_district,
                 policy_area,
                 subjects_count,
                 introduced_date,
                 passed_house])

dataset = pd.DataFrame(rows, columns=features)

dataset

In [None]:
# One-hot encode the categorical columns, then parse the "YYYY-MM-DD"
# introduction dates into datetimes.
dataset_encoded = pd.get_dummies(
    dataset,
    columns=["sponsor_party", "sponsor_state", "policy_area"],
).assign(
    introduced_date=lambda frame: pd.to_datetime(frame["introduced_date"], format="%Y-%m-%d")
)

dataset_encoded

In [None]:
# start and end dates of 117th congress
# (taken as the earliest and latest introduction dates found in the dataset)
congress_start_date = dataset_encoded["introduced_date"].min()
congress_end_date = dataset_encoded["introduced_date"].max()

print(f"Congress Start Date: {congress_start_date}")
# this line reports the end date, so it is labelled as such
print(f"Congress End Date: {congress_end_date}")

In [None]:

# Calculate the total number of days in the congressional session
# (whole days between the earliest and latest introduction dates)
total_days = (congress_end_date - congress_start_date).days

def normalize_date(date, start_date, total_days):
    """Express a date as a fraction of the way through a period.

    Args:
        date: The date to normalize.
        start_date: First day of the period; maps to 0.
        total_days: Length of the period in days; a date that many days
            after ``start_date`` maps to 1.

    Returns:
        The number of whole days from ``start_date`` to ``date`` divided by
        ``total_days``.
    """
    elapsed = date - start_date
    return elapsed.days / total_days

# Replace each introduction date with its normalized position in the session
# (0 at congress_start_date, 1 at total_days later).
dataset_encoded['introduced_date'] = dataset_encoded['introduced_date'].apply(
    lambda introduced: normalize_date(introduced, congress_start_date, total_days)
)

dataset_encoded

In [None]:
# list the column names of the encoded dataset, including the one-hot columns
dataset_encoded.columns

In [None]:
# Feather is documented to require a default (0..n-1) index, so the index is
# reset before writing; this way the save does not depend on how the frame was assembled.
dataset_encoded.reset_index(drop=True).to_feather('117hrbills_encoded.feather')

In [None]:
# read the saved file back to check that it loads
df = pd.read_feather("117hrbills_encoded.feather")
df