# Collecting statement data from Congress.gov Congressional Record API

In [4]:
import re
import requests
import datetime

import pandas as pd
from urllib.error import HTTPError
from credentials import CONGRESS_API_KEY #get API key from congress.gov

In [6]:
# Root endpoint of the Congress.gov Congressional Record API.
base_url = "https://api.congress.gov/v3/congressional-record/"


def get_cr_month(year, month, timeout=30):
    """Get a month's worth of Congressional Record entries from the API.

    Parameters
    ----------
    year : int
        Sent to the API as the ``y`` query parameter.
    month : int
        Sent to the API as the ``m`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.HTTPError
        If the API answers with a 4xx/5xx status (e.g. 429 when rate limited).
    requests.exceptions.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    param_dict = {
        "api_key": CONGRESS_API_KEY,
        "y": year,
        "m": month,
        # Assumes there are at most 250 records per month; no pagination is
        # performed, so anything beyond that limit would not be fetched.
        "limit": 250,
    }
    response = requests.get(url=base_url, params=param_dict, timeout=timeout)
    # Raises for error statuses and is a no-op otherwise, so the happy path
    # always returns the parsed body.
    response.raise_for_status()
    return response.json()

In [7]:
# Smoke test: fetch the January 2021 records and display the raw response.
ex = get_cr_month(year=2021, month=1)
ex

In [1]:
def extract_pdf_urls(response, congress_num):
    """Extract House and Senate PDF links from a Congressional Record API response.

    Parameters
    ----------
    response : dict
        Decoded JSON returned by the congressional record API. Issues are read
        from ``response['Results']['Issues']``; each issue is expected to carry
        a ``'Congress'`` value and a ``'Links'`` mapping.
    congress_num : int
        Only issues whose ``'Congress'`` value equals this number are used.

    Returns
    -------
    tuple of (list, list)
        ``(house_links, senate_links)``: the ``'Url'`` of every entry found
        under ``Links['House']['PDF']`` and ``Links['Senate']['PDF']``.
        ``([], [])`` is returned when the expected top-level keys are missing.
    """
    house_links, senate_links = [], []
    try:
        link_sections = [
            issue['Links']
            for issue in response['Results']['Issues']
            if int(issue['Congress']) == congress_num
        ]
    except (KeyError, IndexError):
        return [], []

    for link_section in link_sections:
        # Look each chamber up independently: an issue may lack one chamber's
        # entry (presumably when only the other chamber was in session), and
        # that gap must not discard the links of the chamber that is present.
        for chamber, bucket in (('House', house_links), ('Senate', senate_links)):
            try:
                # Build the full list first so a malformed entry adds nothing
                # rather than a partial set of links.
                chamber_urls = [link['Url'] for link in link_section[chamber]['PDF']]
            except KeyError:
                continue
            bucket.extend(chamber_urls)
    return house_links, senate_links

In [9]:
# Smoke test: pull the 117th Congress PDF links out of the sample response.
house, senate = extract_pdf_urls(response=ex, congress_num=117)
senate

In [13]:
def get_congress_years(congress_num, timeout=30):
    """Get the start and end years for the congress_num-th congress.

    Parameters
    ----------
    congress_num : int
        Number of the congress, interpolated into the ``/v3/congress/{n}/`` URL.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    tuple of (int, int)
        ``(start_year, end_year)`` parsed from the ``'startYear'`` and
        ``'endYear'`` fields of the response's ``'congress'`` object.

    Raises
    ------
    requests.exceptions.HTTPError
        If the API answers with a 4xx/5xx status (e.g. 429 when rate limited).
    requests.exceptions.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    url = f"https://api.congress.gov/v3/congress/{congress_num}/"
    param_dict = {
        "api_key": CONGRESS_API_KEY,
    }
    response = requests.get(url=url, params=param_dict, timeout=timeout)
    # Raises for error statuses and is a no-op otherwise.
    response.raise_for_status()
    this_congress = response.json()['congress']
    return int(this_congress['startYear']), int(this_congress['endYear'])

In [12]:
def _issue_date_from_url(pdf_url):
    """Return the issue date embedded in a Congressional Record PDF URL.

    The links look like ``https://www.congress.gov/117/crec/2021/01/03/...``;
    the ``/crec/yyyy/mm/dd/`` segment is parsed into a ``pd.Timestamp``.
    ``pd.NaT`` is returned when no such segment is present or when its digits
    do not form a valid calendar date.
    """
    match = re.search(r"/crec/(\d{4})/(\d{2})/(\d{2})/", str(pdf_url))
    if match is None:
        return pd.NaT
    year, month, day = (int(part) for part in match.groups())
    try:
        return pd.Timestamp(year=year, month=month, day=day)
    except ValueError:
        return pd.NaT


def congress_cr_df(congress_num):
    """Query, extract, and store the CR PDF links of the congress_num-th congress.

    One request is made per calendar month between January of the congress's
    start year and December of its end year. Each PDF link becomes one row,
    dated from the ``yyyy/mm/dd`` segment of the link itself.

    Parameters
    ----------
    congress_num : int
        Number of the congress to collect records for.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``congress``, ``chamber`` (``'house'`` or
        ``'senate'``) and ``pdf_url``, ordered by date and then chamber, with
        a fresh 0..n-1 index. Rows whose link carries no parseable date have
        ``NaT`` in ``date`` and sort last.

    Raises
    ------
    Any exception raised by ``get_congress_years`` or ``get_cr_month`` (for
    instance an HTTP error from the API) propagates unchanged.
    """
    start_year, end_year = get_congress_years(congress_num)
    # NOTE(review): months after end_year are not queried, so a closing
    # session held early in the following January would be missed -- confirm
    # whether those issues are needed.
    cr_records = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            # get_cr_month returns the whole month, so it is called once per
            # month. Calling it once per day repeats the same request ~30
            # times, which wastes the rate limit and duplicates every link
            # under every day of the month.
            response = get_cr_month(year, month)
            house_urls, senate_urls = extract_pdf_urls(response, congress_num)
            for chamber, urls in (('house', house_urls), ('senate', senate_urls)):
                cr_records.extend(
                    {
                        'date': _issue_date_from_url(pdf_url),
                        'congress': congress_num,
                        'chamber': chamber,
                        'pdf_url': pdf_url,
                    }
                    for pdf_url in urls
                )

    # Explicit columns keep the schema stable even when nothing was found.
    cr_df = pd.DataFrame(cr_records, columns=['date', 'congress', 'chamber', 'pdf_url'])
    if cr_df.empty:
        return cr_df
    # Order by day, house before senate, as a day-by-day collection would.
    return cr_df.sort_values(['date', 'chamber']).reset_index(drop=True)


In [88]:
# Build the table of Congressional Record PDF links for the 117th Congress.
df117 = congress_cr_df(congress_num=117)

HTTPError: 429 Client Error: Too Many Requests for url: https://api.congress.gov/v3/congress/117/?api_key=<REDACTED>

In [80]:
# NOTE: the very first entry of every other year is a session that closes out
# the previous Congress; its link follows the pattern yyyy/mm/dd/CREC...
df117.head()

Unnamed: 0,date,chamber,pdf_url
0,2021-01-03,house,https://www.congress.gov/117/crec/2021/01/03/1...
1,2021-01-03,house,https://www.congress.gov/117/crec/2021/01/03/C...
2,2021-01-03,senate,https://www.congress.gov/117/crec/2021/01/03/1...
3,2021-01-03,senate,https://www.congress.gov/117/crec/2021/01/03/C...
4,2021-01-04,house,https://www.congress.gov/117/crec/2021/01/04/C...


In [None]:
# Persist the collected links. pandas has no top-level `pd.to_csv`; saving is a
# DataFrame method. A pickle is written because this same path is loaded back
# with `pd.read_pickle` further down, and it keeps the `date` column as
# timestamps.
df117.to_pickle("cr_117")

Perform spot checks on the PDFs: take a small sample from a broad time range and make sure the formatting is consistent across them.

In [19]:
# Reload the saved table and discard the rows labelled 1 and 3 before previewing it.
df117 = pd.read_pickle("cr_117").drop(index=[1, 3])
df117.head()

Unnamed: 0,date,chamber,pdf_url
0,2021-01-03,house,https://www.congress.gov/117/crec/2021/01/03/1...
2,2021-01-03,senate,https://www.congress.gov/117/crec/2021/01/03/1...
4,2021-01-04,house,https://www.congress.gov/117/crec/2021/01/04/C...
5,2021-01-05,house,https://www.congress.gov/117/crec/2021/01/05/C...
6,2021-01-06,house,https://www.congress.gov/117/crec/2021/01/06/1...


In [24]:
# Spot check: download the first few PDFs so their formatting can be inspected by hand.
SPOT_CHECK_SIZE = 5
for row in df117.head(SPOT_CHECK_SIZE).itertuples():
    pdf_response = requests.get(row.pdf_url, timeout=60)
    # Fail loudly rather than saving an HTML error page under a .pdf name.
    pdf_response.raise_for_status()
    # Open the file only once the download succeeded, so a failed request
    # does not leave an empty file behind.
    with open(f'{row.Index}.pdf', 'wb') as pdf:
        pdf.write(pdf_response.content)
    print(row.date.date())

2021-01-03
2021-01-03
2021-01-04
2021-01-05
2021-01-06
