# Congress House Bills Web Scraper

## Description

A web scraper for House bills (HBNXXXXX) listed on the Philippine House of Representatives website (congress.gov.ph).

## Imports and Dependencies

### Install Dependencies

In [11]:
!uv pip install -r requirements.txt

Audited 46 packages in 23ms


### Library Imports

In [2]:
from playwright.async_api import async_playwright, Page, Browser
from playwright_stealth import Stealth
from camoufox.async_api import AsyncCamoufox
import asyncio
import os
import json
import requests
import random
# Size of the shared semaphore below; presumably meant to cap how many
# scraping/download tasks run at once.
# NOTE(review): no visible cell acquires `semaphore` yet — confirm whether it
# is still needed or should be wired into the scraper.
MAX_CONCURRENCY = 5
semaphore = asyncio.Semaphore(MAX_CONCURRENCY)

## Helper and Utility Functions and Definitions

### File Class Object Definition

In [None]:
class File:
    """Metadata record for a single scraped House bill.

    Every value is stored exactly as passed in. In this notebook the fields are
    filled from the text of a bill card on the listing page; ``text_filed``
    receives the URL of the bill's document and ``is_file_downloadable``
    receives the result of the download attempt.

    Attributes:
        hbn: House bill number.
        main_title: Main title of the bill.
        session_number: Session number text.
        significance: Significance text.
        date_filed: Date-filed text.
        principal_authors: Principal authors text.
        date_read: Date-read text.
        primary_referral: Primary referral text.
        bill_status: Bill status text.
        text_filed: Link to the filed text of the bill.
        is_file_downloadable: Whether the linked file was downloaded.
    """

    def __init__(
        self,
        hbn: str,
        main_title: str,
        session_number: str,
        significance: str,
        date_filed: str,
        principal_authors: str,
        date_read: str,
        primary_referral: str,
        bill_status: str,
        text_filed: str,
        is_file_downloadable: bool,
    ):
        # No trailing commas after these assignments: `self.hbn = hbn,` would
        # store a one-element tuple, which then serializes as a JSON array.
        self.hbn = hbn
        self.main_title = main_title
        self.session_number = session_number
        self.significance = significance
        self.date_filed = date_filed
        self.principal_authors = principal_authors
        self.date_read = date_read
        self.primary_referral = primary_referral
        self.bill_status = bill_status
        self.text_filed = text_filed
        self.is_file_downloadable = is_file_downloadable

# Module-level accumulator of scraped bill records: get_files_from_page appends
# to it and the final cell dumps it to outputs/metadata.json.
files : list[File] = []

### JSON Encoder for File Object

In [18]:
def json_encoder(obj: File):
    """
    Converts a File object into a plain dictionary, for use as the `default` hook of json.dump

    Inputs:
    obj (File): object the JSON serializer could not encode by itself

    Outputs:
    Returns a dict mapping human-readable labels to the File's attribute values.
    The `is_file_downloadable` attribute is not part of the output.
    Raises TypeError if obj is not a File.
    """
    if not isinstance(obj, File):
        raise TypeError("Object is not JSON parsable.")

    # (output label, attribute name) pairs, in the order they are emitted
    labelled_attributes = (
        ('House Bill Number', 'hbn'),
        ('Main Title', 'main_title'),
        ('Session Number', 'session_number'),
        ('Significance', 'significance'),
        ('Date Filed', 'date_filed'),
        ('Principal Authors', 'principal_authors'),
        ('Date Read', 'date_read'),
        ('Primary Referral', 'primary_referral'),
        ('Bill Status', 'bill_status'),
        ('Text Filed', 'text_filed'),
    )
    return {label: getattr(obj, attribute) for label, attribute in labelled_attributes}

### Download File From URL Function

In [19]:
def download(url: str, dest_folder: str, timeout: float = 30.0) -> bool:
    """
    Downloads the file from the URL provided and places it in the destination folder provided

    Inputs:
    url (str): input URL of file
    dest_folder (str): destination folder/directory of downloaded file
    timeout (float): seconds to wait for the server before giving up (passed to requests.get)

    Outputs:
    Returns True if the download was successful, and False if not
    (HTTP 4XX/5XX status, or a network error raised by requests).
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(dest_folder, exist_ok=True)

    filename = url.split('/')[-1].replace(" ", "_")  # be careful with file names
    file_path = os.path.join(dest_folder, filename)
    print(f"URL: {url}")
    try:
        # The context manager releases the streamed connection even on early return;
        # the timeout keeps one unresponsive server from hanging the whole scrape.
        with requests.get(url, stream=True, timeout=timeout) as r:
            if not r.ok:  # HTTP status code 4XX/5XX
                return False
            with open(file_path, 'wb') as f:
                # Closing the file flushes it; forcing flush + fsync on every
                # 8 KiB chunk only slowed the download down.
                for chunk in r.iter_content(chunk_size=1024 * 8):
                    if chunk:
                        f.write(chunk)
    except requests.RequestException:
        # Connection errors, timeouts, etc. are reported as a failed download.
        return False
    return True

### Get Files from Current Page

In [27]:
async def get_files_from_page(hb_items):
    """
    Scrapes the metadata of every bill card given, downloads its linked file, and
    appends one File record per card to the module-level `files` list

    Inputs:
    hb_items: iterable of Playwright locators, one per bill card on the current page

    Outputs:
    Returns nothing; results are appended to `files`.
    """
    for hb_item in hb_items:
        # metadata things
        await hb_item.scroll_into_view_if_needed()

        # The first two spans of the card hold the bill number and the main title.
        header_spans = hb_item.locator('div span')
        hbn = await header_spans.first.inner_text()
        main_title = await header_spans.nth(1).inner_text()

        # The next seven values are read from consecutive detail rows, in this order.
        detail_rows = hb_item.locator('.grid.gap-1.px-5 .flex')
        (
            session_no,
            significance,
            date_filed,
            principal_authors,
            date_read,
            primary_referral,
            bill_status,
        ) = [await detail_rows.nth(row).inner_text() for row in range(7)]

        link = await hb_item.locator('.my-5.grid.grid-cols-1 a').first.get_attribute('href')
        if link is None:
            # Anchor without an href: nothing to download (download() would crash on None).
            is_file_downloadable = False
        else:
            # download() does blocking network and disk I/O; run it in a worker
            # thread so the event loop driving the browser is not stalled.
            is_file_downloadable = await asyncio.to_thread(download, link, 'outputs/')

        new_file = File(
            hbn,
            main_title,
            session_no,
            significance,
            date_filed,
            principal_authors,
            date_read,
            primary_referral,
            bill_status,
            link,
            is_file_downloadable
        )
        files.append(new_file)

## Scraper Stuff

### Proxy Server

In [6]:
# Proxy endpoint URL.
# NOTE(review): not referenced by any visible cell — the AsyncCamoufox launch
# does not pass it. Confirm whether it should be wired in or removed.
PROXY_SERVER = "http://84.17.47.149:9002"

### Actual Scraper

In [None]:
# Launches a visible (non-headless) Camoufox browser, opens the legislative
# documents page, expands the "20th Congress" section, and then walks the
# pagination links, scraping the bill cards of each page via get_files_from_page.
async with AsyncCamoufox(headless=False, geoip=True) as browser:
    context = await browser.new_context(
        viewport={"width":1280, "height":720}
    )
    page = await context.new_page()

    await page.goto("https://congress.gov.ph/legislative-documents/")
        
    # Wait (timeout=90000) until the "20th Congress" element is visible, i.e. the
    # real page content has rendered; presumably this also confirms that any
    # anti-bot interstitial was passed.
    await page.wait_for_selector('[id="20th Congress"]', state='visible', timeout=90000)    

    # Open 20th Congress Dropdown
    await page.locator('[id="20th Congress"]').click()
    # Scroll to the bottom, then wait for the pagination control to become visible.
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_selector('.pagination', state='visible')

    # NOTE(review): 'ul li' matches every <li> under any <ul> on the page, not only
    # the pagination entries; the index arithmetic below assumes the list is exactly
    # the pagination links — confirm, or scope the selector to the pagination element.
    pagination_links = await page.locator('ul li').all()
    # NOTE(review): last_pagination_link is never read in the visible code.
    last_pagination_link = await pagination_links[-2].inner_text()
    
    current_page = pagination_links[0]
    idx = 1
    # Each pass scrapes the cards currently shown, then clicks the pagination entry
    # at the next index; the loop ends once the entry just clicked reads 'Next'.
    # NOTE(review): the entry is clicked BEFORE its text is compared with 'Next', so
    # 'Next' itself gets clicked once before the loop exits.
    # NOTE(review): nothing waits for the new page's cards after the click. The
    # recorded output shows HB00001 and HB00002 being fetched again after the first
    # ten bills, which suggests stale cards were re-scraped — TODO confirm.
    while((await current_page.inner_text()) != 'Next'):
        hb_items = await page.locator('.cursor-pointer.rounded-sm.border').all()
        await get_files_from_page(hb_items)
        idx += 1
        # Re-query the links on every pass instead of reusing the earlier list.
        pagination_links = await page.locator('ul li').all()
        current_page = pagination_links[idx]
        await current_page.click()

    

    # --- Your scraping logic continues here ---
    

URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00001.pdf
URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00002.pdf
URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00003.pdf
URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00004.pdf
URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00005.pdf
URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00006.pdf
URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00007.pdf
URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00008.pdf
URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00009.pdf
URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00010.pdf
URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00001.pdf
URL: https://docs.congress.hrep.online/legisdocs/basic_20/HB00002.pdf


TargetClosedError: Locator.scroll_into_view_if_needed: Target page, context or browser has been closed

In [24]:
# Dump every collected File record to outputs/metadata.json; json_encoder turns
# each File into a plain dict that the JSON serializer can handle.
with open('outputs/metadata.json', 'w', encoding='utf-8') as metadata_file:
    json.dump(files, metadata_file, default=json_encoder, indent=4)