# Congress House Bills Web Scraper

## Description

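A web scraper that collects House bill metadata and bill PDFs (bill numbers of the form HBNXXXXX) from the legislative documents page of congress.gov.ph.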

## Imports and Dependencies

### Install Dependencies

In [2]:
!uv pip install -r requirements.txt

Audited 69 packages in 116ms


### Library Imports

In [31]:
from playwright.async_api import async_playwright, Page, Browser
from playwright_stealth import Stealth
from camoufox.async_api import AsyncCamoufox
import asyncio
import os
import json
import requests
import random
# Number of permits given to the semaphore created on the next line.
MAX_CONCURRENCY = 5
# NOTE(review): `semaphore` is not referenced by any other code visible in this
# notebook — confirm whether it is still needed.
semaphore = asyncio.Semaphore(MAX_CONCURRENCY)

## Helper and Utility Functions and Definitions

### File Class Object Definition

In [39]:
class File:
    """Metadata record for one House bill scraped from the listing page.

    Identity is defined by ``hbn`` alone: two records with the same House Bill
    Number compare equal and hash identically, so a ``set[File]`` keeps at most
    one record per bill regardless of the other fields.
    """

    def __init__(
            self,
            hbn: str,
            main_title: str,
            session_number: str,
            significance: str,
            date_filed: str,
            principal_authors: str,
            date_read: str,
            primary_referral: str,
            bill_status: str,
            text_filed: str,
            is_file_downloadable: bool
            ):
        """Store the scraped fields verbatim.

        Inputs:
        hbn (str): House Bill Number; the identity key of the record
        main_title (str): title of the bill
        session_number (str): value of the "Session No." field
        significance (str): value of the "Significance" field
        date_filed (str): value of the "Date Filed" field
        principal_authors (str): value of the "Principal Author/s" field
        date_read (str): value of the "Date Read" field
        primary_referral (str): value of the "Primary Referral" field
        bill_status (str): value of the "Bill Status" field
        text_filed (str): URL of the bill text, or a placeholder when absent
        is_file_downloadable (bool): whether the bill text could be downloaded
        """
        self.hbn = hbn
        self.main_title = main_title
        self.session_number = session_number
        self.significance = significance
        self.date_filed = date_filed
        self.principal_authors = principal_authors
        self.date_read = date_read
        self.primary_referral = primary_referral
        self.bill_status = bill_status
        self.text_filed = text_filed
        self.is_file_downloadable = is_file_downloadable

    def __repr__(self) -> str:
        return f"File(hbn={self.hbn!r}, main_title={self.main_title!r})"

    def __eq__(self, other):
        # Equality must stay free of side effects: sets call it on every hash
        # collision, so printing here would flood the output while scraping.
        if isinstance(other, File):
            return self.hbn == other.hbn
        # Let Python fall back to its default handling for foreign types
        # (which makes ``File == other_type`` evaluate to False).
        return NotImplemented

    def __hash__(self):
        # Must agree with __eq__, which compares only ``hbn``.
        return hash(self.hbn)


# Shared collection of scraped bills, de-duplicated by House Bill Number.
files: set[File] = set()

### JSON Encoder for File Object

In [33]:
def json_encoder(obj: File):
    """
    ``default`` hook for ``json.dump``: converts a File into a plain dict.

    Inputs:
    obj (File): the record to serialise

    Outputs:
    Returns a dict keyed by human-readable field labels.
    Raises TypeError for any object that is not a File.
    """
    # Reject unsupported objects up front so the mapping below stays flat.
    if not isinstance(obj, File):
        raise TypeError("Object is not JSON parsable.")

    # (output label, attribute name) pairs, in the order they are emitted.
    label_to_attribute = (
        ('House Bill Number', 'hbn'),
        ('Main Title', 'main_title'),
        ('Session Number', 'session_number'),
        ('Significance', 'significance'),
        ('Date Filed', 'date_filed'),
        ('Principal Authors', 'principal_authors'),
        ('Date Read', 'date_read'),
        ('Primary Referral', 'primary_referral'),
        ('Bill Status', 'bill_status'),
        ('Text Filed', 'text_filed'),
    )
    return {label: getattr(obj, attribute) for label, attribute in label_to_attribute}

### Download File From URL Function

In [34]:
def download(url: str, dest_folder: str, timeout: float = 30.0) -> bool:
    """
    Downloads the file from the URL provided and places it in the destination folder provided

    Inputs:
    url (str): input URL of file
    dest_folder (str): destination folder/directory of downloaded file
    timeout (float): seconds to wait for the server to connect/send data before giving up

    Outputs:
    Returns True if the download was successful, and False if not
    (HTTP error status, network failure, timeout, or a filesystem error).
    """
    # The last path segment becomes the local file name.
    filename = url.split('/')[-1].replace(" ", "_")  # be careful with file names
    if not filename:
        # URL ends with '/': there is no file name to write to.
        return False

    try:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(dest_folder, exist_ok=True)
        file_path = os.path.join(dest_folder, filename)

        # Without a timeout a stalled server would block the caller forever.
        # The context manager releases the connection on every exit path.
        with requests.get(url, stream=True, timeout=timeout) as r:
            if not r.ok:  # HTTP status code 4XX/5XX
                return False
            with open(file_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 8):
                    if chunk:
                        # Closing the file flushes it; forcing an fsync per
                        # 8 KiB chunk only slowed the download down.
                        f.write(chunk)
        return True
    except (requests.RequestException, OSError):
        # Best-effort contract: network and filesystem failures are reported as
        # False instead of being raised to the caller.
        return False

### Get Files from Current Page

In [35]:
async def _get_meta(hb_item, label):
    """Return the stripped text of the cell that follows *label* in a bill card, or "N/A"."""
    try:
        # Logic: Find the label div, then get the very next div sibling (+)
        # only if it is inside the grid container, so the lookup is scoped to
        # that container rather than the whole card.
        value_locator = hb_item.locator(".grid.gap-1.px-5") \
                               .locator(f"div:has-text('{label}') + div")

        text = await value_locator.first.inner_text(timeout=1000)
        return text.strip()
    except Exception:
        # A missing field is expected for some bills. Catching Exception (not
        # a bare except) lets task cancellation and Ctrl-C propagate.
        return "N/A"


async def get_files_from_page(hb_items_locator):
    """
    Scrapes every bill card matched by the locator and adds it to the global `files` set.

    For each card the bill number, title and metadata fields are read, the
    linked PDF (if any) is downloaded into "outputs/", and a File record is
    added to `files`.

    Inputs:
    hb_items_locator: locator matching the bill cards of the current page

    Outputs:
    Returns None; results are accumulated in the module-level `files` set.
    """
    count = await hb_items_locator.count()
    for i in range(count):
        hb_item = hb_items_locator.nth(i)

        # Trigger AOS animation
        await hb_item.scroll_into_view_if_needed()

        # 1. Capture Header Info
        # Using .inner_text() on the specific span to get a clean string
        hbn = (await hb_item.locator("span.rounded.border span span").first.inner_text()).strip()
        main_title = (await hb_item.locator("span.text-blue-500").first.inner_text()).strip()
        print(f"Index {i}: {hbn}")

        # 2. PDF link: use .first to avoid strictness errors when several match
        pdf_loc = hb_item.locator('a[href$=".pdf"]').first
        link = "N/A"
        if await pdf_loc.count() > 0:
            # Normalise a missing attribute to the same placeholder.
            link = (await pdf_loc.get_attribute('href')) or "N/A"

        downloadability = False
        if link != 'N/A':
            # download() does blocking network and disk I/O; running it in a
            # worker thread keeps the event loop (and the browser connection
            # it services) responsive while the file transfers.
            downloadability = await asyncio.to_thread(download, link, "outputs/")

        # 3. Build File Object (plain strings, not locator objects)
        new_file = File(
            hbn,
            main_title,
            await _get_meta(hb_item, "Session No. :"),
            await _get_meta(hb_item, "Significance :"),
            await _get_meta(hb_item, "Date Filed :"),
            await _get_meta(hb_item, "Principal Author/s :"),
            await _get_meta(hb_item, "Date Read :"),
            await _get_meta(hb_item, "Primary Referral :"),
            await _get_meta(hb_item, "Bill Status :"),
            link,
            downloadability  # Downloadable
        )
        files.add(new_file)

## Scraper Stuff

### Proxy Server

In [36]:
# Address of an HTTP proxy.
# NOTE(review): this constant is not referenced by the scraper code visible in
# this notebook (the browser is launched without a proxy argument) — confirm
# whether it should be wired in or removed.
PROXY_SERVER = "http://84.17.47.149:9002"

### Actual Scraper

#### File Metadata Reset

In [37]:
# Discard previously collected records. The scraper skips one listing page per
# 100 entries already in `files`, so an empty set makes it start at page 1.
files = set()

#### Scraper Code

In [1]:

try:
    async with AsyncCamoufox(headless=False, geoip=True) as browser:
        context = await browser.new_context(viewport={"width":1000, "height":500})
        page = await context.new_page()

        await page.goto("https://congress.gov.ph/legislative-documents/")
        
        # Wait for initial load
        await page.wait_for_selector('[id="20th Congress"]', state='visible', timeout=90000)
        
        # Set pagination to 100
        await page.locator("select.form-select").nth(1).select_option('100')    
        
        # Open the section
        await page.locator('[id="20th Congress"]').click()
        
        # Initial scroll and wait for first page items
        await page.wait_for_selector('.cursor-pointer.rounded-sm.border', state='visible')
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

        # Skip pages if needed based on files array
        idx = 1
        for i in range(0, int(len(files) / 100)):
            print(f"Skipping page {i}\n")
            old_bill_id = await page.locator(".cursor-pointer span.rounded.border span span").first.inner_text()
            next_button = page.locator('li.next:not(.disabled) a') # Specifically target the 'Next' link
            await next_button.click()
            try:
                await page.wait_for_function(
                    f"""() => {{
                        const el = document.querySelector(".cursor-pointer span.rounded.border span span");
                        return el && el.innerText.trim() !== "{old_bill_id.strip()}";
                    }}""",
                    timeout=15000 # 15 seconds is usually enough for a data swap
                )
            except:
                # Fallback if JS check fails: wait for network to settle
                await page.wait_for_load_state("networkidle")
            
            # Small buffer for the UI to stabilize
            await page.wait_for_timeout(3000)
            idx += 1

        while(idx <= 79):
            # 1. Scrape the current page
            hb_items_locator = page.locator('.cursor-pointer.rounded-sm.border')
            await get_files_from_page(hb_items_locator)
            
            print(f"Finished scraping page {idx}")

            # 2. Prepare for Page Turn
            next_button = page.locator('li.next:not(.disabled) a') # Specifically target the 'Next' link
            
            if await next_button.count() > 0:
                # Capture ID of the first item to track when the data actually changes
                old_bill_id = await page.locator(".cursor-pointer span.rounded.border span span").first.inner_text()
                
                # 3. Perform Click
                await next_button.click()

                # 4. Wait for Content Swap (Simplified to avoid TimeoutError)
                # We only wait for the text to be DIFFERENT from the old one.
                try:
                    await page.wait_for_function(
                        f"""() => {{
                            const el = document.querySelector(".cursor-pointer span.rounded.border span span");
                            return el && el.innerText.trim() !== "{old_bill_id.strip()}";
                        }}""",
                        timeout=15000 # 15 seconds is usually enough for a data swap
                    )
                except:
                    # Fallback if JS check fails: wait for network to settle
                    await page.wait_for_load_state("networkidle")
                
                # Small buffer for the UI to stabilize
                await page.wait_for_timeout(3000)
                idx += 1
            else:
                print("No more pages available.")
                break
except:
    print("Error occurred. Saving progress...")

# --- Processing logic (e.g., saving to JSON) ---
with open('outputs/metadata.json', mode='w', encoding='utf-8') as f:
    json.dump(
        obj=list(files),
        fp=f,
        default=json_encoder,
        indent=4
    )

Error occurred. Saving progress...


NameError: name 'json' is not defined