In [16]:
from playwright.async_api import async_playwright
import asyncio

async def open_page():
    async with async_playwright() as p:
        # Launch Firefox headlessly
        browser = await p.firefox.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        
        # Go to the TDLR search page
        await page.goto("https://www.tdlr.texas.gov/tools_search/", wait_until="networkidle")
        
        # Take a screenshot
        await page.screenshot(path="tdlr_search_page.png")
        print("Screenshot saved as tdlr_search_page.png")
        
        await browser.close()

# Run the async function
await open_page()



Screenshot saved as tdlr_search_page.png


In [17]:
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd
import asyncio

# Your list of ZIP codes
zip_codes = [
    "77002","77021","77040","77059","77078","77098","77384","77479","77547",
    "77003","77022","77041","77060","77079","77099","77385","77484","77571",
    # (add all remaining ZIP codes here)
]

# Function to open the page and return the page object
async def open_tdlr_page():
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto("https://www.tdlr.texas.gov/tools_search/", wait_until="networkidle")
        return page, browser

# Example usage: open page
page, browser = await open_tdlr_page()
print("TDLR search page loaded.")


Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed


TDLR search page loaded.


In [24]:
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import asyncio

zip_codes = ["77002"]  # test one ZIP code first

async def scrape_one_zip(zip_code):
    async with async_playwright() as p:
        # Launch headless Firefox
        browser = await p.firefox.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        
        # Open TDLR search page
        await page.goto("https://www.tdlr.texas.gov/tools_search/")
        
        # Fill the ZIP code field
        await page.fill("#zipcodedata", zip_code)
        
        # Click the search button and wait for the results page
        async with page.expect_navigation(timeout=30000):
            await page.click("#submit3")
        
        # Now the results page is loaded
        html = await page.content()
        
        # Parse with BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        
        # Print snippet to confirm
        print(soup.prettify()[:1000])
        
        await browser.close()

# Run the function
await scrape_one_zip(zip_codes[0])


<html>
 <head>
  <title>
   TDLR Tow Truck and Vehicle Storage Facility Inquiry
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="Microsoft Visual Studio" name="GENERATOR"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="Welcome to the Tow Truck and Vehicle Storage Facility Inquiry Information Page. 
				This web application allows users to obtain information on companies that have 
				obtained registration through TDLR. This includes addresses, insurance records, 
				recent activities, and vehicle data." name="description"/>
  <meta content="Tow Trucks, Vehicle Storage Facility, registration, insurance, Permit 
				Restrictions, Texas Department of Licensing and Regulation, TDLR" name="keywords"/>
  <meta content="Transportation" name="subject"/>
  <meta content="Programs and services" name="type"/>
  <meta content="Texas Department of Licensing and Regulation (State of Texas)" name="author"/>


In [27]:
# Adjusted scraping function to return HTML
async def scrape_one_zip(zip_code):
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        
        await page.goto("https://www.tdlr.texas.gov/tools_search/")
        await page.fill("#zipcodedata", zip_code)
        
        async with page.expect_navigation(timeout=30000):
            await page.click("#submit3")
        
        html = await page.content()  # return the full page HTML
        await browser.close()
        return html

# Run for one ZIP code and get HTML
html = await scrape_one_zip("77002")


In [28]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")


In [29]:
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd
import asyncio
from tqdm.notebook import tqdm  # progress bar

# List of ZIP codes
zip_codes = [
    "77002","77021","77040","77059","77078","77098","77384","77479","77547",
    "77003","77022","77041","77060","77079","77099","77385","77484","77571",
    "77004","77023","77042","77061","77080","77204","77386","77489","77573",
    "77005","77024","77043","77062","77081","77325","77388","77492","77574",
    "77006","77025","77044","77063","77082","77336","77389","77493","77578"
]

# Function to scrape HTML for one ZIP code
async def scrape_one_zip(zip_code):
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        
        await page.goto("https://www.tdlr.texas.gov/tools_search/")
        await page.fill("#zipcodedata", zip_code)
        
        async with page.expect_navigation(timeout=30000):
            await page.click("#submit3")
        
        html = await page.content()
        await browser.close()
        return html

# Function to parse HTML and extract companies
def parse_companies(html):
    soup = BeautifulSoup(html, "html.parser")
    companies = []

    # Extract Company Information tables
    company_tables = soup.find_all("table", attrs={"border": "0"})
    for table in company_tables:
        if table.find(text="Company Information:"):
            company_data = {}
            rows = table.find_all("tr")
            for row in rows:
                cols = row.find_all("td")
                if len(cols) >= 2:
                    left_text = cols[0].get_text(separator=" ", strip=True)
                    right_text = cols[1].get_text(separator=" ", strip=True)
                    if left_text.startswith("Name:"):
                        company_data["Company Name"] = left_text.replace("Name:", "").strip()
                    elif left_text.startswith("DBA:"):
                        company_data["DBA"] = left_text.replace("DBA:", "").strip()
                    elif left_text.startswith("Owner/Officer:"):
                        company_data["Owner/Officer"] = left_text.replace("Owner/Officer:", "").strip()
                    elif left_text.startswith("Phone:"):
                        company_data["Phone"] = left_text.replace("Phone:", "").strip()
            if company_data:  # only add if non-empty
                companies.append(company_data)

    # Extract Certificate Information table
    cert_table = soup.find("table", attrs={"border": "1"})
    if cert_table and companies:
        rows = cert_table.find_all("tr")
        for row in rows:
            cols = row.find_all("td")
            if len(cols) == 2:
                key = cols[0].get_text(strip=True)
                value = cols[1].get_text(strip=True)
                if "Status" in key:
                    companies[0]["Certificate Status"] = value
                elif "Number" in key:
                    companies[0]["Certificate Number"] = value
                elif "Carrier Type" in key:
                    companies[0]["Carrier Type"] = value
                elif "Number of Active Tow Trucks" in key:
                    companies[0]["Number of Active Tow Trucks"] = value
                elif "Mailing" in key:
                    companies[0]["Mailing Address"] = value
                elif "Physical" in key:
                    companies[0]["Physical Address"] = value

    return companies

# Main async function to scrape all ZIP codes
async def scrape_all_zipcodes(zip_codes):
    all_companies = []
    for zip_code in tqdm(zip_codes, desc="Scraping ZIP codes"):
        html = await scrape_one_zip(zip_code)
        companies = parse_companies(html)
        for company in companies:
            company["ZIP Code Searched"] = zip_code
        all_companies.extend(companies)
    return pd.DataFrame(all_companies)

# Run the scraping and save to CSV
df = await scrape_all_zipcodes(zip_codes)
df.to_csv("tdlr_companies.csv", index=False)
df.head()


Scraping ZIP codes:   0%|          | 0/45 [00:00<?, ?it/s]

In [30]:
# Save the DataFrame to CSV
df.to_csv("tdlr_companies.csv", index=False)


In [31]:
!pwd


/Users/towcenter/Desktop/python/homework/homework-10
