Playwright Scraping Notebook

This notebook demonstrates how to use Playwright to scrape AI news from Wired.
It directly interacts with the webpage to extract titles and URLs, then displays the results in a pandas DataFrame.

In [None]:
# --- Setup and Installation ---
# Install required libraries
!pip install playwright pandas -q
!playwright install chromium

In [None]:
# --- Imports ---
import asyncio
from urllib.parse import urljoin

import nest_asyncio
import pandas as pd
from IPython.display import display, HTML
from playwright.async_api import async_playwright

In [None]:
# Apply nest_asyncio to allow asynchronous operations in Jupyter.
# Run this once, before any cell that drives the scraper.
# NOTE(review): the cells below use top-level `await`; confirm whether this
# patch is still required in the target Jupyter/IPython version.
nest_asyncio.apply()

In [None]:
# --- Web Scraping Function ---
async def scrape_wired_news(url):
    """Scrape headline titles and article links from a Wired listing page.

    Args:
        url: Address of the listing page to open (e.g. the Wired AI tag page).

    Returns:
        A list of ``{'title': str, 'url': str}`` dicts, one for each
        ``.summary-item`` element that has both a headline and a headline link
        with a non-empty ``href``. Titles are stripped of surrounding
        whitespace and links are resolved to absolute URLs.
    """
    async with async_playwright() as p:
        # Launch the browser
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()

            # Navigate to the listing page
            await page.goto(url)

            results = []
            for item in await page.query_selector_all('.summary-item'):
                title_element = await item.query_selector('.summary-item__hed')
                link_element = await item.query_selector('.summary-item__hed-link')
                if not (title_element and link_element):
                    continue

                # Kept separate from the `url` parameter so the argument is
                # not overwritten while iterating.
                href = await link_element.get_attribute('href')
                if not href:
                    continue

                title = await title_element.inner_text()
                results.append({
                    'title': title.strip(),
                    # urljoin resolves relative hrefs against the loaded page
                    # and leaves already-absolute hrefs unchanged.
                    'url': urljoin(page.url, href),
                })

            return results
        finally:
            # Close the browser even if navigation or extraction raised.
            await browser.close()

In [None]:
# Scrape the news
news = await scrape_wired_news("https://www.wired.com/tag/artificial-intelligence/?page=1")

# Convert results to a pandas DataFrame
df = pd.DataFrame(news)

In [None]:
# Render the results as an HTML table with clickable links.
# - The titles are scraped from a web page, so pandas' default HTML escaping is
#   left on; render_links still wraps URL cells in <a> tags.
# - max_colwidth is lifted for this call because to_html otherwise truncates
#   long strings, which would cut off article URLs.
with pd.option_context("display.max_colwidth", None):
    display(HTML(df.to_html(render_links=True)))