## Web Scraping with Playwright, Part 2

In [30]:
import asyncio
from playwright.async_api import async_playwright

async def fetch_varieties_html(url="https://www.applesfromny.com/varieties/", headless=False):
    """Load a page in Firefox via Playwright and return its rendered HTML.

    Args:
        url: Address of the page to load. Defaults to the New York apple
            varieties listing.
        headless: Passed to ``firefox.launch``; ``False`` (the default)
            opens a visible browser window.

    Returns:
        The page markup as returned by ``page.content()``.
    """
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=headless)
        try:
            page = await browser.new_page()

            print("Loading page...")
            await page.goto(url)

            # Get the rendered HTML
            return await page.content()
        finally:
            # Close the browser even when navigation or content retrieval raises.
            await browser.close()

# Drive the coroutine to completion and store the rendered markup in `html`.
html = asyncio.run(fetch_varieties_html())

print("HTML length:", len(html))
# Single call: sep="\n" supplies the line break between the heading and the preview.
print("\nPreview (first 1000 characters):\n", html[:1000], sep="\n")


Loading page...
HTML length: 724138

Preview (first 1000 characters):

<!DOCTYPE html><html lang="en-US" data-uw-w-loader=""><head>
		<meta charset="UTF-8">
		<meta name="viewport" content="width=device-width">
		<meta name="robots" content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1">
	<style>img:is([sizes="auto" i], [sizes^="auto," i]) { contain-intrinsic-size: 3000px 1500px }</style>
	
	<!-- This site is optimized with the Yoast SEO plugin v26.4 - https://yoast.com/wordpress/plugins/seo/ -->
	<title>Varieties Archive - New York Apple Association</title><link rel="preconnect" href="https://fonts.gstatic.com/" crossorigin=""><link rel="preconnect" href="https://fonts.googleapis.com"><style type="text/css">@font-face {
  font-family: 'Montserrat';
  font-style: italic;
  font-weight: 300;
  font-display: swap;
  src: url(https://fonts.gstatic.com/s/montserrat/v30/JTUFjIg1_i6t8kCHKm459Wx7xQYXK0vOoz6jq_p9aX8.ttf) format('truetype');
}
@font-face {
  font

In [33]:
from bs4 import BeautifulSoup
import re

# Hard-coded sample markup for a single apple variety ("20 Ounce"): an image
# with a sweet/tart scale widget, an <h1> name with a description paragraph,
# and four <h2> sections (PROFILE, BEST USES, SPECIAL HINT, AVAILABILITY).
html_snippet = """
<div class="container-fluid">
    <div class="image image-scale">
        <img src="https://www.applesfromny.com/wp-content/uploads/2020/05/20Ounce_NYAS-Apples2.png" alt="Apple Photo - 20 Ounce">
        <div class="scale">
            <div class="sweet-tart-scale">
                <div class="sweet">Sweet</div>
                <div class="scale-icon scale-7"></div>
                <div class="green-bg"></div>
                <div class="tart">Tart</div>
            </div>
        </div>
    </div>

    <h1>20 Ounce</h1>
    <p>This heirloom variety is famous not only for its size, but excellent cooking qualities as well.</p>

    <h2 class="wp-block-heading">PROFILE</h2>
    <p>Pale yellow flesh<br>Firm, tender, juicy and tart</p>

    <h2 class="wp-block-heading">BEST USES</h2>
    <p>Excellent for cooking<br>Best for pies, apple sauce and baking</p>

    <h2 class="wp-block-heading">SPECIAL HINT</h2>
    <p>It’s Grandma’s favorite apple for pies.</p>

    <h2 class="wp-block-heading">AVAILABILITY</h2>
    <p>September to November</p>
</div>
"""

soup = BeautifulSoup(html_snippet, "html.parser")

# Extract data
# Look the <h1> up once; both the name and the description hang off it.
h1_tag = soup.find("h1")
name = h1_tag.get_text(strip=True) if h1_tag else None

# find_next returns None when no <p> follows the heading, so guard it
# instead of calling get_text on None.
desc_tag = h1_tag.find_next("p") if h1_tag else None
main_desc = desc_tag.get_text(strip=True) if desc_tag else None

def extract_section_text(title, document=None):
    """Return the text of the paragraph that belongs to a titled <h2> section.

    Args:
        title: Pattern for the section heading. It is compiled as a
            case-insensitive regular expression and matched against the
            text of <h2> elements.
        document: Parsed tree to search (anything exposing ``find``). When
            omitted, the module-level ``soup`` is used, which keeps existing
            one-argument calls working.

    Returns:
        The paragraph text with inner pieces joined by newlines, or ``None``
        when the heading is missing or no <p> appears before the next <h2>.
    """
    if document is None:
        document = soup

    heading = document.find("h2", string=re.compile(title, re.I))
    if heading is None:
        return None

    # Ask for the next <p> OR <h2>, whichever comes first: if another heading
    # shows up before any paragraph, this section is empty and we must not
    # return the following section's text.
    following = heading.find_next(["p", "h2"])
    if following is None or following.name != "p":
        return None
    return following.get_text(separator="\n", strip=True)

# Pull the paragraph text for each titled section, in heading order.
profile, best_uses, special_hint, availability = (
    extract_section_text(section_title)
    for section_title in ("PROFILE", "BEST USES", "SPECIAL HINT", "AVAILABILITY")
)

# Image URL
# First <img> in the document; .get() yields None instead of raising
# KeyError when the tag carries no src attribute.
img_tag = soup.find("img")
image_url = img_tag.get("src") if img_tag else None

# Sweet/Tart scale
# The level is encoded as a CSS class of the form "scale-<number>" on the
# scale-icon element (e.g. "scale-7").
scale_icon = soup.find("div", class_=re.compile(r"scale-icon"))
sweet_tart_scale = None
if scale_icon:
    for cls in scale_icon.get("class", []):
        # Only accept a purely numeric suffix so classes such as "scale-icon"
        # or "scale-large" are skipped rather than crashing int().
        scale_match = re.fullmatch(r"scale-(\d+)", cls)
        if scale_match:
            sweet_tart_scale = int(scale_match.group(1))

# Store results
# Gather every extracted field into one record, keyed by column name.
apple_data = dict([
    ("name", name),
    ("main_description", main_desc),
    ("profile", profile),
    ("best_uses", best_uses),
    ("special_hint", special_hint),
    ("availability", availability),
    ("image_url", image_url),
    ("sweet_tart_scale", sweet_tart_scale),
])

print(apple_data)


{'name': '20 Ounce', 'main_description': 'This heirloom variety is famous not only for its size, but excellent cooking qualities as well.', 'profile': 'Pale yellow flesh\nFirm, tender, juicy and tart', 'best_uses': 'Excellent for cooking\nBest for pies, apple sauce and baking', 'special_hint': 'It’s Grandma’s favorite apple for pies.', 'availability': 'September to November', 'image_url': 'https://www.applesfromny.com/wp-content/uploads/2020/05/20Ounce_NYAS-Apples2.png', 'sweet_tart_scale': 7}


In [34]:
import pandas as pd

# One record per apple; only a single record exists so far, but further
# dicts can be appended to this list.
apple_list = [apple_data]

# Build the table from the list of records.
df = pd.DataFrame(data=apple_list)

# Show the table as plain text.
print(df)

# Write the table to disk without the index column, then confirm.
df.to_csv(path_or_buf="apples.csv", index=False)
print("Saved to apples.csv")


       name                                   main_description  \
0  20 Ounce  This heirloom variety is famous not only for i...   

                                           profile  \
0  Pale yellow flesh\nFirm, tender, juicy and tart   

                                           best_uses  \
0  Excellent for cooking\nBest for pies, apple sa...   

                              special_hint           availability  \
0  It’s Grandma’s favorite apple for pies.  September to November   

                                           image_url  sweet_tart_scale  
0  https://www.applesfromny.com/wp-content/upload...                 7  
Saved to apples.csv


In [35]:
# Display the leading rows of the table (head's default of 5, spelled out).
df.head(n=5)

Unnamed: 0,name,main_description,profile,best_uses,special_hint,availability,image_url,sweet_tart_scale
0,20 Ounce,This heirloom variety is famous not only for i...,"Pale yellow flesh\nFirm, tender, juicy and tart","Excellent for cooking\nBest for pies, apple sa...",It’s Grandma’s favorite apple for pies.,September to November,https://www.applesfromny.com/wp-content/upload...,7
