Below is the code we used to scrape the title, URL, and poet of each poem from https://poets.org/poems. The website has 784 pages with 20 poems each.


In [4]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def _parse_poem_row(row):
    """Extract one poem record from a row of the poems listing table.

    Returns a dict with "title", "url" and "poet" keys, or None (after
    printing a warning) when the row has no usable title link.
    """
    # Search by the single distinguishing class. A multi-class string such as
    # 'views-field views-field-title' is compared against the whole class
    # attribute, so a reordered or additional class would make it miss.
    title_cell = row.find('td', class_='views-field-title')
    if not title_cell:
        print("Warning: Skipping a row due to missing title cell.")
        return None

    title_link = title_cell.find('a')
    href = title_link.get('href') if title_link else None
    if not href:
        # Covers a missing <a>, a missing href attribute, and an empty href.
        print("Warning: Skipping a poem due to missing title link.")
        return None

    poet_cell = row.find('td', class_='views-field-field-author')
    poet = poet_cell.text.strip() if poet_cell else ""

    return {
        "title": title_link.text.strip(),
        # urljoin leaves absolute hrefs intact and resolves relative ones
        # against the site root, unlike plain string concatenation.
        "url": urljoin("https://poets.org", href),
        # Same fallback whether the author cell is absent or just empty.
        "poet": poet or "Unknown Poet",
    }


def scrape_poets_org_multiple_pages(base_url, start_page=0, end_page=5):
    """Scrape poem titles, URLs and poets from paginated listing pages.

    Each page is requested as ``{base_url}?page={n}`` for every ``n`` from
    ``start_page`` through ``end_page`` inclusive. Pages that fail to download
    or that lack the expected listing markup are reported on stdout and
    skipped, so one bad page does not abort the whole run.

    Args:
        base_url: URL of the listing, without the ``page`` query parameter.
        start_page: Index of the first page to fetch.
        end_page: Index of the last page to fetch (inclusive).

    Returns:
        A list of dicts with the keys "title", "url" and "poet", in the order
        the poems were encountered. Empty if nothing could be scraped or if
        ``end_page`` is less than ``start_page``.
    """
    poems = []

    for page in range(start_page, end_page + 1):
        url = f"{base_url}?page={page}"

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL {url}: {e}")
            continue

        soup = BeautifulSoup(response.content, 'lxml')

        # Locate the section containing the poems
        poems_section = soup.find('div', id='block-views-block-poems-listing-table-block')
        if not poems_section:
            print(f"No poems section found on page {page}. Skipping...")
            continue

        # Locate the table containing the poems
        poems_table = poems_section.find('table', class_='cols-3')
        if not poems_table:
            print(f"No poems table found on page {page}. Skipping...")
            continue

        # A table without <tbody> is treated like an empty one rather than
        # raising AttributeError on None and killing the remaining pages.
        table_body = poems_table.find('tbody')
        poem_rows = table_body.find_all('tr') if table_body else []
        if not poem_rows:
            print(f"No poem rows found on page {page}. Skipping...")
            continue

        for row in poem_rows:
            poem = _parse_poem_row(row)
            if poem is not None:
                poems.append(poem)

    return poems


# Listing endpoint; the scraper appends a `page` query parameter to it.
base_url = "https://poets.org/poems"

# Fetch every listing page, indices 0 through 783 inclusive.
poems = scrape_poets_org_multiple_pages(base_url, start_page=0, end_page=783)

# Report the outcome: a numbered list of poems, or a notice if none were found.
if not poems:
    print("No poems were scraped.")
else:
    print("Scraped Poems:")
    for i, poem in enumerate(poems, start=1):
        print(f"{i}. Title: {poem['title']}, URL: {poem['url']}, Poet: {poem['poet']}")


Scraped Poems:
1. Title: A Line-storm Song, URL: https://poets.org/poem/line-storm-song, Poet: Robert Frost
2. Title: The Weary Blues, URL: https://poets.org/poem/weary-blues, Poet: Langston Hughes
3. Title: Morning in the Burned House, URL: https://poets.org/poem/morning-burned-house, Poet: Margaret Atwood
4. Title: On Living, URL: https://poets.org/poem/living, Poet: Nâzim Hikmet
5. Title: I Could Be a Whale Shark, URL: https://poets.org/poem/i-could-be-whale-shark, Poet: Aimee Nezhukumatathil
6. Title: Batter my heart, three person’d God (Holy Sonnet 14), URL: https://poets.org/poem/batter-my-heart-three-persond-god-holy-sonnet-14, Poet: John Donne
7. Title: Theme for English B, URL: https://poets.org/poem/theme-english-b, Poet: Langston Hughes
8. Title: The Negro Speaks of Rivers, URL: https://poets.org/poem/negro-speaks-rivers, Poet: Langston Hughes
9. Title: Throwing Children, URL: https://poets.org/poem/throwing-children, Poet: Ross Gay
10. Title: Poem [Lana Turner has collapsed