In [3]:
import pandas as pd
import os
import requests
import time
import random
import re

from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
def extract_poem_links(driver, page_idx, delay_range=(3, 5)):
    """Collect poem titles and URLs from one page of the thivien.net poem search.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        page_idx: Value substituted into the ``Page`` query parameter.
        delay_range: ``(low, high)`` bounds, in seconds, of the random pause
            taken after the page is requested. Defaults to ``(3, 5)``.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts, one per list item whose
        header link could be read and carries a non-empty ``href``.
    """
    main_url = f"https://www.thivien.net/searchpoem.php?PoemType=16&ViewType=1&Country=2&Age[]=3&Page={page_idx}"
    driver.get(main_url)
    # Randomised pause after each page request.
    time.sleep(random.uniform(*delay_range))

    content_tags_xpath = '//*[@class="page-content container"]//div[@class="page-content-main"]//div[@class="list-item"]'
    poem_links = []
    for tag in driver.find_elements(By.XPATH, content_tags_xpath):
        try:
            link_element = tag.find_element(By.XPATH, './/h4[@class="list-item-header"]/a')
            poem_title = link_element.text
            poem_url = link_element.get_attribute("href")
        except Exception as e:
            # Best effort: one malformed list item must not abort the page.
            print(f"Error extracting link: {e}")
            continue
        if not poem_url:
            # An anchor without an href gives the caller nothing to visit;
            # storing url=None would only fail later in driver.get().
            continue
        poem_links.append({"title": poem_title, "url": poem_url})
    return poem_links

In [4]:
def clean_poem_html(html):
    """Reduce a poem's inner HTML to plain text, keeping title markers.

    Steps, in order:
      1. drop ``<img>`` tags;
      2. drop ``<i>...</i>`` spans together with their content;
      3. unwrap ``<b>...</b>`` unless the closing tag is followed by two or
         more ``<br>`` tags (such a bold run is kept as a title marker);
      4. turn every ``<br>`` into a newline;
      5. drop ``<p>`` / ``</p>`` tags, with or without attributes.

    Args:
        html: Markup string to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
        Title markers remain as literal ``<b>title</b>``.
    """
    html = re.sub(r"<img.*?>", "", html, flags=re.IGNORECASE)
    html = re.sub(r"<i>.*?</i>", "", html, flags=re.IGNORECASE | re.DOTALL)
    # The bold body is not allowed to run across another <b> or </b>.
    # With a plain lazy `.*?`, a title rejected by the lookahead made the
    # regex backtrack and pair the title's <b> with a *later* </b> on the
    # same line, which stripped the title's opening tag and lost the marker.
    html = re.sub(
        r"<b>((?:(?!</?b>).)*)</b>(?!\s*(?:<br\s*/?>\s*){2,})",
        r"\1",
        html,
        flags=re.IGNORECASE,
    )
    html = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
    # `\b[^>]*` also removes <p class="...">; `\b` keeps <pre> and the like.
    html = re.sub(r"</?p\b[^>]*>", "", html, flags=re.IGNORECASE)
    return html.strip()

In [5]:
def process_poem_content(html, poem_src, poem_url, default_title=""):
    """Turn a poem page's HTML into one record per poem.

    The markup is cleaned with ``clean_poem_html``. Every ``<b>title</b>``
    followed by at least two newlines starts a new poem whose content runs up
    to the next such heading (or the end of the text). Text that precedes the
    first heading is not included in any record. When no heading is found the
    whole cleaned text becomes a single record titled ``default_title``.

    Args:
        html: Raw inner HTML of the poem container.
        poem_src: Source string copied into every record.
        poem_url: URL copied into every record.
        default_title: Title used when the text contains no heading.

    Returns:
        A list of dicts with keys ``title``, ``content``, ``source``, ``url``.
    """
    text = clean_poem_html(html)
    heading_re = re.compile(r"<b>(.*?)</b>\s*\n{2,}", flags=re.IGNORECASE)
    headings = list(heading_re.finditer(text))

    if not headings:
        return [{
            "title": default_title,
            "content": text,
            "source": poem_src,
            "url": poem_url,
        }]

    # A poem body ends where the next heading starts; the last one ends at EOF.
    body_ends = [heading.start() for heading in headings[1:]] + [len(text)]
    return [
        {
            "title": heading.group(1).strip(),
            "content": text[heading.end():body_end].strip("\n"),
            "source": poem_src,
            "url": poem_url,
        }
        for heading, body_end in zip(headings, body_ends)
    ]

In [8]:
def scrape_poem(driver, poem_url):
    """Load one poem page and return its parsed poem records.

    The page is opened, a random 3-5 second pause is taken, and the inner HTML
    of the ``div.poem-content`` element is read once it is visible. The text of
    the first ``div`` whose class is exactly ``small`` is used as the source;
    if it cannot be obtained the source is an empty string.

    Args:
        driver: Selenium WebDriver used to load the page.
        poem_url: Address of the poem page.

    Returns:
        Whatever ``process_poem_content`` returns for the page's HTML.

    Raises:
        Propagates any error raised while loading the page or waiting for the
        poem container (only the source lookup is guarded).
    """
    driver.get(poem_url)
    time.sleep(random.uniform(3, 5))

    body_locator = (By.CSS_SELECTOR, "div.poem-content")
    body_element = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located(body_locator)
    )
    raw_html = body_element.get_attribute("innerHTML")

    source_locator = (By.XPATH, '//div[@class="small"]')
    try:
        source_text = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(source_locator)
        ).text
    except Exception:
        # The source line is optional; fall back to an empty string.
        source_text = ""

    return process_poem_content(raw_html, source_text, poem_url)


In [12]:
def scrape_poems(driver, num_pages=10):
    """Scrape every poem linked from the first ``num_pages`` listing pages.

    For each page index from 1 to ``num_pages`` the poem links are collected
    with ``extract_poem_links`` and each linked page is parsed with
    ``scrape_poem``. A failure on one poem is printed and skipped so the rest
    of the run continues.

    Records that come back with an empty title take the title shown in the
    listing entry. Previously the listing title was collected and then
    discarded, so pages holding a single untitled poem were stored with
    ``title == ""``.

    Args:
        driver: Selenium WebDriver shared by all page loads.
        num_pages: Number of listing pages to walk, starting at page 1.

    Returns:
        A flat list of poem record dicts from all successfully scraped pages.
    """
    datasets = []
    for page_idx in tqdm(range(1, num_pages + 1)):
        poem_links = extract_poem_links(driver, page_idx)
        for poem in poem_links:
            poem_url = poem["url"]
            listing_title = poem.get("title", "")
            try:
                poems = scrape_poem(driver, poem_url)
                for record in poems:
                    # Fall back to the listing title when the page gave none.
                    if not record.get("title"):
                        record["title"] = listing_title
                datasets.extend(poems)
            except Exception as e:
                print(f"Error processing {poem_url}: {e}")
                continue
    return datasets

In [14]:
# Start Chrome, scrape the first 10 listing pages, and always close the
# browser, even if scraping raises part-way through.
driver = webdriver.Chrome()
try:
    datasets = scrape_poems(driver, num_pages=10)
finally:
    driver.quit()

100%|██████████| 10/10 [13:21<00:00, 80.18s/it]


In [15]:
# Tabulate the scraped records and save them as CSV; the row index is written
# as the first column.
df = pd.DataFrame(data=datasets)
df.to_csv(path_or_buf="poem_dataset.csv", index=True)