In [None]:
import re
import xml.etree.ElementTree as ET

import requests
from bs4 import BeautifulSoup

def get_wikipedia_page(date_month, date_day):
    """Download the English Wikipedia page for a calendar date.

    Args:
        date_month: Month name as used in the article title, e.g. "November".
        date_day: Day of the month, e.g. "15".

    Returns:
        The page HTML as text, or None if the request failed (connection
        error, timeout, or a non-success HTTP status).
    """
    url = f"https://en.wikipedia.org/wiki/{date_month}_{date_day}"

    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            # requests never times out by default; bound the wait so a
            # stalled connection cannot hang the script forever.
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no page"; programming
        # errors and KeyboardInterrupt are allowed to propagate.
        return None
    return response.text

def parse_births(html_content):
    """Collect the lists that make up the "Births" section of a date page.

    The section may be split into several sub-sections, each with its own
    <ul>, so every list between the "Births" heading and the next heading
    that closes the section is gathered, not just the first one.

    Args:
        html_content: HTML text of the page.

    Returns:
        A <div> tag wrapping all top-level <ul> elements of the section
        (so ``find_all('li')`` on it yields every entry), or None if no
        "Births" heading or no list under it was found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    for header in soup.find_all(['h2', 'h3']):
        if 'Births' not in header.get_text():
            continue

        # An <h2> section runs until the next <h2>; an <h3> section is also
        # closed by the next <h3>.
        stop_tags = ('h2',) if header.name == 'h2' else ('h2', 'h3')

        section_lists = []
        for node in header.find_all_next(['h2', 'h3', 'ul']):
            if node.name in stop_tags:
                break
            if node.name != 'ul':
                # Sub-heading inside the section: keep scanning.
                continue
            # Skip lists nested inside an already collected list, otherwise
            # their items would be reported twice.
            nested = any(
                parent is collected
                for parent in node.parents
                for collected in section_lists
            )
            if not nested:
                section_lists.append(node)

        if not section_lists:
            return None

        # Re-parent the lists under one wrapper only after scanning is done,
        # since appending moves them out of their place in the document.
        container = soup.new_tag('div')
        for ul in section_lists:
            container.append(ul)
        return container

    return None

def extract_person_info(li_text):
    """Split one births list item into year, name and occupation.

    The expected shape is ``"<year> – <name>, <occupation>"`` where the
    separator is an en dash surrounded by spaces.

    Args:
        li_text: Text of a single list item.

    Returns:
        A tuple ``(year, formatted_name, occupation)`` where the name has
        its spaces replaced by underscores and the occupation is "" when
        the item has no comma after the name. Returns None when the item
        does not contain the " – " separator.
    """
    parts = li_text.split(' – ', 1)
    if len(parts) != 2:
        return None

    year = parts[0].strip()
    rest = parts[1].strip()

    # Name is everything before the first comma; the remainder (possibly
    # empty) is the occupation.
    name, _, occupation = rest.partition(',')

    # Drop citation markers such as [1] from the name...
    name = name.split('[')[0].strip()
    # ...and from the occupation, where they usually trail the text.
    occupation = re.sub(r'\s*\[[^\]]*\]', '', occupation).strip()

    # Underscores instead of spaces in the stored name.
    formatted_name = name.replace(' ', '_')

    return year, formatted_name, occupation

def create_xml_output(people_data, day="11-14"):
    """Serialize the people list to the ``<persons>`` XML document.

    Args:
        people_data: Iterable of ``(year, name, occupation)`` string tuples.
        day: Value of the root ``day`` attribute, formatted "MM-DD". The
            default keeps the value that used to be hard-coded.

    Returns:
        The XML document as a string, starting with the XML declaration.
        Each person becomes a ``<psn>`` element with ``y`` (year) and ``h``
        (name) attributes, plus ``p`` (occupation) when it is non-empty.
    """
    root = ET.Element("persons")
    root.set("day", day)
    # Attribute name and value are reproduced verbatim from the original.
    # NOTE(review): the attribute name may contain a non-Latin first letter;
    # confirm against whatever consumes this file before "fixing" it.
    root.set("сomm", "создан 01.12 К.А.")

    for year, name, occupation in people_data:
        psn = ET.SubElement(root, "psn")
        psn.set("y", year)
        psn.set("h", name)
        if occupation:
            psn.set("p", occupation)

    # ET.tostring(encoding='unicode') emits no declaration, so prepend one.
    xml_str = '<?xml version=\'1.0\' encoding=\'utf-8\' standalone=\'yes\'?>\n'
    xml_str += ET.tostring(root, encoding='unicode')

    return xml_str


# English month names in calendar order; position + 1 is the month number.
_MONTH_NAMES = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)


def main(month="November", day="15"):
    """Scrape the births for one date and write them to ``qppMMDD.xml``.

    Args:
        month: English month name as used in the Wikipedia article title.
        day: Day of the month as a string of digits.

    Raises:
        ValueError: If ``month`` is not an English month name or ``day`` is
            not an integer string.
    """
    # Derive the numeric date once so the XML "day" attribute and the output
    # file name always agree with the page that is actually scraped.
    month_number = _MONTH_NAMES.index(month) + 1
    day_number = int(day)
    day_attr = f"{month_number:02d}-{day_number:02d}"
    output_path = f"qpp{month_number:02d}{day_number:02d}.xml"

    print(f"Getting Wikipedia page for {month} {day}...")

    # Get Wikipedia page
    html = get_wikipedia_page(month, day)
    if not html:
        print("Failed to get Wikipedia page")
        return

    # Parse births section
    births_list = parse_births(html)
    if not births_list:
        print("No births section found")
        return

    # Extract people data, skipping empty items and items that do not match
    # the "<year> – <name>" shape.
    people_data = []
    for li in births_list.find_all('li'):
        li_text = li.get_text().strip()
        if not li_text:
            continue

        result = extract_person_info(li_text)
        if result:
            people_data.append(result)

    print(f"Found {len(people_data)} people")

    # Create XML
    xml_output = create_xml_output(people_data, day=day_attr)

    # Save to file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(xml_output)

    print(output_path)

    # Show sample (limited to the five entries the heading promises)
    print("\nSample of first 5 entries:")
    for i, (year, name, occupation) in enumerate(people_data[:5], start=1):
        occupation_display = f" | {occupation}" if occupation else ""
        print(f"{i}. {year} - {name.replace('_', ' ')}{occupation_display}")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Getting Wikipedia page for November 15...
Found 6 people
qpp1115.xml

Sample of first 5 entries:
1. 459 - Bʼutz Aj Sak Chiik | Mayan king (died 501)
2. 1316 - John I | king of France and Navarre (died 1316)
3. 1397 - Nicholas V | pope of the Catholic Church (died 1455)
4. 1498 - Eleanor of Austria | queen of Portugal and France (died 1558)
5. 1511 - Johannes Secundus | Dutch poet and author (died 1536)
