In [1]:
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import re
import time
from urllib.parse import unquote, urljoin, urlparse
import html
import sys
import io

# Make sure the standard streams emit UTF-8 so non-ASCII names can be printed.
def _as_utf8(stream):
    """Return *stream* rewrapped as UTF-8 text, or unchanged if not applicable.

    A stream is rewrapped only when it exposes a binary ``buffer`` and its
    reported encoding is not already 'utf-8'.
    """
    if hasattr(stream, 'buffer') and stream.encoding != 'utf-8':
        return io.TextIOWrapper(stream.buffer, encoding='utf-8')
    return stream


try:
    sys.stdout = _as_utf8(sys.stdout)
    sys.stderr = _as_utf8(sys.stderr)
except (AttributeError, ValueError):
    # Best effort only: keep whatever streams are installed if rewrapping fails.
    pass

# Configuration
# Site root; relative links scraped from pages are resolved against it.
BASE_URL = "https://ru.wikipedia.org"
# Category listing that is crawled page by page.
CATEGORY_URL = f"{BASE_URL}/wiki/Категория:Родившиеся_18_ноября"
# Input (previously saved entries) and output (new/changed entries) XML files.
EXISTING_XML, OUTPUT_XML = "pyy1118.xml", "qyy1118.xml"

# Russian occupation names (nominative singular, some feminine forms).
# extract_occupation() lowercases page text and tests each entry as a plain
# substring, so list order determines the order of the reported occupations
# and a short entry also matches inside longer words that contain it.
OCCUPATIONS = [
    # Writers and journalists
    "писатель", "поэт", "прозаик", "драматург", "журналист", "публицист", "эссеист", "критик", "литературный критик",
    "библиограф", "библиофил", "лексикограф", "литературовед", "историк литературы",
    # Artists and designers
    "художник", "скульптор", "архитектор", "дизайнер", "иллюстратор", "график", "живописец", "портретист",
    "карикатурист", "гравёр", "художник-гравёр", "художник-график", "художник-постановщик", "художник-мультипликатор",
    "деятель изобразительного искусства", "керамист", "керамистка", "гобеленистка", "рисовальщик", "рисовальщица",
    # Film and theater
    "актёр", "актриса", "режиссёр", "кинорежиссёр", "сценарист", "оператор", "кинематографист", "кинооператор",
    "театральный деятель", "танцор", "хореограф", "балетмейстер", "артистка балета", "театральный актёр",
    "театральный педагог", "театральный критик", "театровед", "кинокритик", "киновед", "искусствовед",
    # Music
    "музыкант", "композитор", "певец", "дирижёр", "пианист", "пианистка", "басистка",
    # Scientists and educators
    "учёный", "исследователь", "профессор", "преподаватель", "учитель", "лектор", "преподаватель университета",
    "физик", "химик", "математик", "биолог", "ботаник", "геолог", "историк", "географ", "археолог", "лингвист",
    "антрополог", "этнограф", "палеонтолог", "палеоботаник", "вирусолог", "микробиолог", "биохимик", "физиолог",
    "анатом", "энтомолог", "зоолог", "орнитолог", "ихтиолог", "лепидоптеролог", "лихенолог", "селекционер",
    "филолог", "классицист", "классицистка", "романист", "нумизматка", "византолог",
    # Medical
    "врач", "медик", "хирург", "психиатр", "невролог", "патолог", "фармаколог", "военный врач",
    # Engineering and technology
    "инженер", "конструктор", "изобретатель", "авиационный инженер", "горный инженер", "военный инженер",
    "инженер-строитель", "инженер-механик", "инженер-гидротехник", "инженер-архитектор",
    # Politics and law
    "политик", "государственный деятель", "дипломат", "юрист", "адвокат", "практикующий юрист", "судья",
    "прокурор", "депутат", "посол", "министр", "президент", "премьер-министр",
    # Military
    "военный деятель", "военнослужащий", "офицер", "полководец", "лётчик", "солдат", "капитан", "генерал",
    "адмирал", "командир", "разведчик", "мемуарист",
    # Sports
    "спортсмен", "футболист", "тренер", "тренер-преподаватель", "бейсболист", "хоккеист", "биатлонист",
    "бобслеист", "скелетонист", "автогонщик", "раллийный автогонщик",
    # Business
    "предприниматель", "бизнесмен", "торговец", "банкир", "менеджер", "промышленник", "магнат",
    # Religious
    "священник", "религиозный деятель", "монах", "раввин", "настоятель", "миссионер", "теолог", "богослов",
    # Philosophy and humanities
    "философ", "философ науки", "историк науки", "культуролог", "социолог", "экономист", "политолог",
    # Other professions
    "библиотекарь", "переводчик", "редактор", "издатель", "фотограф", "кинопродюсер", "телепродюсер",
    "телеведущий", "радиоведущий", "подкастер", "обозреватель", "корреспондент", "репортёр",
    "модель", "фотомодель", "порноактриса", "эротическая модель", "участница конкурса красоты",
    "блогер", "видеоблогер", "стример", "тиктокер", "ютубер",
    "модельер", "кутюрье", "переплётчица",
    # General
    "долгожитель", "аристократ", "аристократка", "фрейлина", "аэронавт", "капитан судна",
    "капитан речного флота", "шеф-повар", "агроном", "доярка", "свинарка", "шахтёр", "рабочий", "крестьянин",
    "партизан", "революционерка", "активист", "активистка", "гуманистка", "правозащитница",
    "коллекционер", "коллекционерка", "коллекционер искусства"
]

# Category label -> occupation words looked for in a page's category links.
# extract_occupation() lowercases the text of the page's "catlinks" block and,
# for each key in insertion order, records the first listed word that occurs
# in it as a substring. The key itself is not written to the output.
# Some words here ("ректор", "убийца") do not appear in OCCUPATIONS.
OCCUPATION_CATEGORIES = {
    "Писатель": ["писатель", "поэт", "прозаик", "драматург", "журналист", "публицист", "переводчик", "редактор", "издатель"],
    "Художник": ["художник", "скульптор", "архитектор", "дизайнер", "иллюстратор", "фотограф"],
    "Кинематографист": ["актёр", "актриса", "режиссёр", "кинорежиссёр", "сценарист", "оператор", "кинематографист"],
    "Театральный деятель": ["театральный деятель", "актёр", "актриса", "режиссёр", "танцор", "хореограф"],
    "Учёный": ["учёный", "исследователь", "профессор", "преподаватель", "учитель", "физик", "химик", "математик", "биолог", "ботаник", "геолог", "историк", "географ", "археолог", "лингвист"],
    "Врач": ["врач", "медик", "хирург", "психиатр", "невролог"],
    "Военный деятель": ["военный деятель", "военнослужащий", "офицер", "полководец", "лётчик"],
    "Государственный деятель": ["государственный деятель", "политик", "дипломат", "юрист", "адвокат"],
    "Политик": ["политик", "государственный деятель"],
    "Предприниматель": ["предприниматель", "бизнесмен", "торговец"],
    "Религиозный деятель": ["религиозный деятель", "священник", "монах"],
    "Философ": ["философ"],
    "Персона": ["долгожитель", "аристократ", "аристократка", "фрейлина"],
    "Порноактриса": ["порноактриса"],
    "Фотомодель": ["фотомодель", "модель"],
    "Модельер": ["модельер"],
    "Блогер": ["блогер", "видеоблогер", "стример", "тиктокер"],
    "Ректор": ["ректор"],
    "Священник": ["священник"],
    "Убийца": ["убийца"],
    "Футболист": ["футболист"],
    "Биатлонист": ["биатлонист"],
    "Карточка фотографа": ["фотограф"]
}

# Progress marker: gives the notebook cell visible output once it has run.
print("Configuration loaded")


Configuration loaded


In [2]:
def normalize_name(name):
    """Return *name* in the canonical form used as a lookup key.

    Percent-encoding is decoded and spaces become underscores, which is the
    form stored in the XML ``h`` attribute. Falsy input yields ''.
    """
    return unquote(name).replace(' ', '_') if name else ''

def load_existing_entries(xml_file):
    """Load previously saved ``<psn>`` entries from *xml_file*.

    Args:
        xml_file: Path of the XML file whose root element has ``psn`` children
            carrying ``h`` (name), ``y`` (year) and ``p`` (occupation).

    Returns:
        A ``(existing_data, existing_names)`` tuple.
        ``existing_data`` maps the normalized name to
        ``{'year': str or None, 'occupation': str}``; a year of ``'yyyy'`` or
        '' is stored as ``None`` and a ``"Category;occupation"`` value keeps
        only the part after the first semicolon.
        ``existing_names`` is a set holding each name raw, normalized and in
        the percent-encoded form of both.
        Both are empty when the file is missing or cannot be parsed; the
        problem is reported with ``print`` instead of being raised.
    """
    # Local import: the file's top-level imports provide ``unquote`` only.
    from urllib.parse import quote

    existing_data = {}
    existing_names = set()

    # Keep the guarded region down to the call that can actually fail.
    try:
        root = ET.parse(xml_file).getroot()
    except FileNotFoundError:
        print(f"File {xml_file} not found, starting fresh")
        return existing_data, existing_names
    except Exception as e:
        print(f"Error loading {xml_file}: {e}")
        return existing_data, existing_names

    for psn in root.findall('psn'):
        name = psn.get('h', '')
        if not name:
            continue

        normalized = normalize_name(name)
        year = psn.get('y', '')

        # "Category;occupation" -> "occupation"; a value without ';' is kept whole.
        occupation_attr = psn.get('p', '')
        _, separator, after_category = occupation_attr.partition(';')
        occupation = after_category if separator else occupation_attr

        existing_data[normalized] = {
            'year': year if year and year != 'yyyy' else None,
            'occupation': occupation,
        }

        # Register every spelling a caller might look the person up by.
        existing_names.update((
            normalized,
            name,
            quote(normalized, safe=''),
            quote(name, safe=''),
        ))

    print(f"Loaded {len(existing_data)} existing entries from {xml_file}")
    print(f"Sample entries: {list(existing_data)[:3] if existing_data else 'None'}")
    return existing_data, existing_names

# Load the previously saved XML once when the cell runs; main() consults
# existing_data to classify each scraped person as new, changed or identical.
existing_data, existing_names = load_existing_entries(EXISTING_XML)


Loaded 1234 existing entries from pyy1118.xml
Sample entries: ['Ааней,_Андрея', 'Абашидзе,_Аслан_Хусейнович', 'Аббуд,_Карима']


In [3]:
def get_page_content(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Any failure (network error, HTTP error status, parse error) is printed
    and reported to the caller as ``None``.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        reply = requests.get(url, headers=request_headers, timeout=10)
        reply.raise_for_status()
        document = BeautifulSoup(reply.content, 'html.parser')
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None
    return document

# Genitive Russian month names as they occur in dates ("18 ноября 1900").
_MONTH_ALTERNATION = (
    r'(?:января|февраля|марта|апреля|мая|июня|июля|августа|'
    r'сентября|октября|ноября|декабря)\b'
)
# Day number + month; a closing bracket after the day is tolerated so that
# double-dated forms such as "6 (18) ноября" are recognised as well.
_DAY_MONTH = r'(?<!\d)\d{1,2}[)\]]?\s+' + _MONTH_ALTERNATION
_MONTH_RE = re.compile(r'\b' + _MONTH_ALTERNATION, re.IGNORECASE)
# Four-digit years 1000-2029, same range the original scan accepted.
_PLAUSIBLE_YEAR_RE = re.compile(r'\b(1[0-9]{3}|20[0-2][0-9])\b')
# Tried in order against each lead paragraph; group 1 is the year.
_BIRTH_YEAR_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'(?:родился|родилась)\s+(?:' + _DAY_MONTH + r'\s+)?(\d{4})\s*(?:года|г\.|г)',
        r'(?:родился|родилась)\s+в\s+(\d{4})\s*(?:году|г\.|г)',
        r'(\d{4})\s*(?:года|г\.|г)\s*—\s*(?:родился|родилась)',
        _DAY_MONTH + r'\s+(\d{4})',
    )
)


def extract_year_of_birth(soup):
    """Extract the year of birth from a parsed Wikipedia person page.

    Lookup order:
      1. The infobox row whose header mentions "Дата рождения", "Родился" or
         "Родилась": the first plausible four-digit year in its cell.
      2. The first three top-level paragraphs of the article body: explicit
         "родился/родилась ..." phrases, then the first "<day> <month> <year>"
         date, then, if the paragraph names a month at all, its first
         plausible year.

    The date patterns accept any day and month instead of one hard-coded
    date, so the function works for every "born on ..." category.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find``/``find_all``/
            ``get_text`` API.

    Returns:
        The year as a string, or ``None`` when no year was found.
    """
    infobox = soup.find('table', class_='infobox')
    if infobox:
        for row in infobox.find_all('tr'):
            header = row.find('th')
            if not header:
                continue
            label = header.get_text().strip()
            if not any(key in label for key in ('Дата рождения', 'Родился', 'Родилась')):
                continue
            cell = row.find('td')
            if not cell:
                continue
            years = _PLAUSIBLE_YEAR_RE.findall(cell.get_text())
            if years:
                return years[0]

    body = soup.find('div', class_='mw-parser-output')
    if body:
        for paragraph in body.find_all('p', recursive=False)[:3]:
            text = paragraph.get_text()
            for pattern in _BIRTH_YEAR_PATTERNS:
                match = pattern.search(text)
                if match and 1000 <= int(match.group(1)) <= 2099:
                    return match.group(1)

            # Last resort: a paragraph that names a month most likely holds
            # the life dates, and the birth year normally comes first.
            if _MONTH_RE.search(text):
                years = _PLAUSIBLE_YEAR_RE.findall(text)
                if years:
                    return years[0]

    return None

def extract_occupation(soup):
    """Collect occupation words for a person from a parsed Wikipedia page.

    Sources, in order: the infobox rows labelled "Род деятельности",
    "Профессия" or "Род занятий"; the page's category links (first matching
    word per OCCUPATION_CATEGORIES entry); and, only when nothing was found
    so far, the first three top-level paragraphs. Matching is a lowercase
    substring test.

    Returns:
        Up to five occupations joined with ", ", or ``None`` if none matched.
    """
    found = []

    def scan(raw_text):
        # Append, in OCCUPATIONS order, every not-yet-seen word present in the text.
        lowered = raw_text.lower()
        for candidate in OCCUPATIONS:
            if candidate not in found and candidate.lower() in lowered:
                found.append(candidate)

    # 1) Infobox rows (most reliable source).
    header_labels = ('Род деятельности', 'Профессия', 'Род занятий')
    box = soup.find('table', class_='infobox')
    for row in (box.find_all('tr') if box else []):
        header = row.find('th')
        if not header:
            continue
        label = header.get_text().strip()
        if not any(key in label for key in header_labels):
            continue
        cell = row.find('td')
        if cell:
            scan(cell.get_text())

    # 2) Category links at the bottom of the page: first hit per category entry.
    catlinks = soup.find('div', id='catlinks')
    if catlinks:
        cat_text = catlinks.get_text().lower()
        for members in OCCUPATION_CATEGORIES.values():
            hit = next((word for word in members if word.lower() in cat_text), None)
            if hit is not None and hit not in found:
                found.append(hit)

    # 3) Lead paragraphs, used only as a fallback.
    if not found:
        body = soup.find('div', class_='mw-parser-output')
        if body:
            for paragraph in body.find_all('p', recursive=False)[:3]:
                scan(paragraph.get_text())

    # "персона" is a placeholder, never a real occupation.
    kept = [word for word in found if word != 'персона']
    return ', '.join(kept[:5]) if kept else None

# Progress marker: gives the notebook cell visible output once it has run.
print("Helper functions defined")


Helper functions defined


In [4]:
# Title prefixes of non-article namespaces; links to such pages are not persons.
_SKIPPED_TITLE_PREFIXES = (
    'Категория:', 'Файл:', 'Шаблон:', 'Обсуждение:', 'Участник:', 'Портал:',
    'Википедия:', 'Служебная:', 'Медиа:', 'Справка:', 'Обсуждение_шаблона:',
    'Обсуждение_участника:', 'Обсуждение_файла:', 'Обсуждение_категории:',
)


def _page_title_from_url(url):
    """Return the decoded, underscore-separated page title addressed by *url* ('' if none)."""
    # Local import: the file's top-level imports do not include parse_qs.
    from urllib.parse import parse_qs

    parts = urlparse(url)
    path = unquote(parts.path)
    if '/wiki/' in path:
        title = path.split('/wiki/', 1)[1]
    else:
        # index.php form: .../w/index.php?title=<title>&pagefrom=...
        title = parse_qs(parts.query).get('title', [''])[0]
    return title.replace(' ', '_')


def _collect_person_links(soup):
    """Return unique ``(encoded_title, absolute_url)`` pairs from the category listing."""
    listing = (
        soup.find('div', id='mw-pages')
        or soup.find('div', class_='mw-category')
        or soup.find('div', class_='mw-category-group')
    )
    if not listing:
        return []

    links = {}  # title -> url; insertion order keeps the page order
    for link in listing.find_all('a', href=True):
        href = link['href']
        if not href.startswith('/wiki/'):
            continue
        title_part = href.split('/wiki/')[1].split('#')[0]
        # Check both the raw and the decoded title: hrefs are usually percent-encoded.
        if title_part.startswith(_SKIPPED_TITLE_PREFIXES) or \
                unquote(title_part).startswith(_SKIPPED_TITLE_PREFIXES):
            continue
        # Keep the URL-encoded title; the first occurrence of a title wins.
        links.setdefault(title_part, urljoin(BASE_URL, href))
    return list(links.items())


def _find_next_page_link(soup, url):
    """Return the absolute URL of the next listing page of *url*, or ``None``."""
    # Method 1: link whose text is exactly "Следующая страница".
    for link in soup.find_all('a', href=True):
        if link.get_text().strip() == 'Следующая страница':
            return urljoin(BASE_URL, link['href'])

    # Method 2: any "следующая ..." link inside the usual navigation containers.
    nav_selectors = (
        ('div', {'class': 'mw-category-navigation'}),
        ('div', {'id': 'mw-pages'}),
        ('div', {'class': 'mw-category'}),
    )
    for tag, attrs in nav_selectors:
        for nav_div in soup.find_all(tag, attrs):
            for link in nav_div.find_all('a', href=True):
                if 'следующая' not in link.get_text().lower():
                    continue
                href = link['href']
                # Pagination links use the /w/index.php?... form, not /wiki/...
                if href.startswith(('/wiki/', '/w/')):
                    return urljoin(BASE_URL, href)

    # Method 3: a "pagefrom" link that belongs to the category being parsed.
    # The title is taken from *url* rather than hard-coded, and hrefs are
    # decoded before comparing because they are percent-encoded in the HTML.
    category_title = _page_title_from_url(url)
    if category_title:
        for link in soup.find_all('a', href=True):
            decoded_href = unquote(link['href']).replace(' ', '_')
            if 'pagefrom=' in decoded_href and category_title in decoded_href:
                return urljoin(BASE_URL, link['href'])

    # Method 4: a link carrying a "next"-like CSS class.
    for link in soup.find_all('a', class_=re.compile('next', re.I)):
        if link.get('href'):
            return urljoin(BASE_URL, link['href'])

    return None


def parse_category_page(url):
    """Parse one category listing page.

    Args:
        url: Address of the category page (either the ``/wiki/<title>`` or
            the ``/w/index.php?title=<title>&pagefrom=...`` form).

    Returns:
        ``(person_links, next_link)``: ``person_links`` is a list of unique
        ``(url_encoded_title, absolute_url)`` tuples in page order, with
        non-article namespaces skipped; ``next_link`` is the absolute URL of
        the next listing page or ``None``. ``([], None)`` is returned when
        the page could not be fetched.
    """
    soup = get_page_content(url)
    if not soup:
        return [], None
    return _collect_person_links(soup), _find_next_page_link(soup, url)

# Progress marker: gives the notebook cell visible output once it has run.
print("Category parser function defined")


Category parser function defined


In [5]:
def parse_person_page(url, name):
    """Fetch one person's article and summarise it as a record.

    Returns:
        ``None`` if the page could not be fetched, otherwise a dict with
        ``year`` (string or ``None``), ``name`` (normalized: decoded, with
        underscores) and ``occupation`` (comma-separated string, '' if none).
    """
    page = get_page_content(url)
    if not page:
        return None

    birth_year = extract_year_of_birth(page)
    found_occupation = extract_occupation(page)

    return dict(
        year=birth_year,
        name=normalize_name(name),  # same format as the XML 'h' attribute
        occupation=found_occupation or '',  # occupation only, no category prefix
    )

# Progress marker: gives the notebook cell visible output once it has run.
print("Person parser function defined")


Person parser function defined


In [6]:
# Genitive Russian month name -> two-digit month number.
_MONTH_NUMBERS = {
    'января': '01', 'февраля': '02', 'марта': '03', 'апреля': '04',
    'мая': '05', 'июня': '06', 'июля': '07', 'августа': '08',
    'сентября': '09', 'октября': '10', 'ноября': '11', 'декабря': '12',
}


def _day_attr_from_category(category_url):
    """Return "DD-MM" for a "...Родившиеся_<day>_<month>" URL, or ``None`` if not recognised."""
    decoded = unquote(category_url).replace(' ', '_')
    match = re.search(r'Родившиеся_(\d{1,2})_([а-яё]+)', decoded, re.IGNORECASE)
    if not match:
        return None
    month = _MONTH_NUMBERS.get(match.group(2).lower())
    if month is None:
        return None
    # NOTE(review): zero-padding of single-digit days is assumed - confirm
    # against the consumer of the 'day' attribute.
    return f"{int(match.group(1)):02d}-{month}"


def write_xml(persons, output_file, day=None):
    """Write *persons* to *output_file* as a ``<persons>`` XML document.

    Args:
        persons: Iterable of dicts with ``name`` and optional ``year`` and
            ``occupation`` keys.
        output_file: Destination path; the file is overwritten.
        day: Value for the root ``day`` attribute ("DD-MM"). When omitted it
            is derived from ``CATEGORY_URL`` so the attribute matches the
            category actually parsed; the legacy literal ``'14-11'`` is used
            only if the URL cannot be interpreted.

    Each person becomes a ``<psn>`` element with ``y`` (year, or the
    placeholder ``'yyyy'``), ``h`` (name) and, when non-empty, ``p``
    (occupation). Elements are ordered by year string, then name; persons
    without a year sort as ``'9999'``.
    """
    if day is None:
        day = _day_attr_from_category(CATEGORY_URL) or '14-11'

    root = ET.Element('persons')
    root.set('day', day)
    root.set('comm', f'Parsed {len(persons)} persons')

    def sort_key(person):
        year = person.get('year')
        # Unknown years go last; the key is kept as text, as before.
        return (str(year) if year is not None else '9999', person.get('name', ''))

    for person in sorted(persons, key=sort_key):
        psn = ET.SubElement(root, 'psn')
        # 'yyyy' is the placeholder recognised when the file is loaded again.
        psn.set('y', str(person['year']) if person.get('year') else 'yyyy')
        psn.set('h', person['name'])
        if person.get('occupation'):
            psn.set('p', person['occupation'])

    tree = ET.ElementTree(root)
    ET.indent(tree, space='  ')
    # The declaration is written by hand to control its exact spelling.
    with open(output_file, 'wb') as f:
        f.write('<?xml version="1.0" encoding="utf-8"?>\n'.encode('utf-8'))
        tree.write(f, encoding='utf-8', xml_declaration=False)
    print(f"Written {len(persons)} persons to {output_file}")

# Progress marker: gives the notebook cell visible output once it has run.
print("XML writer function defined")


XML writer function defined


In [7]:
# Main parsing loop
def _merge_with_existing(person_data, existing):
    """Reconcile freshly scraped *person_data* with its stored entry, in place.

    A missing scraped year is replaced by the stored one. The stored
    occupation is kept unless it is empty, whitespace or '-', in which case a
    non-empty scraped occupation replaces it.

    Returns:
        Human-readable descriptions of what changed; empty if nothing did.
    """
    new_year = person_data.get('year')
    new_occupation = person_data.get('occupation', '')
    existing_year = existing.get('year')
    existing_occupation = existing.get('occupation', '')

    # Never overwrite a known year with "not found".
    if new_year is None:
        person_data['year'] = existing_year
        new_year = existing_year

    new_year_str = str(new_year) if new_year else None
    existing_year_str = str(existing_year) if existing_year else None

    changes = []
    if new_year_str != existing_year_str:
        changes.append(f"year: {existing_year_str} → {new_year_str}")

    existing_occ_empty = not existing_occupation or existing_occupation.strip() in ('', '-')
    if existing_occ_empty and new_occupation:
        person_data['occupation'] = new_occupation
        changes.append(f"occupation: '{existing_occupation}' → '{new_occupation}'")
    else:
        # An occupation already on file is never replaced.
        person_data['occupation'] = existing_occupation

    return changes


def main():
    """Crawl the category, compare each person with the stored data, write the XML.

    Walks every listing page starting at ``CATEGORY_URL``, scrapes each linked
    person page, and collects persons that are new or differ from the
    module-level ``existing_data``. The collected persons are written to
    ``OUTPUT_XML``; progress and a final summary are printed. The crawl stops
    when there is no next page or when the next-page link leads back to a
    page already visited.
    """
    all_persons = []
    current_url = CATEGORY_URL
    visited_urls = set()
    page_count = 0
    total_processed = 0
    total_skipped_identical = 0
    total_updated = 0
    total_new = 0
    total_failed = 0

    print(f"Starting to parse category: {CATEGORY_URL}")
    print(f"Existing entries loaded: {len(existing_data)}")

    while current_url:
        visited_urls.add(current_url)
        page_count += 1
        print(f"\n{'='*60}")
        print(f"--- Parsing page {page_count} ---")
        print(f"URL: {current_url}")
        print(f"{'='*60}")

        person_links, next_link = parse_category_page(current_url)
        print(f"Found {len(person_links)} person links on this page")

        if not person_links:
            print("No person links found, trying to continue...")

        for idx, (name, url) in enumerate(person_links, 1):
            name_normalized = normalize_name(name)
            name_display = unquote(name).replace('_', ' ')  # spaces for readability

            print(f"[{idx}/{len(person_links)}] Processing: {name_display}")
            person_data = parse_person_page(url, name)

            if not person_data:
                total_failed += 1
                print(f"  ✗ Failed to extract data")
                time.sleep(0.5)
                continue

            existing = existing_data.get(name_normalized)
            if existing is not None:
                changes = _merge_with_existing(person_data, existing)
                if changes:
                    all_persons.append(person_data)
                    total_updated += 1
                    print(f"  ↻ Updated: {', '.join(changes)}")
                else:
                    total_skipped_identical += 1
                    print(f"  ✓ Identical data, skipping")
            else:
                all_persons.append(person_data)
                total_new += 1
                # The keys always exist, so fall back on falsy values, not on absence.
                year = person_data.get('year') or 'N/A'
                occupation = person_data.get('occupation') or 'N/A'
                print(f"  + New: Year: {year}, Occupation: {occupation}")

            total_processed += 1

            # Be polite to Wikipedia servers
            time.sleep(0.5)

        if not next_link:
            print(f"\n→ No more pages found")
            break
        if next_link in visited_urls:
            # A next-page link that loops back would otherwise never terminate.
            print(f"\n→ Next page was already visited, stopping: {next_link}")
            break
        print(f"\n→ Moving to next page: {next_link}")
        current_url = next_link

    print(f"\n{'='*60}")
    print(f"=== Parsing complete ===")
    print(f"Total pages parsed: {page_count}")
    print(f"Total persons processed: {total_processed}")
    print(f"Total new persons: {total_new}")
    print(f"Total updated (different data): {total_updated}")
    print(f"Total skipped (identical data): {total_skipped_identical}")
    print(f"Total failed to extract: {total_failed}")
    print(f"{'='*60}")

    if all_persons:
        write_xml(all_persons, OUTPUT_XML)
        print(f"\n✓ Successfully wrote {len(all_persons)} persons to {OUTPUT_XML}")
    else:
        print("\n⚠ No persons to write (all were identical to existing data)")
# Run the parser
# The guard starts the crawl only when this code runs as the main module
# (as it does in a notebook cell or as a script), not when it is imported.
if __name__ == "__main__":
    main()


Starting to parse category: https://ru.wikipedia.org/wiki/Категория:Родившиеся_18_ноября
Existing entries loaded: 1234

--- Parsing page 1 ---
URL: https://ru.wikipedia.org/wiki/Категория:Родившиеся_18_ноября
Found 200 person links on this page
[1/200] Processing: Ааней, Андрея
  ✓ Identical data, skipping
[2/200] Processing: Абашидзе, Аслан Хусейнович
  ✓ Identical data, skipping
[3/200] Processing: Аббуд, Карима
  ✓ Identical data, skipping
[4/200] Processing: Абдешев, Хайрулла
  ↻ Updated: occupation: ' ' → 'командир, крестьянин'
[5/200] Processing: Абель, Джейк
  ✓ Identical data, skipping
[6/200] Processing: Абер, Лоис
  ↻ Updated: occupation: ' ' → 'биатлонист'
[7/200] Processing: Абзалов, Мелис Арипович
  ✓ Identical data, skipping
[8/200] Processing: Абова, Тамара Евгеньевна
  ✓ Identical data, skipping
[9/200] Processing: Абу Бакр-мирза (сын Мухаммеда-Джуки)
  ↻ Updated: occupation: ' ' → 'политик'
[10/200] Processing: Аванесова, Галина Алексеевна
  ✓ Identical data, skipping
