In [1]:
# Import libraries

import re
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

In [30]:
import requests

def search_zambian_pages(limit=10, timeout=10):
    """Search English Wikipedia for pages with "Zambia" in the title.

    Sends a ``list=search`` query to the MediaWiki Action API, prints the
    final request URL and the titles found, and returns the titles.

    Args:
        limit: Maximum number of search results to request (``srlimit``).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        list[str]: Page titles in the order the API returned them.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": "intitle:Zambia",
        "format": "json",
        "srlimit": limit
    }
    response = requests.get(url, params=params, timeout=timeout)
    print("Search API URL:", response.url)
    # Surface HTTP failures here instead of as a confusing JSON/KeyError below.
    response.raise_for_status()
    data = response.json()
    titles = [result["title"] for result in data["query"]["search"]]
    print("Found titles:", titles)
    return titles

# Run the search once; later cells index into `titles` (e.g. titles[0]).
titles = search_zambian_pages(limit=10)


Search API URL: https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=intitle%3AZambia&format=json&srlimit=10
Found titles: ['Zambia', 'Outline of Zambia', 'Economy of Zambia', 'President of Zambia', 'China–Zambia relations', 'Languages of Zambia', 'Foreign relations of Zambia', 'India–Zambia relations', 'Geography of Zambia', 'Zambia Railways']


In [31]:
def get_page_html(title, timeout=10):
    """Fetch the rendered HTML of a Wikipedia page via the ``parse`` API.

    Prints the final request URL and the length of the returned HTML.

    Args:
        title: Page title to render, e.g. ``"Zambia"``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        str: The page body HTML taken from ``parse.text["*"]``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the response has no ``parse`` object (for example the
            page does not exist); the message carries the API's own
            explanation when one is provided.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "text"
    }
    response = requests.get(url, params=params, timeout=timeout)
    print(f"Parse API URL for '{title}':", response.url)
    # Surface HTTP failures here instead of as a confusing JSON/KeyError below.
    response.raise_for_status()
    data = response.json()
    if "parse" not in data:
        # Keep raising KeyError (what the bare lookup used to raise), but say
        # why: the API describes the failure in an "error" object.
        info = data.get("error", {}).get("info", "unexpected API response")
        raise KeyError(f"Could not parse page '{title}': {info}")
    html = data["parse"]["text"]["*"]
    print(f"Length of HTML content for '{title}':", len(html))
    return html

page_title = titles[0]  # take first title from previous cell
# Download the rendered HTML once; `html` and `page_title` are reused by later cells.
html = get_page_html(page_title)


Parse API URL for 'Zambia': https://en.wikipedia.org/w/api.php?action=parse&page=Zambia&format=json&prop=text
Length of HTML content for 'Zambia': 533384


In [57]:
from bs4 import BeautifulSoup

def extract_sections(html):
    """Split rendered Wikipedia HTML into ``(section_title, section_text)`` pairs.

    Every ``<div>`` whose class starts with ``mw-heading`` is treated as a
    section heading. The section text is the text of the elements that follow
    that div at the same nesting level, up to (not including) the next such
    heading div of any level. A parent heading therefore only owns the text
    that appears before its first sub-heading.

    Args:
        html: HTML string expected to contain a ``div.mw-parser-output``.

    Returns:
        list[tuple[str, str]]: ``(title, text)`` for each heading that has
        non-empty text, in document order. Returns ``[]`` (after printing a
        notice) when the main content div is missing.
    """
    soup = BeautifulSoup(html, "html.parser")
    content_div = soup.find('div', class_='mw-parser-output')
    if not content_div:
        print("No main content div found!")
        return []

    sections = []

    # Find all divs with class starting with 'mw-heading'
    heading_divs = content_div.find_all('div', class_=lambda c: c and c.startswith('mw-heading'))

    for heading_div in heading_divs:
        # Accept every section heading level (h2-h6). A deeper heading still
        # terminates the previous section in the loop below, so skipping it
        # here would silently drop the text that follows it.
        header_tag = heading_div.find(['h2', 'h3', 'h4', 'h5', 'h6'])
        if not header_tag:
            continue

        section_title = header_tag.get_text(strip=True)

        content = []
        # Collect siblings until the next heading
        for sibling in heading_div.find_next_siblings():
            sibling_classes = sibling.get('class') or ()
            if sibling.name == 'div' and any(c.startswith('mw-heading') for c in sibling_classes):
                break

            # Inline <style>/<script> elements carry CSS/JS rather than prose;
            # skip them so they cannot leak into the section text.
            if sibling.name in ('style', 'script'):
                continue

            # Drop "[edit]" link wrappers before extracting text.
            for edit_span in sibling.find_all('span', class_='mw-editsection'):
                edit_span.decompose()

            text = sibling.get_text(separator=' ', strip=True)
            if text:
                content.append(text)

        full_text = " ".join(content).strip()
        if full_text:
            sections.append((section_title, full_text))

    return sections

# Extract sections from the HTML content
sections = extract_sections(html)
# Preview at most 50 sections, showing only the first 100 characters of each body.
print(f"\nSections in '{page_title}':")
# NOTE: the loop variables `title` and `content` stay defined at notebook scope afterwards.
for title, content in sections[:50]:
    print(f"- {title} (first 100 chars): {content[:100]}")



Sections in 'Zambia':
- Etymology (first 100 chars): Further information: Rhodesia (name) The territory of Zambia was known as Northern Rhodesia from 191
- History (first 100 chars): Main article: History of Zambia
- Prehistoric era (first 100 chars): Archaeological excavation work on the Zambezi Valley and Kalambo Falls shows a succession of human c
- Khoisan and Batwa (first 100 chars): Ancient (but graffitied) Rock Art in Nsalu Cave, Kasanka National Park in North-Central Zambia Moder
- The Bantu (Abantu) (first 100 chars): The Bantu people or Abantu (meaning people) are an enormous and diverse ethnolinguistic group that c
- Bantu origins (first 100 chars): Batonga fisherwomen in Southern Zambia. Women have played and continue to play pivotal roles in many
- First Bantu settlement (first 100 chars): The first Bantu people to arrive in Zambia came through the eastern route via the African Great Lake
- Second Bantu settlement (first 100 chars): The second mass settlement of Bantu peo

In [33]:
def wiki_page_url(title):
    """Build the English Wikipedia article URL for a page title.

    Spaces become underscores and the result is percent-encoded, so titles
    containing ``?``, ``#``, ``%`` or non-ASCII characters produce a valid
    URL instead of being misread as a query string or fragment. The URL is
    also printed.

    Args:
        title: Page title, e.g. ``"Economy of Zambia"``.

    Returns:
        str: The full ``https://en.wikipedia.org/wiki/...`` URL.
    """
    base_url = "https://en.wikipedia.org/wiki/"
    # Keep common title punctuation readable (namespaces, subpages,
    # parentheses); everything else outside the unreserved set is escaped.
    url = base_url + quote(title.replace(" ", "_"), safe="/:@!$*(),;~")
    print(f"Wikipedia page URL for '{title}': {url}")
    return url

# Human-facing article URL for the page selected earlier (`page_title`).
page_url = wiki_page_url(page_title)


Wikipedia page URL for 'Zambia': https://en.wikipedia.org/wiki/Zambia


In [58]:
import re

def compute_features(text):
    """Return ``(word_count, ref_count)`` for a block of section text.

    ``word_count`` is the number of whitespace-separated tokens.
    ``ref_count`` is the number of non-overlapping square-bracketed spans
    (such as ``[1]`` or ``[citation needed]``) that open and close without
    a line break in between.
    """
    bracket_pattern = r'\[citation needed\]|\[.*?\]'
    tokens = text.split()
    bracket_hits = sum(1 for _ in re.finditer(bracket_pattern, text))
    return len(tokens), bracket_hits

# Sanity-check the feature extraction on one section before running it on a whole page.
sample_section_text = sections[0][1]  # first section's text
# `word_count` and `ref_count` are reused by the classification cell that follows.
word_count, ref_count = compute_features(sample_section_text)
print(f"\n Features of first section:")
print(f"Words: {word_count}, References: {ref_count}")



 Features of first section:
Words: 47, References: 1


In [59]:
def classify_effort(word_count, ref_count):
    """Label a section's editing effort as ``"Low"``, ``"Medium"`` or ``"High"``.

    Rules, checked in order:
      * ``"Low"``    - more than 200 words and more than 2 references.
      * ``"Medium"`` - 101 to 200 words, or 1 to 2 references.
      * ``"High"``   - anything else.
    """
    long_and_cited = word_count > 200 and ref_count > 2
    if long_and_cited:
        return "Low"

    mid_length = 100 < word_count <= 200
    lightly_cited = 1 <= ref_count <= 2
    return "Medium" if (mid_length or lightly_cited) else "High"

# Classify the sample section using the counts computed in the previous cell.
effort = classify_effort(word_count, ref_count)
print(f"🧠 Editing Effort: {effort}")


🧠 Editing Effort: Medium


In [60]:
def analyze_page(title):
    """Print an editing-effort report for every section of one Wikipedia page.

    Fetches the page HTML, splits it into sections, and for each section
    prints its effort label, title, word count and reference count.
    Nothing is returned.
    """
    print(f"\n🔍 Analyzing: {title}")
    page_sections = extract_sections(get_page_html(title))
    for heading, body in page_sections:
        n_words, n_refs = compute_features(body)
        label = classify_effort(n_words, n_refs)
        print(f" - [{label}] {heading} ({n_words} words, {n_refs} refs)")

# Try with one title: the first search result from the `titles` cell.
analyze_page(titles[0])



🔍 Analyzing: Zambia
 - [Medium] Etymology (47 words, 1 refs)
 - [High] History (5 words, 0 refs)
 - [Medium] Prehistoric era (79 words, 1 refs)
 - [Medium] Khoisan and Batwa (147 words, 3 refs)
 - [Medium] The Bantu (Abantu) (111 words, 1 refs)
 - [Medium] Bantu origins (121 words, 3 refs)
 - [Low] First Bantu settlement (466 words, 6 refs)
 - [Medium] Second Bantu settlement (93 words, 1 refs)
 - [Low] Europeans (431 words, 7 refs)
 - [Medium] British colonisation (132 words, 1 refs)
 - [Low] Independence (243 words, 4 refs)
 - [Low] Post Independence (403 words, 5 refs)
 - [Medium] Economic troubles (79 words, 1 refs)
 - [Medium] Democratisation (122 words, 2 refs)
 - [Low] Politics (304 words, 7 refs)
 - [High] Foreign relations (63 words, 0 refs)
 - [Medium] Military (52 words, 1 refs)
 - [High] Administrative divisions (42 words, 0 refs)
 - [Medium] Human rights (175 words, 5 refs)
 - [Low] Geography (824 words, 3 refs)
 - [Medium] Climate (181 words, 1 refs)
 - [Low] Biodiversit


Sections in 'Zambia':
- Etymology (first 100 chars): Further information: Rhodesia (name) The territory of Zambia was known as Northern Rhodesia from 191
- History (first 100 chars): Main article: History of Zambia
- Prehistoric era (first 100 chars): Archaeological excavation work on the Zambezi Valley and Kalambo Falls shows a succession of human c


Found 49 heading divs

Sections in 'Zambia':
- Etymology (first 100 chars): Further information: Rhodesia (name) The territory of Zambia was known as Northern Rhodesia from 191
- History (first 100 chars): Main article: History of Zambia
- Prehistoric era (first 100 chars): Archaeological excavation work on the Zambezi Valley and Kalambo Falls shows a succession of human c
