In [1]:
# Import libraries

import requests
from bs4 import BeautifulSoup
import re

In [2]:
import requests

def search_zambian_pages(limit=30):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": "intitle:Zambia",
        "format": "json",
        "srlimit": limit
    }
    response = requests.get(url, params=params)
    print("Search API URL:", response.url)
    data = response.json()
    titles = [result["title"] for result in data["query"]["search"]]
    print("Found titles:", titles)
    return titles

titles = search_zambian_pages(limit=30)


Search API URL: https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=intitle%3AZambia&format=json&srlimit=30
Found titles: ['Zambia', 'President of Zambia', 'Outline of Zambia', 'Languages of Zambia', 'Economy of Zambia', 'China–Zambia relations', 'Geography of Zambia', 'Zambia Railways', 'India–Zambia relations', 'Politics of Zambia', 'Foreign relations of Zambia', 'Russia–Zambia relations', 'Zambia Open', 'History of Zambia', 'United States–Zambia relations', 'Prime Minister of Zambia', 'Demographics of Zambia', 'National Assembly of Zambia', 'Elections in Zambia', 'Religion in Zambia', 'Zambia national football team', 'Education in Zambia', 'Denmark–Zambia relations', 'Districts of Zambia', 'Christianity in Zambia', 'Transport in Zambia', 'Zambian Defence Force', 'Mass media in Zambia', 'Stand and Sing of Zambia, Proud and Free', 'Zambian Air Force']


In [7]:
def get_page_html(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "text"
    }
    response = requests.get(url, params=params)
    print(f"🌐 Parse API URL for '{title}': {response.url}")
    data = response.json()
    html = data["parse"]["text"]["*"]
    print(f"🧾 Length of HTML content for '{title}': {len(html)} characters")
    return html

# Test on the first title
html = get_page_html(titles[0])


🌐 Parse API URL for 'Zambia': https://en.wikipedia.org/w/api.php?action=parse&page=Zambia&format=json&prop=text
🧾 Length of HTML content for 'Zambia': 534000 characters


In [8]:
# Dictionary to hold title → HTML content
all_html_pages = {}

for title in titles:
    print(f"\n--- Processing: {title} ---")
    try:
        html = get_page_html(title)
        all_html_pages[title] = html
    except Exception as e:
        print(f"❌ Error processing '{title}': {e}")

print(f"\n✅ Processed {len(all_html_pages)} pages successfully.")



--- Processing: Zambia ---
🌐 Parse API URL for 'Zambia': https://en.wikipedia.org/w/api.php?action=parse&page=Zambia&format=json&prop=text
🧾 Length of HTML content for 'Zambia': 534000 characters

--- Processing: President of Zambia ---
🌐 Parse API URL for 'President of Zambia': https://en.wikipedia.org/w/api.php?action=parse&page=President+of+Zambia&format=json&prop=text
🧾 Length of HTML content for 'President of Zambia': 120782 characters

--- Processing: Outline of Zambia ---
🌐 Parse API URL for 'Outline of Zambia': https://en.wikipedia.org/w/api.php?action=parse&page=Outline+of+Zambia&format=json&prop=text
🧾 Length of HTML content for 'Outline of Zambia': 123730 characters

--- Processing: Languages of Zambia ---
🌐 Parse API URL for 'Languages of Zambia': https://en.wikipedia.org/w/api.php?action=parse&page=Languages+of+Zambia&format=json&prop=text
🧾 Length of HTML content for 'Languages of Zambia': 102798 characters

--- Processing: Economy of Zambia ---
🌐 Parse API URL for 'Econ

In [9]:
from bs4 import BeautifulSoup

def extract_sections(html):
    soup = BeautifulSoup(html, "html.parser")
    content_div = soup.find('div', class_='mw-parser-output')
    if not content_div:
        print("No main content div found!")
        return []

    sections = []

    # Find all divs with class starting with 'mw-heading'
    heading_divs = content_div.find_all('div', class_=lambda c: c and c.startswith('mw-heading'))

    for heading_div in heading_divs:
        # Extract the h2/h3/h4 tag inside the heading div
        header_tag = heading_div.find(['h2', 'h3', 'h4'])
        if not header_tag:
            continue

        section_title = header_tag.get_text(strip=True)

        content = []
        # Collect siblings until the next heading
        for sibling in heading_div.find_next_siblings():
            if sibling.name == 'div' and sibling.get('class') and any(c.startswith('mw-heading') for c in sibling.get('class')):
                break

            for edit_span in sibling.find_all('span', class_='mw-editsection'):
                edit_span.decompose()

            text = sibling.get_text(separator=' ', strip=True)
            if text:
                content.append(text)

        full_text = " ".join(content).strip()
        if full_text:
            sections.append((section_title, full_text))

    return sections


# ✅ Process and print sections from all pages
all_sections = {}

for page_title, html in all_html_pages.items():
    print(f"\n📘 Sections in: {page_title}")
    try:
        sections = extract_sections(html)
        all_sections[page_title] = sections
        if sections:
            for section_title, content in sections[:5]:  # Print first 5 sections only
                print(f"- {section_title} (first 100 chars): {content[:100]}")
        else:
            print("⚠️ No sections found.")
    except Exception as e:
        print(f"❌ Error processing '{page_title}': {e}")

print(f"\n✅ Extracted sections from {len(all_sections)} pages.")



📘 Sections in: Zambia
- Etymology (first 100 chars): Further information: Rhodesia (name) The territory of Zambia was known as Northern Rhodesia from 191
- History (first 100 chars): Main article: History of Zambia
- Prehistoric era (first 100 chars): Archaeological excavation work on the Zambezi Valley and Kalambo Falls shows a succession of human c
- Khoisan and Batwa (first 100 chars): Ancient (but graffitied) Rock Art in Nsalu Cave, Kasanka National Park in North-Central Zambia Moder
- The Bantu (Abantu) (first 100 chars): The Bantu people or Abantu (meaning people) are an enormous and diverse ethnolinguistic group that c

📘 Sections in: President of Zambia
- Presidential term (first 100 chars): The president is elected for a term of five years. Since 1991, There is a two-term limit for the pre
- Northern Rhodesia (first 100 chars): When the British colony of Northern Rhodesia was separated from Southern Rhodesia and British South 
- Zambia (first 100 chars): Upon independence and

In [10]:
def wiki_page_url(title):
    base_url = "https://en.wikipedia.org/wiki/"
    url = base_url + title.replace(" ", "_")
    print(f"Wikipedia page URL for '{title}': {url}")
    return url

# Dictionary to hold title → Wikipedia URL
all_page_urls = {}

for page_title in all_html_pages.keys():
    url = wiki_page_url(page_title)
    all_page_urls[page_title] = url

print(f"\n✅ Generated URLs for {len(all_page_urls)} pages.")


Wikipedia page URL for 'Zambia': https://en.wikipedia.org/wiki/Zambia
Wikipedia page URL for 'President of Zambia': https://en.wikipedia.org/wiki/President_of_Zambia
Wikipedia page URL for 'Outline of Zambia': https://en.wikipedia.org/wiki/Outline_of_Zambia
Wikipedia page URL for 'Languages of Zambia': https://en.wikipedia.org/wiki/Languages_of_Zambia
Wikipedia page URL for 'Economy of Zambia': https://en.wikipedia.org/wiki/Economy_of_Zambia
Wikipedia page URL for 'China–Zambia relations': https://en.wikipedia.org/wiki/China–Zambia_relations
Wikipedia page URL for 'Geography of Zambia': https://en.wikipedia.org/wiki/Geography_of_Zambia
Wikipedia page URL for 'Zambia Railways': https://en.wikipedia.org/wiki/Zambia_Railways
Wikipedia page URL for 'India–Zambia relations': https://en.wikipedia.org/wiki/India–Zambia_relations
Wikipedia page URL for 'Politics of Zambia': https://en.wikipedia.org/wiki/Politics_of_Zambia
Wikipedia page URL for 'Foreign relations of Zambia': https://en.wikiped

In [11]:
import re

def compute_features(text):
    word_count = len(text.split())
    ref_count = len(re.findall(r'\[citation needed\]|\[.*?\]', text))
    return word_count, ref_count

# Dictionary to hold features per page
all_page_features = {}

for title, html in all_html_pages.items():
    print(f"\n🔍 Computing features for: {title}")
    sections = extract_sections(html)
    
    section_features = []
    for section_title, section_text in sections:
        word_count, ref_count = compute_features(section_text)
        print(f"  - Section: {section_title[:50]}")
        print(f"    Words: {word_count}, References: {ref_count}")
        section_features.append({
            "section_title": section_title,
            "word_count": word_count,
            "ref_count": ref_count
        })
    
    all_page_features[title] = section_features

print(f"\n✅ Computed features for {len(all_page_features)} pages.")



🔍 Computing features for: Zambia
  - Section: Etymology
    Words: 47, References: 1
  - Section: History
    Words: 5, References: 0
  - Section: Prehistoric era
    Words: 79, References: 1
  - Section: Khoisan and Batwa
    Words: 147, References: 3
  - Section: The Bantu (Abantu)
    Words: 111, References: 1
  - Section: Bantu origins
    Words: 121, References: 3
  - Section: First Bantu settlement
    Words: 461, References: 6
  - Section: Second Bantu settlement
    Words: 92, References: 1
  - Section: Europeans
    Words: 431, References: 7
  - Section: British colonisation
    Words: 132, References: 1
  - Section: Independence
    Words: 243, References: 4
  - Section: Post Independence
    Words: 403, References: 5
  - Section: Economic troubles
    Words: 79, References: 1
  - Section: Democratisation
    Words: 122, References: 2
  - Section: Politics
    Words: 304, References: 7
  - Section: Foreign relations
    Words: 63, References: 0
  - Section: Military
    Word

In [15]:
import re

def compute_features(text):
    word_count = len(text.split())
    ref_count = len(re.findall(r'\[citation needed\]|\[.*?\]', text))
    return word_count, ref_count

def classify_effort(word_count, ref_count):
    if word_count > 200 and ref_count > 2:
        return "Low"
    elif 100 < word_count <= 200 or 1 <= ref_count <= 2:
        return "Medium"
    else:
        return "High"

# Dictionary to hold all computed data
all_page_analysis = {}

for title, html in all_html_pages.items():
    print(f"\n🔍 Analyzing: {title}")
    sections = extract_sections(html)

    section_data = []
    for section_title, section_text in sections:
        word_count, ref_count = compute_features(section_text)
        effort = classify_effort(word_count, ref_count)

        print(f"  - Section: {section_title[:50]}")
        print(f"    Words: {word_count}, References: {ref_count}, Effort: {effort}")

        section_data.append({
            "section_title": section_title,
            "word_count": word_count,
            "ref_count": ref_count,
            "effort": effort
        })

    all_page_analysis[title] = section_data

print(f"\n✅ Classified editing effort for {len(all_page_analysis)} pages.")



🔍 Analyzing: Zambia
  - Section: Etymology
    Words: 47, References: 1, Effort: Medium
  - Section: History
    Words: 5, References: 0, Effort: High
  - Section: Prehistoric era
    Words: 79, References: 1, Effort: Medium
  - Section: Khoisan and Batwa
    Words: 147, References: 3, Effort: Medium
  - Section: The Bantu (Abantu)
    Words: 111, References: 1, Effort: Medium
  - Section: Bantu origins
    Words: 121, References: 3, Effort: Medium
  - Section: First Bantu settlement
    Words: 461, References: 6, Effort: Low
  - Section: Second Bantu settlement
    Words: 92, References: 1, Effort: Medium
  - Section: Europeans
    Words: 431, References: 7, Effort: Low
  - Section: British colonisation
    Words: 132, References: 1, Effort: Medium
  - Section: Independence
    Words: 243, References: 4, Effort: Low
  - Section: Post Independence
    Words: 403, References: 5, Effort: Low
  - Section: Economic troubles
    Words: 79, References: 1, Effort: Medium
  - Section: Democra

In [16]:
import csv

def save_analysis_to_csv(analysis_dict, filename="wiki_effort_analysis.csv"):
    # Prepare rows as a list of dictionaries
    rows = []
    for page_title, sections in analysis_dict.items():
        for section in sections:
            rows.append({
                "page_title": page_title,
                "section_title": section["section_title"],
                "word_count": section["word_count"],
                "ref_count": section["ref_count"],
                "effort": section["effort"]
            })

    # Write rows to CSV
    with open(filename, mode='w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ["page_title", "section_title", "word_count", "ref_count", "effort"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"✅ Data saved to {filename}")

# Example usage:
save_analysis_to_csv(all_page_analysis)


✅ Data saved to wiki_effort_analysis.csv


In [60]:
# def analyze_page(title):
#     print(f"\n🔍 Analyzing: {title}")
#     html = get_page_html(title)
#     sections = extract_sections(html)
#     for section_title, content in sections:
#         word_count, ref_count = compute_features(content)
#         effort = classify_effort(word_count, ref_count)
#         print(f" - [{effort}] {section_title} ({word_count} words, {ref_count} refs)")

# # Try with one title
# analyze_page(titles[0])



🔍 Analyzing: Zambia
 - [Medium] Etymology (47 words, 1 refs)
 - [High] History (5 words, 0 refs)
 - [Medium] Prehistoric era (79 words, 1 refs)
 - [Medium] Khoisan and Batwa (147 words, 3 refs)
 - [Medium] The Bantu (Abantu) (111 words, 1 refs)
 - [Medium] Bantu origins (121 words, 3 refs)
 - [Low] First Bantu settlement (466 words, 6 refs)
 - [Medium] Second Bantu settlement (93 words, 1 refs)
 - [Low] Europeans (431 words, 7 refs)
 - [Medium] British colonisation (132 words, 1 refs)
 - [Low] Independence (243 words, 4 refs)
 - [Low] Post Independence (403 words, 5 refs)
 - [Medium] Economic troubles (79 words, 1 refs)
 - [Medium] Democratisation (122 words, 2 refs)
 - [Low] Politics (304 words, 7 refs)
 - [High] Foreign relations (63 words, 0 refs)
 - [Medium] Military (52 words, 1 refs)
 - [High] Administrative divisions (42 words, 0 refs)
 - [Medium] Human rights (175 words, 5 refs)
 - [Low] Geography (824 words, 3 refs)
 - [Medium] Climate (181 words, 1 refs)
 - [Low] Biodiversit


Sections in 'Zambia':
- Etymology (first 100 chars): Further information: Rhodesia (name) The territory of Zambia was known as Northern Rhodesia from 191
- History (first 100 chars): Main article: History of Zambia
- Prehistoric era (first 100 chars): Archaeological excavation work on the Zambezi Valley and Kalambo Falls shows a succession of human c


Found 49 heading divs

Sections in 'Zambia':
- Etymology (first 100 chars): Further information: Rhodesia (name) The territory of Zambia was known as Northern Rhodesia from 191
- History (first 100 chars): Main article: History of Zambia
- Prehistoric era (first 100 chars): Archaeological excavation work on the Zambezi Valley and Kalambo Falls shows a succession of human c


In [65]:
import requests
from bs4 import BeautifulSoup

def search_zambian_pages(limit=5):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": "intitle:Zambia",
        "format": "json",
        "srlimit": limit
    }
    response = requests.get(url, params=params)
    data = response.json()
    return [result["title"] for result in data["query"]["search"]]

def get_page_html(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "text"
    }
    response = requests.get(url, params=params)
    data = response.json()
    return data["parse"]["text"]["*"]

def clean_html_for_text(soup):
    # Remove styles and scripts
    for tag in soup(['style', 'script']):
        tag.decompose()
    # Remove footnote references
    for sup in soup.find_all('sup', class_='reference'):
        sup.decompose()
    # Remove edit section spans
    for span in soup.find_all('span', class_='mw-editsection'):
        span.decompose()
    # Optionally remove tables (infoboxes, navboxes) if they clutter sections
    for table in soup.find_all('table'):
        table.decompose()
    return soup


def classify_effort(word_count):
    if word_count <= 100:
        return "Low"
    elif word_count <= 300:
        return "Medium"
    else:
        return "High"
    
    
def extract_sections_with_effort(html):
    soup = BeautifulSoup(html, "html.parser")
    soup = clean_html_for_text(soup)

    content_div = soup.find('div', class_='mw-parser-output')
    if not content_div:
        return []

    sections = []
    heading_divs = content_div.find_all('div', class_=lambda c: c and c.startswith('mw-heading'))

    for heading_div in heading_divs:
        header_tag = heading_div.find(['h2', 'h3', 'h4'])
        if not header_tag:
            continue

        section_title = header_tag.get_text(strip=True)
        content = []

        for sibling in heading_div.find_next_siblings():
            if sibling.name == 'div' and sibling.get('class') and any(c.startswith('mw-heading') for c in sibling.get('class')):
                break
            # Clean references and edit sections inside siblings as well
            for sup in sibling.find_all('sup', class_='reference'):
                sup.decompose()
            for span in sibling.find_all('span', class_='mw-editsection'):
                span.decompose()

            text = sibling.get_text(separator=' ', strip=True)
            if text:
                content.append(text)

        full_text = " ".join(content).strip()
        word_count = len(full_text.split())
        if full_text:
            sections.append({
                "title": section_title,
                "text": full_text,
                "word_count": word_count,
                "char_count": len(full_text),
                "effort": classify_effort(word_count)
            })

    return sections

# --- Main Driver ---
titles = search_zambian_pages(limit=5)

for title in titles:
    print(f"\n📄 Page: {title}")
    try:
        html = get_page_html(title)
        sections = extract_sections_with_effort(html)

        print("Sections:")
        for sec in sections[:5]:  # Change this to show more sections if desired
            preview = sec["text"][:100].replace('\n', ' ')
            print(f" - {sec['title']} (first 100 chars): {preview} (Effort: {sec['effort']})")

        # Page-level features
        section_count = len(sections)
        total_words = sum(s["word_count"] for s in sections)
        total_chars = sum(s["char_count"] for s in sections)
        avg_words = total_words / section_count if section_count else 0

        print("\n🧮 Features:")
        print(f" - Section count: {section_count}")
        print(f" - Total words: {total_words}")
        print(f" - Total characters: {total_chars}")
        print(f" - Avg. section length (words): {avg_words:.2f}")
    except Exception as e:
        print(f"❌ Failed to process '{title}': {e}")



📄 Page: Zambia
Sections:
 - Etymology (first 100 chars): Further information: Rhodesia (name) The territory of Zambia was known as Northern Rhodesia from 191 (Effort: Low)
 - History (first 100 chars): Main article: History of Zambia (Effort: Low)
 - Prehistoric era (first 100 chars): Archaeological excavation work on the Zambezi Valley and Kalambo Falls shows a succession of human c (Effort: Low)
 - Khoisan and Batwa (first 100 chars): Ancient (but graffitied) Rock Art in Nsalu Cave, Kasanka National Park in North-Central Zambia Moder (Effort: Medium)
 - The Bantu (Abantu) (first 100 chars): The Bantu people or Abantu (meaning people) are an enormous and diverse ethnolinguistic group that c (Effort: Medium)

🧮 Features:
 - Section count: 45
 - Total words: 13763
 - Total characters: 85147
 - Avg. section length (words): 305.84

📄 Page: Outline of Zambia
Sections:
 - General reference (first 100 chars): An enlargeable basic map of Zambia Pronunciation : / ˈ z æ m b i ə / Common Englis