# Wikipedia Article Scraper

Author:  Devin Patel  
Purpose: To scrape a random selection of Wikipedia articles.

## Scraper

In [1]:
# Imports
from bs4 import BeautifulSoup   # For HTML parsing
import requests                 # HTTP requests
import re                       # Regular expressions

In [2]:
# Request wikipedia page and return main content
# Returns tuple (page_title, {'subsection_title': 'subsection_content'})
def wikipage(url):
    """Download a Wikipedia article and group its paragraphs by section.

    Args:
        url: Address of the article to fetch.

    Returns:
        A tuple ``(page_title, page_dict)``. ``page_dict`` maps each
        second-level heading to the text of the paragraphs beneath it;
        paragraphs that precede the first heading are stored under
        ``'Overview'``. Paragraphs of one section are joined with a single
        space. Returns ``None`` when the request fails or the expected page
        elements are missing.
    """
    FIRST_SECTION = 'Overview'
    REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection hangs forever

    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException:
        return None

    # Parse main content portion of page and the article title
    soop = BeautifulSoup(page.text, 'html')
    main_content = soop.find('main', {'id': 'content', 'class': 'mw-body'})
    title_tag = soop.find('span', {'class': 'mw-page-title-main'})
    if main_content is None or title_tag is None:
        return None
    page_title = title_tag.text

    # Keep the last parser-output block that contains no mw:File span.
    article_contents = None
    for content in main_content.find_all('div', {'class': 'mw-parser-output'}):
        if not content.find('span', {'typeof': 'mw:File'}):
            article_contents = content
    if article_contents is None:
        return None

    # Read each paragraph, collecting them per section. Stop at Notes section.
    paragraphs = {}
    current_section = FIRST_SECTION
    for sub in article_contents.find_all(['h2', 'p']):
        if sub.name == 'h2':
            # The id may sit on a child span or on the heading itself.
            if sub.get('id') == 'Notes' or sub.find('span', {'id': 'Notes'}):
                break

            # Fall back to the heading's own text when it has no
            # mw-headline span, instead of discarding the whole page.
            sub_header = sub.find('span', {'class': 'mw-headline'})
            if sub_header is not None:
                current_section = sub_header.text
            else:
                current_section = sub.get_text(strip=True)
            continue

        # Remove footnote references such as "[12]", then skip paragraphs
        # that are empty or whitespace only.
        sub_text = re.sub(r'\[\d+\]', '', sub.text).strip()
        if sub_text:
            paragraphs.setdefault(current_section, []).append(sub_text)

    # Join with a space so consecutive paragraphs do not run together.
    page_dict = {
        section: ' '.join(parts) for section, parts in paragraphs.items()
    }
    return (page_title, page_dict)
# End of wikipage()

In [3]:
# Output status bar
def progress_bar(progress, total):
    """Redraw a 100-character text progress bar on the current line.

    One block character is drawn per whole percent of ``progress / total``;
    the remainder is padded with dashes and the percentage is shown with
    two decimals. The line ends with a carriage return so the next call
    overwrites it.
    """
    pct = (progress / float(total)) * 100
    filled = int(pct)
    blocks = '█' * filled
    dashes = '-' * (100 - filled)
    print("\r|" + blocks + dashes + "| " + format(pct, '.2f') + "%", end="\r")


# Loop and collect a number of random wikipedia pages
COUNT = 100
RANDOM_URL = 'https://en.wikipedia.org/wiki/Special:Random'
# Upper bound on requests, so that persistent failures (network down, page
# layout changed) cannot keep the loop running forever.
MAX_ATTEMPTS = COUNT * 10

# Will contain tuples (page_title, {'subsection_title': 'subsection_content'})
articles = []

attempts = 0
while len(articles) < COUNT and attempts < MAX_ATTEMPTS:
    attempts += 1
    page = wikipage(RANDOM_URL)
    # Keep only pages that were fetched and yielded at least one section
    if page and page[1]:
        articles.append(page)
    progress_bar(len(articles), COUNT)

if len(articles) < COUNT:
    print(f"\nStopped after {attempts} attempts with "
          f"{len(articles)} of {COUNT} articles collected.")


|████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00%

In [4]:
# Print the contents of a page from wikipage()
def print_page(wikipage):
    """Print an article as returned by wikipage().

    Args:
        wikipage: Sequence whose first item is the article title and whose
            second item maps section titles to section text.
    """
    heading = wikipage[0]
    sections = wikipage[1]
    print("Article Title:", heading, end='\n\n')

    # Section title on its own line, then the tab-indented text and a blank line
    for name, body in sections.items():
        print(f"{name}:\n\t{body}\n")
# End of print_page()

# Pick one of the collected articles at random and print it as a sample.
# Note: random.choice raises IndexError if `articles` is empty.
import random
random_article = random.choice(articles)
print_page(random_article)

Article Title: Southern Tagalog 10

Overview:
	The Southern Tagalog 10 was a group of activists abducted and "disappeared" in 1977 during martial law in the Philippines under Proclamation No. 1081 issued by President of Philippines Ferdinand E. Marcos. Of the 10 university students and professors who were abducted, only three, Virgilio Silva, Salvador Panganiban, and Modesto Sison, "surfaced" later after being killed by suspected agents of the state. Two of those who surfaced were apparently summarily executed. The rest were never found.

Background:
	The victims, most of them in their early twenties, all belonged to a network of community organizations in the Southern Tagalog region, Philippines. They were abducted in late July 1977 at the Makati Medical Center in Metro Manila.The incident is believed to be the single biggest case of involuntary disappearance during martial law. Bonifacio Ilagan, brother of one of the victims and vice chair of Samahan ng Ex-Detainees Laban sa Detensyo