# Wikipedia Word Analyzer

Author:  Devin Patel  
Purpose: To scrape a selection of Wikipedia articles and perform sentence analysis on them using TextBlob.  

## Scraper

In [58]:
# Imports
from bs4 import BeautifulSoup   # For HTML parsing
import requests                 # HTTP requests
import re                       # Regular expressions
import pickle                   # Saving and loading data

In [59]:
# Request wikipedia page and return main content
# Returns tuple (page_title, {'subsection_title': 'subsection_content'})
def wikipage(url):
    """Download a Wikipedia article and split its body text by section.

    Paragraphs found before the first ``h2`` heading are stored under
    ``'Overview'``; every later paragraph is stored under the most recent
    ``h2`` heading.  Parsing stops at the "Notes" heading.  Footnote markers
    such as ``[12]`` are removed, and the paragraphs of one section are
    joined with a single space so that the last sentence of one paragraph
    is not fused with the first sentence of the next.

    Args:
        url: Address of the page to request.

    Returns:
        A tuple ``(page_title, {section_title: section_text})``, or ``None``
        when the request fails or the expected page elements are missing.
    """
    FIRST_SECTION = 'Overview'
    REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection blocks forever

    # Only network/HTTP failures are treated as "try another page".  Other
    # exceptions are real bugs and must surface instead of being retried
    # silently by the caller.
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException:
        return None

    # Parse main content portion of page
    soop = BeautifulSoup(page.text, 'html')
    main_content = soop.find('main', {'id': 'content', 'class': 'mw-body'})

    # Parse article title
    title_tag = soop.find('span', {'class': 'mw-page-title-main'})
    if main_content is None or title_tag is None:
        return None
    page_title = title_tag.text

    # Keep the last parser-output container that holds no embedded file.
    article_body = None
    for candidate in main_content.find_all('div', {'class': 'mw-parser-output'}):
        if not candidate.find('span', {'typeof': 'mw:File'}):
            article_body = candidate
    if article_body is None:
        return None

    # section title -> list of cleaned paragraphs, in document order
    section_paragraphs = {}
    current_section = FIRST_SECTION

    # Read each paragraph, collect them per section. Stop at Notes section.
    for sub in article_body.find_all(['h2', 'p']):
        # Check if 2nd level header is met. If so, change current section.
        if sub.name == 'h2':
            # The id may sit on an inner span or on the heading itself.
            if sub.get('id') == 'Notes' or sub.find('span', {'id': 'Notes'}):
                break

            sub_header = sub.find('span', {'class': 'mw-headline'})
            if sub_header is not None:
                current_section = sub_header.text
            else:
                # No headline span: fall back to the heading's own text
                # rather than throwing the whole article away.
                current_section = sub.text.strip()

        # Check if paragraph is met.
        elif sub.name == 'p':
            # Remove footnote references using regex
            sub_text = re.sub(r'\[\d+\]', '', sub.text).strip()

            # If the paragraph is just whitespace, skip it
            if not sub_text:
                continue

            section_paragraphs.setdefault(current_section, []).append(sub_text)
    # End of for loop

    page_dict = {
        section: ' '.join(paragraphs)
        for section, paragraphs in section_paragraphs.items()
    }
    return (page_title, page_dict)
# End of wikipage()

In [60]:
# Output status bar
def progress_bar(progress, total):
    """Draw a 100-character text progress bar on the current console line.

    The line starts and ends with a carriage return, so consecutive calls
    overwrite each other instead of scrolling.

    Args:
        progress: Number of items completed so far.
        total: Number of items expected in total.
    """
    pct = (progress / float(total)) * 100
    filled = int(pct)
    cells = ['█' * filled, '-' * (100 - filled)]
    line = "\r|" + "".join(cells) + "| " + format(pct, ".2f") + "%"
    print(line, end="\r")


# Loop and collect a number of random wikipedia pages
COUNT = 100
RANDOM_URL = 'https://en.wikipedia.org/wiki/Special:Random'
# Upper bound on requests, so a dead connection or a markup change that makes
# every fetch fail cannot keep this cell spinning forever.
MAX_ATTEMPTS = COUNT * 10

# Will contain tuples (page_title, {'subsection_title': 'subsection_content'})
articles = []

attempts = 0
while len(articles) < COUNT and attempts < MAX_ATTEMPTS:
    attempts += 1
    page = wikipage(RANDOM_URL)
    # Keep only pages that parsed successfully and contain at least one section
    if page and page[1]:
        articles.append(page)
    progress_bar(len(articles), COUNT)

if len(articles) < COUNT:
    print(f"\nOnly collected {len(articles)} of {COUNT} articles after {attempts} attempts.")

# Context manager guarantees the file is flushed and closed after the dump
with open(r'articles.pkl', 'wb') as pkl_file:
    pickle.dump(articles, pkl_file)


|████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00%

In [61]:
# Print the contents of a page from wikipage()
def print_page(wikipage):
    """Print an article as its title followed by each section and its text.

    Args:
        wikipage: Sequence whose first item is the article title and whose
            second item is a dict mapping section titles to section text.
    """
    heading = wikipage[0]
    sections = wikipage[1]

    # Title line followed by one blank line
    print("Article Title: " + str(heading) + "\n")

    # Each section: "<title>:" then the tab-indented text and a blank line
    for name in sections:
        body = sections[name]
        print(f"{name}:\n\t{body}\n")
# End of print_page()

# Randomly select an article and print it
# `articles` is the list of (page_title, sections_dict) tuples built by the
# scraping loop; random.choice raises IndexError if that list is empty.
import random
random_article = random.choice(articles)
# Show the chosen article's title and every section as a quick sanity check
print_page(random_article)

Article Title: Annalee Whitmore Fadiman

Overview:
	Annalee Whitmore Fadiman (May 27, 1916 – February 5, 2002) was a scriptwriter for MGM, and World War II foreign correspondent for Life and Time magazines. She was the co-author with Theodore H. White of Thunder Out of China, a book on the Chinese civil war.

Early life:
	Fadiman was born in Price, Utah, the daughter of bank president Leland Whitmore and Anne Sharp Whitmore, who later became a librarian at New York Public Library. Fadiman graduated from Stanford University in 1937. She was the first woman to be managing editor of the Stanford Daily newspaper. She moved from San Francisco, where she briefly worked at the Agricultural Adjustment Administration, to Los Angeles taking a secretarial pool job at MGM. She wrote several screen treatments including Andy Hardy Meets Debutante (1940) and a screen adaptation for Tish.

Career:
	MGM offered her a contract but once the war began, Fadiman found "the prospect of seven years of Hollywo

## Prepare Data

In [62]:
# Imports
import textblob         # For sentence and sentiment analysis
import pandas as pd
import scipy.io
from matplotlib import pyplot as plt

In [63]:
# Exports pyplot figure
import os

def exportFig(fname):
    """Save the current pyplot figure as ``images/<fname>``.

    The ``images`` directory is created on demand.  If it cannot be created
    (for example because a regular file with that name already exists, or
    because of missing permissions), a message is printed and the figure is
    not saved.

    Args:
        fname: File name of the figure inside the ``images`` directory.
    """
    exportPath = "images"

    try:
        # exist_ok=True removes the check-then-create race, and still raises
        # when the path exists but is not a directory.
        os.makedirs(exportPath, exist_ok=True)
    except OSError:
        print("Can't create a directory to store figures, so they will not be saved.")
        return

    plt.savefig(os.path.join(exportPath, fname))

In [73]:
# Import Articles
# NOTE: pickle can execute arbitrary code while loading; only open files that
# were written by this notebook itself.
with open(r'articles.pkl', 'rb') as pkl_file:
    articles = pickle.load(pkl_file)

# Combine all section contents into one string per article.
# Sections are joined with a space so the last sentence of one section is not
# fused with the first sentence of the next, which would distort the sentence
# and word counts below.
articles = [
    (title, ' '.join(sections.values()))
    for title, sections in articles
]


# Create a dataframe of the articles
df = pd.DataFrame(articles, columns=['title', 'content'])


# Parse every article once and reuse the result for all derived columns
blobs = [textblob.TextBlob(text) for text in df['content']]

# Create a new column for the number of sentences in each article
df['sentences'] = [len(blob.sentences) for blob in blobs]

# Create a new column for the number of words in each article
df['words'] = [len(blob.words) for blob in blobs]

# Create a new column for the subjectivity of each article
df['subjectivity'] = [blob.sentiment.subjectivity for blob in blobs]

# Create a new column for the sentiment of each article
df['Polarity'] = [blob.sentiment.polarity for blob in blobs]

# Data is ready
print(f"\nShape of main dataframe: {df.shape}")
df

NameError: name 'X_df' is not defined