CS119 Big Data

Spring 2024

Objective:
To use Beautiful Soup, the Python library, to extract information from the web about
"Beautiful Soup", the Lewis Carroll poem.

In [1]:
from bs4 import BeautifulSoup
import requests

def make_soup(url, timeout=10):
    '''
    Creates a soup object from a given url on the web.

    Parameters:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up (default 10).

    Returns:
        A BeautifulSoup object built from the response body using the
        "html.parser" parser.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
        requests.exceptions.HTTPError: if the response status is 4xx or 5xx.
    '''
    # Without a timeout, requests.get can block indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail here on an error response instead of parsing an error page and
    # failing later on a missing element.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

# books()

In [2]:
def books():
    '''
    Returns the names of the canon books as a list.

    The names are scraped from the navigation bar of the wiki home page:
    second top-level tab (literature) -> first submenu entry (canon books)
    -> the text of every <span> in that submenu.

    Returns:
        A list of strings, one per book, in page order.
    '''
    home_url = "https://aliceinwonderland.fandom.com/wiki/Alice_in_Wonderland_Wiki"
    soup = make_soup(home_url)
    nav_bar = soup.find(class_='fandom-community-header__local-navigation')
    tabs = nav_bar.find(class_='wds-tabs')
    # Gets the second child in the tabs - the lit tab
    lit_tab = tabs.find_all("li", recursive=False)[1]
    lit_content = lit_tab.find("ul", class_="wds-list wds-is-linked")
    # Canon books is the first option in the literature tab.
    # recursive=False limits the search to direct <li> children, so the index
    # refers to the menu entries themselves and not to items nested inside them.
    canon_books_tab = lit_content.find_all("li", recursive=False)[0]
    canon_books = canon_books_tab.find("ul", class_="wds-list wds-is-linked")
    # All names are listed in spans
    book_names = canon_books.find_all("span")
    book_names_text = [name.get_text() for name in book_names]
    return book_names_text

## Test the output against the known answer

In [3]:
# Known answer that the output of books() is compared against.
p1_answer = [
    "Alice's Adventures in Wonderland",
    'Through the Looking-Glass, and What Alice Found There',
    "Alice's Adventures Underground"
]

result = books()
print(result)

# The comparison is an exact list match: same titles, same order.
print('CORRECT' if result == p1_answer else "INCORRECT")

["Alice's Adventures in Wonderland", 'Through the Looking-Glass, and What Alice Found There', "Alice's Adventures Underground"]
CORRECT


# poems()

In [4]:
def poems():
    '''
    Returns the names of the canon poems and their urls, as a list of tuples.

    The data is scraped from the navigation bar of the wiki home page:
    second top-level tab (literature) -> second nested-dropdown entry
    (canon poems). Names come from the <span> elements of that submenu and
    urls from the href attribute of its <a> elements; the two sequences are
    paired positionally.

    Returns:
        A list of (name, url) tuples in page order.
    '''
    home_url = "https://aliceinwonderland.fandom.com/wiki/Alice_in_Wonderland_Wiki"
    soup = make_soup(home_url)
    nav_bar = soup.find(class_='fandom-community-header__local-navigation')
    tabs = nav_bar.find(class_='wds-tabs')
    # Gets the second child in the tabs - the lit tab
    lit_tab = tabs.find_all("li", recursive=False)[1]
    lit_content = lit_tab.find("ul", class_="wds-list wds-is-linked")
    # Canon poems is the second option in the literature tab.
    # recursive=False limits the search to direct <li> children, so index 1 is
    # the second menu entry and cannot be an item nested inside another entry.
    canon_poems_tab = lit_content.find_all(
        "li", class_="wds-dropdown-level-nested", recursive=False
    )[1]
    canon_poems = canon_poems_tab.find("ul", class_="wds-list wds-is-linked")
    poem_names = canon_poems.find_all("span")
    poem_names_text = [name.get_text() for name in poem_names]
    # Links are listed as the href part of an a tag
    poem_links = canon_poems.find_all("a")
    poem_links_text = [link['href'] for link in poem_links]
    return list(zip(poem_names_text, poem_links_text))

## Test the output against the known answer

In [5]:
# Known answer that the output of poems() is compared against: (name, url) pairs.
p2_answer = [
    ('Jabberwocky', 'https://aliceinwonderland.fandom.com/wiki/Jabberwocky'),
    ('How Doth the Little Crocodile', 'https://aliceinwonderland.fandom.com/wiki/How_Doth_the_Little_Crocodile'),
    ('The Walrus and the Carpenter', 'https://aliceinwonderland.fandom.com/wiki/The_Walrus_and_the_Carpenter_(poem)'),
    ('You Are Old, Father William', 'https://aliceinwonderland.fandom.com/wiki/You_Are_Old,_Father_William'),
    ("Humpty Dumpty's Recitation", 'https://aliceinwonderland.fandom.com/wiki/Humpty_Dumpty%27s_Recitation'),
    ('Turtle Soup', 'https://aliceinwonderland.fandom.com/wiki/Turtle_Soup'),
    ('Tis the Voice of the Lobster', 'https://aliceinwonderland.fandom.com/wiki/Tis_the_Voice_of_the_Lobster')
]

result = poems()
print(result)

# The comparison is an exact list match: same tuples, same order.
print('CORRECT' if result == p2_answer else "INCORRECT")

[('Jabberwocky', 'https://aliceinwonderland.fandom.com/wiki/Jabberwocky'), ('How Doth the Little Crocodile', 'https://aliceinwonderland.fandom.com/wiki/How_Doth_the_Little_Crocodile'), ('The Walrus and the Carpenter', 'https://aliceinwonderland.fandom.com/wiki/The_Walrus_and_the_Carpenter_(poem)'), ('You Are Old, Father William', 'https://aliceinwonderland.fandom.com/wiki/You_Are_Old,_Father_William'), ("Humpty Dumpty's Recitation", 'https://aliceinwonderland.fandom.com/wiki/Humpty_Dumpty%27s_Recitation'), ('Turtle Soup', 'https://aliceinwonderland.fandom.com/wiki/Turtle_Soup'), ('Tis the Voice of the Lobster', 'https://aliceinwonderland.fandom.com/wiki/Tis_the_Voice_of_the_Lobster')]
CORRECT


# poem_title_text()

In [6]:
def poem_title_text(n):
    '''
    Returns the corresponding poem title and text, according to the order of outputs of poems(). Only works for n=2 or n=5.

    Parameters:
        n: Zero-based index into the list returned by poems(); must be 2 or 5.

    Returns:
        A (title, text) tuple. The text is the concatenation of the <p>
        elements that follow the element with id "Text" on the poem's page,
        up to (not including) the next <h2> heading.

    Raises:
        ValueError: if n is not 2 or 5.
        IndexError: if the page has no span with id "Text".
    '''
    if n not in [2, 5]:
        raise ValueError('n can only be 2 or 5')
    title, link = poems()[n]
    soup = make_soup(link)
    text_heading = soup.select('span#Text')[0]
    poem_paragraphs = []
    # Walk forward through the document, but stop at the next <h2> so that
    # paragraphs belonging to later sections are not appended to the poem.
    for element in text_heading.find_all_next(['p', 'h2']):
        if element.name == 'h2':
            break
        poem_paragraphs.append(element.get_text())
    return title, "".join(poem_paragraphs)



## Display the output

In [7]:
# Show the two supported poems: the title, a blank line, then the body.
for i in (2, 5):
    title, text = poem_title_text(i)
    print(f"{title}\n")
    print(text)

The Walrus and the Carpenter

The sun was shining on the sea,
Shining with all his might:
He did his very best to make
The billows smooth and bright--
And this was odd, because it was
The middle of the night.
The moon was shining sulkily,
Because she thought the sun
Had got no business to be there
After the day was done--
"It's very rude of him," she said,
"To come and spoil the fun!"
The sea was wet as wet could be,
The sands were dry as dry.
You could not see a cloud, because
No cloud was in the sky:
No birds were flying overhead--
There were no birds to fly.
The Walrus and the Carpenter
Were walking close at hand;
They wept like anything to see
Such quantities of sand:
"If this were only cleared away,"
They said, "it would be grand!"
"If seven maids with seven mops
Swept it for half a year.
Do you suppose," the Walrus said,
"That they could get it clear?"
"I doubt it," said the Carpenter,
And shed a bitter tear.
"O Oysters, come and walk with us!"
The Walrus did beseech.
"A pleasant