In [None]:
## CLASS 6 + 7: Parsing HTML for specific content

In [None]:
# Once we download an HTML file, how do we extract the data we want? A single webpage can easily be 20 pages worth of
# HTML code, and oftentimes up to 100 pages. Thus, simply downloading the webpage code and copy-pasting or using Ctrl + F 
# is horribly inefficient.

# Let's do this again
import requests

# Download the simple example page. html_text keeps the raw HTML as one string,
# which is what gets handed to BeautifulSoup further down.
example_page_url = "http://mason.gmu.edu/~jlee17/python_workshop_files/example_data/index-very-simple.html"
my_response_object = requests.get(example_page_url)
html_text = my_response_object.text
print(html_text)

In [None]:
# bs4 (aka BeautifulSoup) is an amazingly powerful method of parsing HTML files
# The syntax [from .... import ....] is used when you only want certain functions.
# Without this syntax (that is, if we just did [import bs4]), we'd need bs4.BeautifulSoup() instead of BeautifulSoup()
# Maybe a bit silly, but this is the standard naming convention in Python for the bs4 library.

from bs4 import BeautifulSoup

# Let's transform our text into a Soup object. 
# The "lxml" parameter there just specifies how Python should parse it. 
# lxml is a particularly good parser, but it may not be available on every operating system and version.
# If your computer doesn't already have it, go to your command prompt and type in "pip install lxml" (no quotes)
# If that doesn't work, you can replace "lxml" with "html.parser", which Python has by default.
# Parse the downloaded HTML text into a Soup object, using lxml as the parser
soup = BeautifulSoup(html_text, features="lxml")

# Soup objects bring their own prettify() method, so pprint is never needed for them.
# prettify() is built for HTML: it indents the markup so you can see which tags sit
# inside which other tags.
pretty_html = soup.prettify()
print(pretty_html)

# The .text attribute drops every HTML tag and leaves only the text of the page:
page_text = soup.text
print(page_text)

In [None]:
# I wonder what else is inside our soup object? We can not only print it and display the text, but there's also
# a whole universe of functionality that we can apply to soup objects, especially for more complex pages.
# Soup objects are made up of NODES.
# Each node represents an HTML tag in its entirety, including all the tags that are inside of it.
# The original Soup object is itself a node. It is the "root node", with all other nodes being its children, grandchildren, etc.

# Below, the title attribute gets the <title> NODE in its entirety from the document

# Also note the special {0} syntax in the string - this can be simpler than simply doing
# "the" + variable + " other string" every time. You can use {0}, {1}, etc. for multiple variables
# soup.title is the first <title> node of the document; grab it once and inspect it
title_node = soup.title

# The node itself prints as the full element, tag and text together
print("Here is the title, tag AND text: {0}".format(title_node))
# e.g. <title>A Sample Title</title>

# The node's 'name' attribute is just the tag name
print("Here is the title, tag ONLY: {0}".format(title_node.name))
# e.g. title

# The node's 'string' attribute is just the text inside the tag
print("Here is the title, text ONLY: {0}".format(title_node.string))
# e.g. A Sample Title

In [None]:
# More basics
# .parent walks one level UP the tree: here, from the <title> node to the node that contains it
print("Here is the title tag's PARENT's tag NAME: {0}".format(soup.title.parent.name))
# e.g. head

# soup.p is the FIRST <p> node in the document
print("Here is the first entire 'p' element: {0}".format(soup.p))
# e.g. <p>Here is a paragraph. But it doesn't have any attributes that you can capture!</p>

# When you want to get HTML attributes (not to be confused with Python's object attributes)
# you treat the HTML node like a dictionary: node['attribute_name'] gives that attribute's value.
# Try to follow along with what the code below is grabbing
print("Here is the value of the id attribute for the first 'div' element: {0}".format(soup.div['id']))
# e.g. all_the_lists

# soup.a is the FIRST <a> (hyperlink) node in the document
print("Here is the first entire 'a' element: {0}".format(soup.a))
# e.g. <a href="http://www.google.com">Google</a>

In [None]:
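# Difference between .text and .string
# .string only gives you a value when the element contains exactly one piece of text (directly, or through a
# single child tag); if the element holds several children, .string is None.
# .text is far more complete: it combines the text of the current element plus all of its subelements
# and merges all their values into one string.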

In [None]:
# soup.title is all well and good, because there's only ever ONE title element in an HTML page. 
# But what about an element like <a>, which can appear many many times (and is very important)?
# Doing soup.a is ok when you only care about the very first hyperlink. 
# But HTML can get really dense. What do you do when you want elements that are deep in the text quickly?

# Rather than just the first, let's get ALL the elements with an 'a' tag and put them in a list for us
# All soup objects (i.e. nodes) have a find_all method, which searches all nodes INSIDE that node in the tree
# If you call find_all on the root node (soup), it will thus search the entire document.
all_a_tags = soup.find_all('a')

# find_all() hands back a ResultSet object, which can be looped over exactly like a List.
# For every hyperlink found, show four views of the same node.
separator = "----------------"
for a_tag in all_a_tags:
    print(separator)
    # 1. The whole element, tag and text together
    print(a_tag)
    # 2. Only the value of its 'href' HTML attribute
    link_target = a_tag['href']
    print(link_target)
    # 3. Only the text inside the tag
    link_text = a_tag.string
    print(link_text)
    # 4. Only the name of the tag itself
    tag_name = a_tag.name
    print(tag_name)

# Example Output:
# <a href="http://www.google.com">Google</a>
# http://www.google.com
# Google
# a

In [None]:
## find vs find_all
# 'find' just finds the FIRST match inside current node.
# 'find_all' finds ALL matches inside the current node.

# Just like with requests.get(), you can attach a dictionary on the end of the find() or find_all() method
# That dictionary contains the HTML attribute(s) that the node must also match.

# The HTML attributes a node must carry, written as {attribute name: required value}
required_attributes = {'id': 'file_links'}

# So below we are not asking for "the first div tag" in the soup object, but for
# "the first div tag whose 'id' attribute is equal to 'file_links'"
file_links_div = soup.find("div", required_attributes)

# Look at the node that came back from several angles
print(type(file_links_div))
print(file_links_div.prettify())
print(file_links_div['id'])
print(file_links_div.name)
print(file_links_div.string)
print(file_links_div.text)

In [None]:
# Ok, so how might we find only a select group of hyperlinks from all the hyperlinks?
# In this case, just the hyperlinks inside the first list (the <ul> whose id is "the_first_list")?

# Also, note that if you're only selecting by a single attribute (such as ID here), you can just insert the
# dictionary right in without putting it in a variable first.

# First, get the proper ul tag
first_list_tag = soup.find("ul",{"id":"the_first_list"})

# Then, from that ul tag, get all the 'a' tags INSIDE it.
# Calling find_all on a node (instead of on soup) only searches within that node.
a_elements = first_list_tag.find_all("a")

for a_element in a_elements:
    print(a_element['href'])
    print(a_element.string)

In [None]:
# Advanced Concept - Lambda Functions

# Lambda functions take some explaining. Basically, they're miniature, one-time-use functions that have no name.
# They pass in a variable L (or whatever you want to name it), which is defined by BeautifulSoup in this case to be a string.
# In this case, we want it to return TRUE if it's a match but FALSE if it's not a match
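# Imagine a for loop, looping through every 'ul' element tag in the file and checking its 'id' attribute
# (inside the lambda, L stands for that id value, which is None when the tag has no id at all):

# all_the_ul_tags = soup.find_all("ul")
# selected_ul_elements = []
# for ul_tag in all_the_ul_tags:
#     if ul_tag.get('id') and ul_tag.get('id').endswith("_list"):
#         selected_ul_elements.append(ul_tag)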

# This is what the lambda function below is doing. Five lines of code into one line! Much cleaner, once you get the syntax.
# The BeautifulSoup package understands to make a list of only those TRUE elements.
# Also, the reason for the strange "L and" at the beginning is a fancy way of preventing errors: when a tag has no id,
# L is None, and "L and ..." stops right there instead of crashing on None.endswith(). More on this in the workshop.
# Keep only the <ul> tags whose 'id' value ends in "_list".
# The lambda is the test applied to the 'id' value; the "L and" part guards against a missing value.
id_filter = {'id' : lambda L: L and L.endswith("_list")}
selected_ul_elements = soup.find_all("ul", id_filter)

# For each matching list, show the whole element and then just the hyperlinks inside it
divider = "-----------------------"
for element in selected_ul_elements:
    print(divider)
    print(element)
    print(divider)
    links_in_list = element.find_all("a")
    print(links_in_list)

In [None]:
# Another tool you'll find useful is urljoin()
# Oftentimes you get what are known as relative links, which need to be joined with the base link to make a full hyperlink
# The cool part about urljoin() (vs. string concatenation) is that it doesn't care about making sure the slashes match
from urllib.parse import urljoin

# The two pieces we want to glue together: note that BOTH carry a slash where they meet
base_url = "http://www.google.com/"
relative_link = "/some_random_page.html"

# Plain string concatenation keeps both slashes, giving a doubled "//" in the middle
my_link = base_url + relative_link
print(my_link)

# urljoin() sorts the slashes out for us
print(urljoin(base_url, relative_link))

In [None]:
# ADVANCED Recipe: Download all Forum posts from a specific forum
# Note that this is a very complex program! Don't expect to understand it your first time through
# With experience, continue to go back to this recipe.
# Take pieces of it apart in new cells and play with them

from bs4 import BeautifulSoup
import requests
import re
import unicodecsv as csv
from urllib.parse import urljoin

# Loop through all the LINKS on a PAGE
def go_through_thread_links(all_a_links, base_page):
    """Download every thread page in all_a_links and pass its posts to go_through_page_posts().

    all_a_links: iterable of thread URLs (strings); each one is fetched with requests.get().
    base_page:   handed through unchanged to go_through_page_posts().

    Raises requests.HTTPError when a thread page answers with a 4xx/5xx status.
    """
    for forum_link in all_a_links:
        print("---------------------NEW THREAD---------------------------")
        print("Currently on link {0}".format(forum_link))
        response = requests.get(forum_link)
        # raise_for_status() instead of `assert`: asserts are removed when Python runs with -O,
        # and the HTTPError names the failing URL and status code.
        response.raise_for_status()
        thread_soup = BeautifulSoup(response.text,"lxml")
        # Every post is an <li> whose id starts with "post-"; the "L and" guards <li> tags without an id
        all_posts = thread_soup.find_all("li",{"id":lambda L: L and L.startswith("post-")})
        # The thread title is the <h1> inside the <div class="titleBar">
        page_title = thread_soup.find("div",{"class":"titleBar"}).find("h1").text
        go_through_page_posts(all_posts, base_page, page_title)

# Loop through all the POSTS on a LINK
def go_through_page_posts(all_posts, base_page, page_title):
    """Write one CSV row per forum post through the global my_csv_writer.

    all_posts:  iterable of post nodes; each must support find(...) and ['id'].
    base_page:  not used inside this function; kept so existing calls keep working.
    page_title: title of the thread page; becomes the first column of every row.

    Each row written is [page title, user, date-time, post id, cleaned post text].
    """
    for posting in all_posts:
        # Get the user!
        posting_user = posting.find("a",{ "itemprop" : "name" }).string

        # Get the datetime: use the <span class="DateTime"> when there is one,
        # otherwise fall back to the <abbr class="DateTime">
        datetime_node = posting.find("span",{"class" : "DateTime"})
        if not datetime_node:
            datetime_node = posting.find("abbr",{"class" : "DateTime"})
        posting_datetime = datetime_node.string

        # Get the content and clean it!
        posting_text = posting.find("blockquote",{"class": "messageText SelectQuoteContainer ugc baseHtml"}).text.strip()
        # Collapse every run of whitespace (spaces, tabs, newlines) into a single space.
        # The pattern is a raw string (r'...') so the backslash reaches the regex engine untouched;
        # a plain '\s+' relies on an invalid string escape that newer Python versions warn about.
        posting_text_cleaned = re.sub(r'\s+', ' ', posting_text)

        # Get the post's ID according to the forum (every post has a unique ID in this forum)
        posting_id = posting['id']

        # Prepare in list - same column order as the header row of the CSV
        my_row = [page_title,posting_user,posting_datetime,posting_id,posting_text_cleaned]

        # Write the Row
        my_csv_writer.writerow(my_row)
    print("Subpage {0} done".format(page_title)) # Status message

# Part One: File setup code - prepare the CSV file. Write the first (header) row.
# NOTE: the "Output Files" folder must already exist, otherwise open() fails.
# The `with` block makes sure the file is flushed and closed even if one of the downloads
# or parsing steps below raises an error part-way through.
with open("Output Files/my_forum_results.csv","wb") as output_file:
    # my_csv_writer is a global: go_through_page_posts() writes its rows through it
    my_csv_writer = csv.writer(output_file)
    my_csv_writer.writerow(["Page Title","Posting User","Posting Date-Time","Posting ID","Posting Text"])

    # Part Two: Prepare the initial soup based on Page 1
    base_page = "http://forums.civfanatics.com/" # A random forum chosen

    # Separating the "base page" from the "relative link" is really useful for later
    initial_civ_link = urljoin(base_page,"/forums/civ-ideas-suggestions.119/")

    response = requests.get(initial_civ_link)
    # raise_for_status() instead of `assert`: asserts are removed when Python runs with -O
    response.raise_for_status()

    civ_text = response.text

    # Part Three: Find all other page links from our initial_civ_link page
    soup = BeautifulSoup(civ_text,"lxml")

    # Get number of pages to go through: the number is read from the <a> that follows
    # the "PageNavNext" link in the page navigation
    num_of_pages = int(soup.find("a",{"class":"PageNavNext"}).find_next_sibling("a").string)

    # Part Four: Loop through each PAGE.
    # Note that for speed purposes, I've made the range do only a single page [range(1,2)].
    # However, if want to get everything, change that line of code below to:
    # for current_page in range(1,num_of_pages + 1):
    # (range() stops BEFORE its second number, so the + 1 is needed to include the last page)
    # But note that it may take up to 2 hours to complete - tens of thousands of forum posts to download!
    for current_page in range(1,2):
        print("---------------------NEW PAGE---------------------------")
        current_page_link = initial_civ_link + "page-" + str(current_page)
        response = requests.get(current_page_link)
        response.raise_for_status()
        soup = BeautifulSoup(response.text,"lxml")
        # Every thread on the listing page is an <li> whose id starts with "thread-"
        all_forum_page_tags = soup.find_all("li",{"id":lambda L: L and L.startswith("thread-")})

        # Most complex line of code - parsing the links out.
        # For each thread <li>: take the <a> inside its <h3 class="title">, read the href,
        # and join it with base_page to get a full URL.
        all_a_links = [urljoin(base_page, forum_page_tag.find("h3",{"class":"title"}).a['href']) for forum_page_tag in all_forum_page_tags]
        # Finally, call the function that goes through everything inside that page
        go_through_thread_links(all_a_links, current_page_link)

# Leaving the `with` block has closed output_file at this point
print("Done!")