In [None]:
# Truthiness demo: None is falsy, so the else branch below is the one that runs.
my_variable = None
# The right-hand side of this assignment was missing, which is a SyntaxError and
# prevented the whole cell from running. None is used as a neutral placeholder.
the_variable = None
if my_variable:
    print("Hello!")
else:
    print("Goodbye!")

In [None]:
# Once we download an HTML file, how do we extract the data we want? A single webpage can easily be 20 pages worth of
# HTML code, and oftentimes up to 100 pages. Thus, simply downloading the webpage code and copy-pasting is horribly
# inefficient.

# Let's do this again
import requests

# Download the sample page. The timeout stops the cell from hanging forever if the
# server never answers (requests waits indefinitely by default).
my_response_object = requests.get(
    "http://mason.gmu.edu/~jlee17/python_workshop_files/example_data/index-very-simple.html",
    timeout=30,
)
# Fail loudly on a 4xx/5xx reply instead of quietly treating an error page as our data.
my_response_object.raise_for_status()
html_text = my_response_object.text
print(html_text)

In [None]:
# Jupyter Notebook comes with the all-powerful bs4 library. By convention, just importing the BeautifulSoup function
# is considered the simplest method. Thus the syntax from .... import ....
# Without this syntax (if I just did import bs4), we'd need bs4.BeautifulSoup() instead of BeautifulSoup()
# Maybe a bit silly here, but this is the standard convention.

from bs4 import BeautifulSoup

# Time to turn the raw HTML into SOUP!
# BeautifulSoup takes two arguments here: first the HTML text itself, and second the
# name of the "parser" that should read it. "html.parser" is fine for now; you will
# rarely, if ever, need a different one.
soup = BeautifulSoup(html_text, features="html.parser")

# prettify() hands back the parsed document as a nicely indented string
pretty_html = soup.prettify()
print(pretty_html)

# .text holds only the text of the document, with the tags stripped out
plain_text = soup.text
print(plain_text)

In [None]:
# What else lives inside our soup object?
# A soup object carries many attributes that are filled in for us by default.
# .title returns the <title> element in its entirety.
title_tag = soup.title

print("Here is the title, tag PLUS text: {0}".format(title_tag))
# <title>A Sample Title</title>

# When only the name of the element is wanted, read its 'name' attribute.
print("Here is the title, tag ONLY: {0}".format(title_tag.name))
# .text gives just the text inside the element
print(title_tag.text)

In [None]:
# More basics: walking to a parent and grabbing the first element of a given tag
parent_message = "Here is the title tag's PARENT's tag NAME: {0}"
print(parent_message.format(soup.title.parent.name))
# 'head'

first_p_message = "Here is the first entire 'p' element: {0}"
print(first_p_message.format(soup.p))
# <p>Here is a paragraph. But it doesn't have any attributes that you can capture!</p>

# Attribute lookup on a tag uses dictionary syntax (left disabled here):
#print("Here is the value of the id attribuate for the first 'div' element: {0}".format(soup.div['id']))
# all_the_lists

first_a_message = "Here is the first entire 'a' element: {0}"
print(first_a_message.format(soup.a))
# <a href="http://www.google.com">Google</a>

In [None]:
# Difference between .text and .string
# .string just gets what's in that element's immediate text area
# .text is far more complete: it combines the current element plus all subelements together and merges all
# their values.

In [None]:
# soup.title is all well and good, because an HTML page only ever has ONE title element.
# But what about an element like <a>, which can appear many, many times (and matters a lot to us)?
# soup.a is fine when only the very first hyperlink is of interest, but HTML can get REALLY dense.
# How do we quickly reach elements that sit deep in the document?
# Remember: every node (element) in the soup tree carries the same methods itself.

# find_all() exists on every soup object and searches all elements BELOW that element in the tree.
# Called on the soup itself, it collects ALL 'a' TAGS in the ENTIRE DOCUMENT.
all_a_tags = soup.find_all('a')

# The result is a ResultSet, which can be looped over exactly like a list.
for link in all_a_tags:
    print("----------------")
    # The whole tag
    print(link)
    # Only the href attribute of the tag
    href = link['href']
    print(href)
    # The text of the tag
    print(link.string)
    # The name of the tag itself (a)
    print(link.name)

# Example:
# <a href="http://www.google.com">Google</a>
# http://www.google.com
# Google
# a

In [None]:
# find() returns only the first 'a' element found in the document
my_a = soup.find(name='a')
print(my_a)

In [None]:
# find vs find_all: 'find' stops at the FIRST match below the root, scanning top to bottom,
# while 'find_all' returns (obviously) every match below the root, top to bottom.
# An id is always unique, though, so using find with an id is pretty safe.

# Attribute filters are passed as a dictionary of attribute name -> required value
required_tags = {'id': 'file_links'}

book_title_element = soup.find_all("div", attrs=required_tags)

for matching_div in book_title_element:
    print(matching_div.prettify())

In [None]:
# Ok, so how might we find only a select group of hyperlinks from ALL the hyperlinks?
# In this case, JUST the hyperlinks in the first list?
# One common way is chaining a find_all() after a find() for the div/ul element.
# Also note that if you're only selecting by a single attribute (such as id here), you can
# just write the dictionary inline if you like.
ul_tag = soup.find("ul",{"id":"the_first_list"})
# find() returns None when nothing matches; fall back to an empty list so the loop below
# simply prints nothing instead of raising AttributeError.
a_elements = ul_tag.find_all("a") if ul_tag is not None else []

# Sketch of downloading a binary file (e.g. a PDF). It is kept as a comment because the URL is
# only a placeholder and my_file is not opened anywhere before this cell, so running these
# lines on a fresh kernel raised NameError. Open my_file in "wb" mode before writing to it.
# response = requests.get("http://whatever.com")
# my_pdf = response.content
# my_file.write(my_pdf)

for a_element in a_elements:
    print(a_element['href'])
    print(a_element.string)

In [None]:
# Advanced Concept - Lambda Functions!

# Lambda functions take some explaining. Basically, they are miniature, one-time-use functions with no name.
# They receive one variable (named id_value below, but any name works); here BeautifulSoup passes in the
# value of the attribute, which is a string.
# We want the function to return TRUE when the element is a match and FALSE when it is not.
# Imagine a for loop going over every 'ul' element in the file:

# all_the_ul_tags = soup.find_all("ul")
# selected_ul_elements = []
# for ul in all_the_ul_tags:
#     if ul['id'] and ul['id'].endswith("_list"):
#         selected_ul_elements.append(ul['id'])

# That is what the lambda below does. Five lines of code in one! Much cleaner once the syntax is familiar.
# BeautifulSoup knows to build a list out of only the elements for which the function was TRUE.
# In plain words, the line below says:
# "Find me all the 'ul' elements whose id attribute ends with "_list".
# The "id_value and" at the start is a fancy way of making sure the id exists - without it,
# a ul with no id attribute would make Python produce an error.

selected_ul_elements = soup.find_all(
    "ul",
    attrs={'id': lambda id_value: id_value and id_value.endswith("_list")},
)

for ul_element in selected_ul_elements:
    print("-----------------------")
    print(ul_element)
    print("-----------------------")
    # Every hyperlink nested inside this particular list
    links_in_list = ul_element.find_all("a")
    print(links_in_list)

In [None]:
# Another tool you'll find useful is urljoin()
# Oftentimes you get what are known as relative links, which need to be joined with the base link to make a full hyperlink
# The cool part about urljoin() (vs. string concatenation) is that it doesn't care about making sure the slashes match
from urllib.parse import urljoin

# Plain string concatenation keeps both slashes, so the result contains "//" before the page name
base_with_slash = "http://www.google.com/"
my_link = base_with_slash + "/some_random_page.html"
print(my_link)

# urljoin() works out the slash between the base and the relative link by itself
joined_link = urljoin("http://www.google.com","some_random_page.html")
print(joined_link)

In [None]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import unicodecsv as csv
import progressbar

# Scrape one listing page per value in 1996-2016 from ogc.osd.mil/doha/industrial and write
# one CSV row per case: case number, keywords, date, summary and link.
bar = progressbar.ProgressBar()

# Relative case links are resolved against this base; it never changes, so it is set once
# here rather than on every pass through the loop.
base_link = "http://www.dod.mil"

# The with-block closes (and flushes) the CSV even when a request or a lookup fails part-way.
with open("clearance_stuff_3.csv","wb") as my_file:
    my_ss = csv.writer(my_file)

    for value in bar(range(1996,2017)):
        hyperlink = 'http://ogc.osd.mil/doha/industrial/' + str(value) + '.html'
        # The timeout keeps a dead server from hanging the cell; raise_for_status() replaces
        # the former assert, which is skipped entirely when Python runs with -O.
        response = requests.get(hyperlink, timeout=30)
        response.raise_for_status()
        the_html = response.text
        soup = BeautifulSoup(the_html,"html.parser")
        my_main_div = soup.find("div",{'class':'case-list'})
        all_the_cases = my_main_div.find_all("div",{'class':'case'})

        for current_case in all_the_cases:
            case_number = current_case.find("div",{'class':'casenum'})
            case_number_text = case_number.text
            a_tag = case_number.find('a')

            # Cases without a hyperlink fall back to the bare base link
            if a_tag:
                a_tag_link = urljoin(base_link,a_tag['href'])
            else:
                a_tag_link = base_link

            # The keywords are written as one raw string; the old code also split them into a
            # list that was never used, so that dead computation has been dropped.
            keywords_text = current_case.find("div",{'class':'keywords'}).text
            date_text = current_case.find("p",{'class':'date'}).text
            summary_text = current_case.find("p",{'class':'digest'}).text.strip()
            my_ss.writerow([case_number_text,keywords_text,date_text,summary_text,a_tag_link])

print("Done!")

In [None]:
# pprint was never imported anywhere in the notebook, so this cell raised NameError.
import pprint

# my_dict is assigned in the PokeAPI cell further down (it holds the last parsed response),
# so that cell has to be run before this one.
pprint.pprint(my_dict['stats'][0]['effort'])

In [None]:
import requests
import unicodecsv as csv
# Write name, weight and four base stats for Pokemon ids 1-151 to pokemon.csv.
# The with-block closes the file even when a request fails half-way through the loop.
with open("pokemon.csv","wb") as file:
    csv_file = csv.writer(file)
    csv_file.writerow(['Name','Weight','Speed','Defense','Attack','HP'])
    for idx in range(1,152):
        # The timeout avoids hanging forever; raise_for_status() replaces the former assert,
        # which is skipped entirely when Python runs with -O.
        response = requests.get("http://pokeapi.co/api/v2/pokemon/" + str(idx), timeout=30)
        response.raise_for_status()
        my_dict = response.json()
        # Look each stat up by its name instead of by list position (previously indexes
        # 0, 3, 4 and 5), so the columns stay correctly labelled whatever order the API
        # returns the 'stats' list in.
        base_stats = {entry['stat']['name']: entry['base_stat'] for entry in my_dict['stats']}
        csv_file.writerow([my_dict['name'],
                           my_dict['weight'],
                           base_stats['speed'],
                           base_stats['defense'],
                           base_stats['attack'],
                           base_stats['hp']])
print("Done!")
