In [None]:
# We'll use requests and BeautifulSoup again in this tutorial:
import requests 
from bs4 import BeautifulSoup

## We'll also use the re module for regular expressions.
import re

In [None]:
## Let's look at this list of state universities in the US:
top_url = 'https://en.wikipedia.org/wiki/List_of_state_universities_in_the_United_States'

# Use requests.get to fetch the HTML at the specific url.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting us parse
# an error page as if it were the list of universities.
response = requests.get(top_url, timeout=30)
response.raise_for_status()

# requests.get returns an object of type Response:
print(type(response))

In [None]:
# The Response holds all the HTML of the URL in response.content (as raw bytes).
# Printing the whole page would flood the output, so preview the first 1,000 bytes:
print(response.content[:1000])

In [None]:
# Create the nested data object using the BeautifulSoup() function.
# Naming the parser explicitly ("html.parser" ships with Python) avoids the
# "no parser was explicitly specified" warning and keeps the parse tree the same
# on every machine, regardless of which optional parsers happen to be installed.
soup = BeautifulSoup(response.content, "html.parser")
print(type(soup))

In [None]:
# The prettify method returns the HTML as an indented string, making our output more readable.
## The example below looks at characters 50,000 - 51,000 of the scraped HTML.
## The slice must be applied to the string *inside* print(): print() itself
## returns None, so slicing its result raises a TypeError on Python 3.
print(soup.prettify()[50000:51000])

In [None]:
# find() returns only the first tag of the requested type, together with
# everything nested inside it -- here, the first <p> element of the page.
first_paragraph = soup.find("p")
first_paragraph

### Exploring and Inspecting a Webpage

Similar to the `find` method, we can use the `find_all` method to find all the tags of a certain type. But what tags are we looking for? We can look at the code for any individual part of an HTML page by clicking on it from within a browser and selecting `inspect`.
![Inspecting an HTML Element](03-images/inspect.png)

### Inspected Elements
This will show you the underlying code that generates this element.

![Results of Inspection](03-images/inspected.png)

You can see that the colleges are list items, meaning they sit within `<li>` tags, and that most of them are also links, meaning they sit within `<a>` tags.

In [None]:
# This gets us somewhere, but there are links in here that are not colleges and some of the colleges do not have links.
# Displaying every match would flood the output, so show how many there are and a small sample:
all_links = soup.find_all("a")
print(len(all_links))
all_links[:25]

In [None]:
# <li> tags narrow things down further, although some matches are still not universities.
list_items = soup.find_all("li")

# Show the container type, then a ten-item window from the middle of the results.
print(type(list_items), list_items[200:210], sep="\n")

In [None]:
# Locate the first and the last university of the list and report their positions.
# Each pair is (text to look for, message to print in front of the index).
markers = (
    ("University of Alabama System", "Index of first university is: "),
    ("University of Wyoming", "Index of last university is: "),
)

for position, item in enumerate(list_items):
    # Search the string form of the item's children, so nested tags are included.
    rendered = str(item.contents)

    for needle, message in markers:
        if needle in rendered:
            print(message + str(position))

In [None]:
# Now we can use those indexes to subset out everything that isn't a university.
# NOTE: these positions belong to one snapshot of the page. Wikipedia pages are
# edited over time, so re-check them against the indexes printed by the search
# above whenever the page is scraped again.
FIRST_UNIVERSITY_INDEX = 71
END_UNIVERSITY_INDEX = 840  # slice end: the item at this position is NOT included
universities = list_items[FIRST_UNIVERSITY_INDEX:END_UNIVERSITY_INDEX]

print(len(universities))
# Print a small sample instead of every tag to keep the output readable:
print(universities[:5])

In [None]:
# We can grab the University Names and URLs for the wikipedia pages for the schools that have them.
# name_list and url_list stay aligned: one entry per item in `universities`,
# with "" in url_list wherever no link could be found.

name_list = []
url_list = []

for uni in universities:

    name_list.append(uni.text)

    # Only the first link inside the list item is considered.
    a_tag = uni.find("a")

    # An <a> tag without an href attribute makes .get("href") return None.
    # Record that as "" too, so url_list only ever contains strings and the
    # later string concatenation / emptiness checks behave predictably.
    ref = a_tag.get("href") if a_tag else None
    url_list.append(ref or "")

# Summarize instead of printing one line per university:
missing = url_list.count("")
print(str(len(url_list) - missing) + " universities have a URL")
print(str(missing) + " universities have no URL")
print(url_list[:10])

In [None]:
import pandas as pd

WIKIPEDIA_BASE_URL = "https://en.wikipedia.org"

# One row per university: its display name, the original <li> tag, and its link.
d = { "name" : pd.Series(name_list),
      "html_tag" : pd.Series(universities),
      "url" : pd.Series(url_list)}

df = pd.DataFrame(d)

# Turn the site-relative links (e.g. "/wiki/...") into absolute URLs.
# Rows without a link must stay empty: prefixing them as well would turn ""
# into the bare site root, which looks like a real URL to anything that later
# checks `url != ""`. Missing (non-string) values are normalized to "" too.
df["url"] = df["url"].map(
    lambda href: WIKIPEDIA_BASE_URL + href if isinstance(href, str) and href else ""
)

# Only the last expression of a cell is displayed, so print the shape explicitly:
print(df.shape)
df.head(10)

In [None]:
# Tally the names that do / do not mention 'College' (missing names count as "no"):
mentions_college = df["name"].str.contains("College", na=False)
mentions_college.value_counts()

In [None]:
# Tally the names that do / do not mention 'University' (missing names count as "no"):
mentions_university = df["name"].str.contains("University", na=False)
mentions_university.value_counts()

## From Scraping to Crawling

You might have noticed that the information we collected with this scraper isn't that interesting. However, it does include a URL for most of the universities we found, and we can scrape those pages as well. On the individual page for each university, there's data on the school type, location, endowment, and founding year, as well as other interesting information that we may be able to get to.

At this point, you'd start to consider our task a basic form of web crawling - the systematic or automated browsing of multiple web pages. This is certainly a simple application of web crawling, but the idea of following hyperlinks from one URL to another is representative.

In [None]:
# Download the Wikipedia page of every university that has one.
# uni_pages ends up with exactly one entry per row of df: the raw HTML (bytes)
# of the page, or "" when there is no page to fetch or the download failed.
WIKIPEDIA_ROOT = "https://en.wikipedia.org"

uni_pages = []

# A Session reuses the underlying connection across the many requests.
with requests.Session() as session:
    for url in df["url"]:
        # No article link: the entry is missing, empty, or nothing but the bare
        # site root (the prefix with no article path after it).
        if not isinstance(url, str) or url in ("", WIKIPEDIA_ROOT):
            uni_pages.append("")
            continue

        try:
            # The timeout keeps one stalled download from hanging the whole crawl.
            resp = session.get(url, timeout=30)
            resp.raise_for_status()
        except requests.RequestException as err:
            # Report the failure and carry on, so a single bad link does not
            # throw away everything downloaded so far.
            print("Could not fetch " + url + ": " + str(err))
            uni_pages.append("")
        else:
            uni_pages.append(resp.content)

In [None]:
## Attach the freshly scraped pages to our pandas dataframe as a new column:
df = df.assign(wikipedia_page=uni_pages)
df.shape

In [None]:
## The new column holds the entire HTML of the Wikipedia page for each university; peek at the first ten:
df["wikipedia_page"].head(10)

In [None]:
# Let's see what we can get from one page.
# (Naming the parser avoids the "no parser was explicitly specified" warning.)
soup = BeautifulSoup(df["wikipedia_page"][0], "html.parser")
table = soup.find("table", {"class" : "infobox"})

# find() returns None when the page has no infobox table (or the page is
# empty); fall back to an empty list instead of raising AttributeError.
rows = table.find_all("tr") if table is not None else []

print(rows)

In [None]:
## Now we can search across these rows for various data of interest:
# Header labels that must match exactly, and the sentence to print for each.
exact_labels = {
    "Type": "The type of this school is ",
    "Location": "The location of this school is ",
    "Website": "The website for this school is ",
}

for row in rows:
    header = row.find("th")
    data = row.find("td")

    # Make sure there was actually both a th and td tag in that row, and that
    # the th is not empty: indexing contents[0] on an empty tag raises IndexError.
    if header is None or data is None or not header.contents:
        continue

    # Only the first child of the header cell is inspected; str() turns it into
    # plain text (or into markup, if that first child is itself a tag).
    label = str(header.contents[0])

    if label in exact_labels:
        print(exact_labels[label] + data.text)

    # The endowment label is matched as a substring rather than exactly.
    if "Endowment" in label:
        print("The endowment for this school is " + data.text)

In [None]:
## Create empty columns of our dataframe to fill with new information:
NEW_COLUMNS = ["type", "location", "website", "established", "endowment"]
for column in NEW_COLUMNS:
    df[column] = ""

# Infobox header labels that must match exactly -> dataframe column.
EXACT_LABELS = {"Type": "type", "Location": "location", "Website": "website"}
# Infobox header labels that only need to contain the word -> dataframe column.
PARTIAL_LABELS = {"Endowment": "endowment", "Established": "established"}


def extract_infobox_fields(page_html):
    """Pull the fields of interest out of one scraped Wikipedia page.

    Parameters
    ----------
    page_html : str or bytes
        HTML of a single page. Anything empty, or not str/bytes, is treated as
        "no page".

    Returns
    -------
    dict
        Maps dataframe column names to the text of the matching infobox cell.
        Only the fields that were found are present; when a label occurs in
        several rows, the last occurrence wins.
    """
    fields = {}
    if not isinstance(page_html, (str, bytes)) or not page_html:
        return fields

    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and no warning is emitted).
    infobox = BeautifulSoup(page_html, "html.parser").find("table", {"class" : "infobox"})
    if infobox is None:
        return fields

    for row in infobox.find_all("tr"):
        header = row.find("th")
        data = row.find("td")

        # Skip rows lacking either cell, and empty header cells (indexing
        # contents[0] on an empty tag would raise IndexError).
        if header is None or data is None or not header.contents:
            continue

        # Compare as text. Encoding to bytes here would make the substring
        # tests below raise TypeError on Python 3 (str is not allowed in bytes).
        label = str(header.contents[0])

        if label in EXACT_LABELS:
            fields[EXACT_LABELS[label]] = data.text

        for needle, column in PARTIAL_LABELS.items():
            if needle in label:
                fields[column] = data.text

    return fields


## Loop over every wikipedia page in our dataframe and populate our new columns with the pertinent data:
extracted = [extract_infobox_fields(page_html) for page_html in df["wikipedia_page"]]

for i, fields in zip(df.index, extracted):
    for column, value in fields.items():
        # df.at writes straight into the dataframe. Chained indexing such as
        # df[column][i] = value may write to a temporary copy instead and
        # silently leave the dataframe unchanged.
        df.at[i, column] = value

In [None]:
## The dataframe now carries far more actionable data than would have been practical to collect by hand.
df.head(200)