## Load in the necessary libraries

In [1]:
import requests
from bs4 import BeautifulSoup as bs

## Load the webpage content

In [2]:
# Load the webpage content
path = requests.get("https://keithgalli.github.io/web-scraping/example.html")

# Convert to a beatiful soup object
soup = bs(path.content)

# Print out our HTML
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start using Beautiful soup to Scrape

### find and find_all

In [3]:
# The method .find only finds the first element
first_header = soup.find("h2")
# print(first_header)

# The method .find_all returns all the h2 elements as a list
headers = soup.find_all("h2")
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [4]:
# Pass in a list of elements to look for
first_header = soup.find(["h1", "h2"])
# print(first_header)

headers = soup.find_all(["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [5]:
# You can pass in attributes to the find/find_all function
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [6]:
# You can nest find/find_all calls
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [7]:
# We can search specific strings in our find/find_all calls
import re

paragraph = soup.find_all("p", string=re.compile("(s|S)ome"))
paragraph

headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### Select (CSS selector)
for more information about CSS selectors [click_here](https://www.w3schools.com/cssref/css_selectors.asp)

In [8]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [9]:
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [10]:
# Select all the p elements that are after h2
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [11]:
# Select all the p elements. Then look up for the id in those elements
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [12]:
# look up in the body for all p tags elements
paragraphs = soup.select("body > p")
print(paragraphs)

for paragraph in paragraphs:
    print(paragraph.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [13]:
# Grab by element with specific property
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

### Get different properties of the HTML

In [14]:
# For one element use .string
header = soup.find("h2")
header.string


# If multiple child elements use get_text
div = soup.find("div")
print(div.prettify())
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [15]:
# Get a specifict property from an element
link = soup.find("a")
link['href']

paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']

'paragraph-id'

## Code Navigation

In [16]:
# Path Syntax
print(soup.body.prettify())

soup.body.div.h1.string

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



'HTML Webpage'

In [17]:
# Know the terms: Parent, Sibling, Child

soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Exercise 1: Grab all social links on webpage in 3 different ways

### Load the Webpage

In [25]:
# Load the webpage content
url = "https://keithgalli.github.io/web-scraping/"
path = requests.get(url + "webpage.html")

# Convert to a beatiful soup object
soup = bs(path.content)

# Print out our HTML
# print(soup.prettify())


### Grab all of the social links from the webpage
Do this in at least 3 different ways

In [19]:
# links = soup.find_all('a', string=re.compile("keithgalli"))
# actual_links = [link['href'] for link in links]
# actual_links


# ulist = soup.find('ul', attrs={"class": "socials"})
# links = ulist.find_all("a")
# actual_links = [link['href'] for link in links]
# actual_links


# links = soup.select("li.social a")
# actual_links = [link['href'] for link in links]
# actual_links


# links = soup.select(".socials a")
# actual_links = [link['href'] for link in links]
# actual_links
# links

## Exercise 2: Scrape an HTML table into Pandas Dataframe

In [20]:
import pandas as pd

table = soup.select("table.hockey-stats")[0]

# columns = table.find_all("th")
columns = table.find("thead").find_all("th")
column_names = [c.string for c in columns]

table_rows = table.find("tbody").find_all("tr")
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [str(tr.get_text()).strip() for tr in td]
    l.append(row)

df = pd.DataFrame(l, columns=column_names)
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


## Exercise 3: Grab all fun facts that contain the word "is"

In [21]:
facts = soup.select("ul.fun-facts li")
facts_with_is = [fact.find(string=re.compile("is")) for fact in facts]
facts_with_is = [fact.find_parent().get_text() for fact in facts_with_is if fact]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

## Exercise 4: Use beautiful soup to help download an image

In [31]:
img = soup.select("div.row div.column img")
img_url = img[0]['src']
# print(img_url)
full_url = url + img_url

img_data = requests.get(full_url).content
with open('lake_como.jpg', 'wb') as handler:
    handler.write(img_data)

## Exercise 5: Solve the mystery challenge

In [37]:
files = soup.select("div.block a")
relative_files = [f['href'] for f in files]

for f in relative_files:
    full_url = url + f
    page = requests.get(full_url)
    bs_page = bs(page.content)
    secret_word_element = bs_page.find("p", attrs={"id": "secret-word"})
    secret_word = secret_word_element.string
    print(secret_word)

Make
sure
to
smash
that
like
button
and
subscribe
!!!
