In [13]:
#https://realpython.com/python-web-scraping-practical-introduction/

#One useful package for web scraping that you can find in Python’s standard library is urllib,
#which contains tools for working with URLs.
#In particular, the urllib.request module contains a function called urlopen() that can be used to open a URL within a program.

from urllib.request import urlopen

# Fetch the Aphrodite profile page.
url = "http://olympus.realpython.org/profiles/aphrodite"

# Open the response in a `with` block so it is closed as soon as the body has
# been read, instead of staying open for the lifetime of the kernel.
with urlopen(url) as page:
    html_bytes = page.read()  # raw response body as bytes

# Decode the body as UTF-8 to get the page source as a str.
html = html_bytes.decode("utf-8")

print(html)

<html>
<head>
<title>Profile: Aphrodite</title>
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/aphrodite.gif" />
<h2>Name: Aphrodite</h2>
<br><br>
Favorite animal: Dove
<br><br>
Favorite color: Red
<br><br>
Hometown: Mount Olympus
</center>
</body>
</html>



In [24]:
#Extract Text From HTML With String Methods

# Pull out the text between the <title> tags using only str.find() and slicing.
opening_tag = "<title>"
title_index = html.find(opening_tag)          # index where the opening tag begins
start_index = title_index + len(opening_tag)  # first character after the opening tag
end_index = html.find("</title>")             # index where the closing tag begins

# Everything between the two tags is the title text.
title = html[start_index:end_index]
title

'Profile: Aphrodite'

In [30]:
# Repeat the str.find() approach on the Poseidon profile.
url = "http://olympus.realpython.org/profiles/poseidon"

# The `with` block closes the response once the body has been read.
with urlopen(url) as page:
    html = page.read().decode("utf-8")

# The recorded result still contains markup ('\n<head>\n<title >...'): this page
# writes its opening tag as "<title >", so find("<title>") returns -1 and the
# slice starts in the wrong place.
start_index = html.find("<title>") + len("<title>")
end_index = html.find("</title>")
title = html[start_index:end_index]
title



'\n<head>\n<title >Profile: Poseidon'

In [31]:

# Display the response object; the recorded output shows it is an http.client.HTTPResponse.
page

<http.client.HTTPResponse at 0x1a33fc58898>

In [29]:
# Read from the response object again and decode whatever comes back.
# The recorded result is an empty string — presumably because the body had
# already been consumed by an earlier read().
html_bytes = page.read()
html = str(html_bytes, encoding="utf-8")
html

''

In [32]:
#A Primer on Regular Expressions

import re

In [33]:
# `b*` matches zero or more "b" characters, so "ac" (no "b" at all) still matches.
re.findall("ab*c", "ac")

['ac']

In [34]:
# findall() scans the whole string, so the pattern is found inside "abcd" and
# only the matching part, "abc", is returned.
re.findall("ab*c", "abcd")

['abc']

In [37]:
# Matching is case-sensitive by default, so the uppercase "ABC" yields no matches.
re.findall("ab*c", "ABC")


[]

In [38]:
# Passing re.IGNORECASE makes the match case-insensitive, so "ABC" now matches.

re.findall("ab*c", "ABC", re.IGNORECASE)

['ABC']

In [39]:
# You can use a period (.) to stand for any single character in a regular
# expression, so "a.c" matches "abc".

re.findall("a.c", "abc")

['abc']

In [40]:
# "." matches exactly one character, so the two "b"s in "abbc" prevent a match.
re.findall("a.c", "abbc")

[]

In [41]:
# The pattern .* inside a regular expression stands for any character (except a
# newline, by default) repeated any number of times, including zero.
re.findall("a.*c", "abc")

['abc']

In [42]:
# There's one more function in the re module that's useful for parsing out text:
# re.sub(), short for "substitute", replaces the text in a string that matches
# a regular expression with new text.
#
# `.*` is greedy: it takes the longest possible match, so "<.*>" spans from the
# first "<" to the last ">" and that whole stretch is replaced in one go.

string = "Everything is <replaced> if it's in <tags>."
string = re.sub("<.*>", "ELEPHANTS", string)
string

'Everything is ELEPHANTS.'

In [43]:
# "<.*?>" — the ? after * makes the repetition non-greedy, so each match is the
# shortest possible "<...>" and every tag is replaced separately.

string = "Everything is <replaced> if it's in <tags>."
string = re.sub("<.*?>", "ELEPHANTS", string)
string

"Everything is ELEPHANTS if it's in ELEPHANTS."

In [44]:
import re
from urllib.request import urlopen

# Extract the page title with a regular expression instead of str.find().
url = "http://olympus.realpython.org/profiles/dionysus"

# The `with` block closes the HTTP response once the body has been read.
with urlopen(url) as page:
    html = page.read().decode("utf-8")

# Tolerate sloppy markup: the `.*?` after each tag name allows stray characters
# inside the tags (e.g. "<TITLE >" or "</title  / >"), and re.IGNORECASE accepts
# any capitalisation of the tag name.
pattern = "<title.*?>.*?</title.*?>"
match_results = re.search(pattern, html, re.IGNORECASE)

# re.search() returns None when nothing matches; fail with a clear message
# rather than an AttributeError on .group().
if match_results is None:
    raise ValueError(f"No <title> element found at {url}")

title = match_results.group()
title = re.sub("<.*?>", "", title)  # Remove HTML tags

print(title)


Profile: Dionysus


In [48]:
# Show the page source as a string repr, where line breaks appear as \n.
html

'<html>\n<head>\n<TITLE >Profile: Dionysus</title  / >\n</head>\n<body bgcolor="yellow">\n<center>\n<br><br>\n<img src="/static/dionysus.jpg" />\n<h2>Name: Dionysus</h2>\n<img src="/static/grapes.png"><br><br>\nHometown: Mount Olympus\n<br><br>\nFavorite animal: Leopard <br>\n<br>\nFavorite Color: Wine\n</center>\n</body>\n</html>\n'

In [49]:
from urllib.request import urlopen

# Fetch the Dionysus profile page.
url = "http://olympus.realpython.org/profiles/dionysus"

# Open the response in a `with` block so it is closed as soon as the body has
# been read, instead of staying open for the lifetime of the kernel.
with urlopen(url) as page:
    html_bytes = page.read()  # raw response body as bytes

# Decode the body as UTF-8 to get the page source as a str.
html = html_bytes.decode("utf-8")

print(html)

<html>
<head>
<TITLE >Profile: Dionysus</title  / >
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/dionysus.jpg" />
<h2>Name: Dionysus</h2>
<img src="/static/grapes.png"><br><br>
Hometown: Mount Olympus
<br><br>
Favorite animal: Leopard <br>
<br>
Favorite Color: Wine
</center>
</body>
</html>



In [52]:
# Extract the contents of the first <h2> element from the page source.
pattern = "<h2.*?>.*?</h2.*?>"
match_results = re.search(pattern, html, re.IGNORECASE)

# re.search() returns None when nothing matches; fail with a clear message
# rather than an AttributeError on .group().
if match_results is None:
    raise ValueError("No <h2> element found in the page HTML")

title = match_results.group()
title = re.sub("<.*?>", "", title)  # Remove HTML tags
title = re.sub("Name: ", "", title)  # Remove the "Name: " label, leaving only the name

print(title)

Dionysus


In [53]:
from urllib.request import urlopen

url = "http://olympus.realpython.org/profiles/dionysus"

# The `with` block closes the HTTP response once the body has been read.
with urlopen(url) as html_page:
    html_text = html_page.read().decode("utf-8")


# For each label, print the text that follows it, up to the next HTML tag.
for label in ["Name: ", "Favorite Color:"]:
    label_start_idx = html_text.find(label)
    if label_start_idx == -1:
        # str.find() signals "not found" with -1; slicing from that index
        # would silently print unrelated text, so report it and move on.
        print(f"{label!r} not found")
        continue
    text_start_idx = label_start_idx + len(label)

    # Search from text_start_idx directly instead of slicing a copy of the page.
    text_end_idx = html_text.find("<", text_start_idx)
    if text_end_idx == -1:
        # No tag follows the label: take everything up to the end of the page.
        text_end_idx = len(html_text)

    raw_text = html_text[text_start_idx:text_end_idx]
    clean_text = raw_text.strip(" \r\n\t")
    print(clean_text)

Dionysus
Wine


In [56]:
pip install beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [59]:
#Run pip show to see the details of the package you just installed:
pip show beautifulsoup4

SyntaxError: invalid syntax (<ipython-input-59-c96c7464fe3c>, line 2)

In [61]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Download the Dionysus profile and parse it with Beautiful Soup.
url = "http://olympus.realpython.org/profiles/dionysus"

# The `with` block closes the HTTP response once the body has been read.
with urlopen(url) as page:
    html = page.read().decode("utf-8")

# "html.parser" selects the HTML parser from Python's standard library.
soup = BeautifulSoup(html, "html.parser")

In [62]:
# get_text() returns only the document's text, with the HTML tags stripped out.
print(soup.get_text())



Profile: Dionysus





Name: Dionysus

Hometown: Mount Olympus

Favorite animal: Leopard 

Favorite Color: Wine






In [63]:
# Unpack the <img> tags into two names; this works only because the page
# contains exactly two images.
image1, image2 = soup.find_all("img")

In [64]:
# .name gives the tag's name.
image1.name

'img'

In [65]:
# Tag attributes are read with dictionary-style indexing.
image1["src"]

'/static/dionysus.jpg'

In [66]:
# The src attribute of the second image on the page.
image2["src"]

'/static/grapes.png'

In [67]:
# Tags can be reached as attributes of the soup; this is the document's <title> element.
soup.title

<title>Profile: Dionysus</title>

In [68]:
#Beautiful Soup automatically cleans up the tags for you by removing the extra space
#in the opening tag and the extraneous forward slash (/) in the closing tag.


In [70]:
# You can also retrieve just the string between the title tags with the
# .string attribute of the Tag object:
soup.title.string

'Profile: Dionysus'

In [71]:
# One of the more useful features of Beautiful Soup is the ability to search for
# specific kinds of tags whose attributes match certain values. Here: every
# <img> tag whose src attribute equals "/static/dionysus.jpg".
soup.find_all("img", src="/static/dionysus.jpg")

[<img src="/static/dionysus.jpg"/>]

In [72]:
#BeautifulSoup is great for scraping data from a website’s HTML, but it doesn’t provide any way to work with HTML forms. For example, if you need to search a website for some query and then scrape the results, then BeautifulSoup alone won’t get you very far.


In [80]:
from urllib.request import urlopen

from urllib.parse import urljoin

base_url = "http://olympus.realpython.org"

# The `with` block closes the HTTP response once the body has been read.
with urlopen(base_url + "/profiles") as html_page:
    html_text = html_page.read().decode("utf-8")

soup = BeautifulSoup(html_text, "html.parser")

# href=True restricts the search to anchors that actually have an href
# attribute, so link["href"] cannot raise KeyError.
for link in soup.find_all("a", href=True):
    # urljoin() resolves the href against the base URL, which handles both
    # relative paths and hrefs that are already absolute URLs.
    link_url = urljoin(base_url, link["href"])
    print(link_url)

http://olympus.realpython.org/profiles/aphrodite
http://olympus.realpython.org/profiles/poseidon
http://olympus.realpython.org/profiles/dionysus
