# Webscraping Tutorial 

In [1]:
from urllib.request import urlopen

In [2]:
# open the webpage 
url = "http://olympus.realpython.org/profiles/aphrodite"
page = urlopen(url)
page

<http.client.HTTPResponse at 0x16a745ce950>

In [3]:
# extract the html and decode to a strin 
html_bytes = page.read()
html = html_bytes.decode("utf-8")
print(html)


<html>
<head>
<title>Profile: Aphrodite</title>
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/aphrodite.gif" />
<h2>Name: Aphrodite</h2>
<br><br>
Favorite animal: Dove
<br><br>
Favorite color: Red
<br><br>
Hometown: Mount Olympus
</center>
</body>
</html>



## Extract Text From HTML With String Methods 

In [4]:
find_str = "<title>"
title_index = html.find(find_str)
start_index = title_index + len(find_str)
end_index = html.find("</title>")
html[start_index:end_index]

'Profile: Aphrodite'

In [6]:
#  Here’s another profile page with some messier HTML that you can scrape:
url = "http://olympus.realpython.org/profiles/poseidon"
page = urlopen(url)
html = page.read().decode("utf-8")
start_index = html.find("<title>") + len("<title>")
end_index = html.find("</title>")
title = html[start_index:end_index]
title

# html of Poseidon has additional whitespace after <title > 
# -> html.find("<title>") returns -1, bc exact substring does not exist 

'\n<head>\n<title >Profile: Poseidon'

## Using Regular Expressions 

In [7]:
import re

In [16]:
# the asterisk character (*) stands for zero or more instances 
# of whatever comes just before the asterisk.
re.findall("ab*c", "abbc  ac abcac")

['abbc', 'ac', 'abc', 'ac']

In [18]:
# Notice that if no match is found, then .findall() returns an empty list
re.findall("ab*c", "ABC")

[]

In [19]:
# ignore case sensitivity 
re.findall("ab*c", "ABC", re.IGNORECASE)


['ABC']

In [24]:
# use a period (.) to stand for any single character in a regular expression 
print(re.findall("a.c", "abc"))
print(re.findall("a.c", "abbc"))

print(re.findall("a.c", "ac"))
print(re.findall("a.*c", "ac"))

['abc']
[]
[]
['ac']


In [26]:
# use .* for any character repeated any number of times 
# -> find any substring starting with "a", ending with "c"
re.findall("a.*c", "abdbc")
re.findall("a.*c", "acc")


['acc']

### use re.search() to find pattern 
returns a MatchObject

In [33]:
match_results = re.search("ab*c", "ABC ac", re.IGNORECASE)
# match_results
match_results.group()

'ABC'

### Replace Text in a string with re.sub()

In [35]:
# greed matching pattern * -> find longest match for <....>
string = "Everything is <replaced> if it's in <tags> end."
string = re.sub("<.*>", "ELEPHANTS", string)
string

'Everything is ELEPHANTS end.'

In [36]:
# non greedy matching pattern: *?
# works same way as * expect that it matches the shortest possible string of text
string = "Everything is <replaced> if it's in <tags>."
string = re.sub("<.*?>", "ELEPHANTS", string)
string

"Everything is ELEPHANTS if it's in ELEPHANTS."

## Extract Text From HTML With Regular Expressions

In [40]:

import re
from urllib.request import urlopen

url = "http://olympus.realpython.org/profiles/dionysus"
page = urlopen(url)
html = page.read().decode("utf-8")

pattern = "<title.*?>.*?</title.*?>"
match_results = re.search(pattern, html, re.IGNORECASE)
title = match_results.group()
title = re.sub("<.*?>", "", title)  # remoce html tags
title

'Profile: Dionysus'

In [71]:
url = "http://olympus.realpython.org/profiles/dionysus"
page = urlopen(url)
html = page.read().decode("utf-8")

patterns = ["(?<=Name:).*", "(?<=Favorite Color:).*"]
for pattern in patterns: 
    match_results = re.search(pattern, html)
    # print(match_results.group())
    # print(re.findall(pattern, html))
    title = re.findall(pattern, html)[0]
    title = re.sub("<.*?>", "", title)
    print(title)

 Dionysus
 Wine


## Beautifulsoup - Webscraping an HTML 

In [94]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

url = "http://olympus.realpython.org/profiles/dionysus"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
print(soup.get_text())



Profile: Dionysus





Name: Dionysus

Hometown: Mount Olympus

Favorite animal: Leopard 

Favorite Color: Wine






In [99]:
# return a list of all <img> tags in the HTML document -> tag object by BeatitfulSoup
soup.find_all("img")

[<img src="/static/dionysus.jpg"/>, <img src="/static/grapes.png"/>]

In [100]:
# each tag object has a .name property containing the html tag type 
image1, image2 = soup.find_all("img")
image1.

'img'

In [105]:
# access the HTML attributes of the Tag object just like keys in a dictionary
image1["src"]

'/static/dionysus.jpg'

In [107]:
# retrieve the string of the title tag
soup.title.string

'Profile: Dionysus'

In [126]:
base_url = "http://olympus.realpython.org"
url = "http://olympus.realpython.org/profiles"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
for link in soup.find_all("a"): 
    print(base_url + link["href"])

http://olympus.realpython.org/profiles/aphrodite
http://olympus.realpython.org/profiles/poseidon
http://olympus.realpython.org/profiles/dionysus


## Interact with HTML Forms - Mechanical Soup

In [27]:
# create a headless web browser 
# can be used to request a page from the Internet by passing an URL 
import mechanicalsoup
browser = mechanicalsoup.Browser()

In [28]:
# get a response from the browser: 200 means successfull, 404 = URL does not exist 
url = "http://olympus.realpython.org/login"
page = browser.get(url)
page

<Response [200]>

In [29]:
# the page has a .soup attribute, which represents a BeautifulSoup object
# notice: this page has a <form> on it for username and password 
page.soup

<html>
<head>
<title>Log In</title>
</head>
<body bgcolor="yellow">
<center>
<br/><br/>
<h2>Please log in to access Mount Olympus:</h2>
<br/><br/>
<form action="/login" method="post" name="login">
Username: <input name="user" type="text"/><br/>
Password: <input name="pwd" type="password"/><br/><br/>
<input type="submit" value="Submit"/>
</form>
</center>
</body>
</html>

In [19]:
import mechanicalsoup

# 1
browser = mechanicalsoup.Browser()
url = "http://olympus.realpython.org/login"
login_page = browser.get(url)
login_html = login_page.soup

# 2
form = login_html.select("form")[0]
form.select("input")[0]["value"] = "zeus"
form.select("input")[1]["value"] = "ThunderDude"

# 3
profiles_page = browser.submit(form, login_page.url)


In [20]:
profiles_page.url

'http://olympus.realpython.org/profiles'

## Interact With Websites in Real Time

In [46]:
import mechanicalsoup
import time
 
browser = mechanicalsoup.Browser()

for i in range(4):
    page = browser.get("http://olympus.realpython.org/dice")
    tag = page.soup.select("#result")[0]
    result = tag.text
    print(f"The result of your dice roll is: {result}")
    # Wait 10 seconds if this isn't the last request
    if i < 3:
        time.sleep(4)


The result of your dice roll is: 3
The result of your dice roll is: 2
The result of your dice roll is: 4
The result of your dice roll is: 5


In [43]:
page.soup.select("h2 ")

[<h2 id="result">1</h2>]

In [None]:
<h2 id="result">1</h2>

## Speedtest Website

In [12]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Send a GET request to the website
url = 'https://fast.com/'
page = urlopen(url)
html = page.read().decode("utf-8")

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find the element containing the download speed
speed_element = soup.find('strong', {'id': 'speed-value'})

if speed_element:
    # Extract the download speed value
    speed = speed_element.text.strip()
    print('Download Speed:', speed)
else:
    print('Unable to retrieve download speed.')


Unable to retrieve download speed.


In [48]:
import mechanicalsoup
import time
from bs4 import BeautifulSoup
 
browser = mechanicalsoup.Browser()
url = 'https://fast.com/'



page = browser.get(url)
time.sleep(4)
tag = page.soup.select("#speed-value")[0]
result = tag.text
print(f"Your internet speed is: {result}")
# Wait 10 seconds if this isn't the last request
# if i < 3:
#     time.sleep(4)

Your internet speed is: 
                                0
                            


In [None]:
# import requests
# from bs4 import BeautifulSoup

# # Send a GET request to the website
# response = requests.get('https://fast.com/')

# # Parse the HTML content using BeautifulSoup
# soup = BeautifulSoup(response.content, 'html.parser')

# # Find the element containing the download speed
# speed_element = soup.find('strong', {'id': 'speed-value'})

# if speed_element:
#     # Extract the download speed value
#     speed = speed_element.text.strip()
#     print('Download Speed:', speed)
# else:
#     print('Unable to retrieve download speed.')