# Web scraping

https://www.baeldung.com/cs/web-crawling-vs-web-scraping

**Web crawling**: automatically searching and indexing web content and other data over the web. Web crawlers scan every page of a website to understand its content, so that the information can be retrieved, updated, and indexed for when users perform search queries. <u>The goal of a crawler is to understand the content of a website</u>.

<img src="example_datasets/Media/web-scraping/web-crawling.png">

**Web scraping**: the procedure of gathering and examining raw data from the internet. <u>Web scraping aims to convert specific website content into a structured format, such as tables, JSON, databases, and XML representations</u>.

<img src="example_datasets/Media/web-scraping/web-scraping.png">


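Methods in web scraping:
- Pandas: simple method; extracts tables from a website;
- urllib: downloads the HTML, which is then parsed manually;
- requests: third-party library for downloading web pages;
- BeautifulSoup: library designed for parsing HTML pages;
- Selenium: automates a web browser (here Chrome) to load a page and read its elements.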

## Pandas

In [None]:
import pandas as pd

# Wikipedia page listing The Simpsons episodes (seasons 1-20)
simpsons_url = 'https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(seasons_1%E2%80%9320)'

# read_html returns a list of DataFrames, one for each table found on the page
df = pd.read_html(simpsons_url)

# Report how many tables were extracted
n_tables = len(df)
print(n_tables)

# Season 1
df[1]

23


Unnamed: 0,No.overall,No. inseason,Title,Directed by,Written by,Original air date,Prod.code,U.S. viewers(millions)
0,1,1,"""Simpsons Roasting on an Open Fire""",David Silverman,Mimi Pond,"December 17, 1989",7G08,26.7[46]
1,2,2,"""Bart the Genius""",David Silverman,Jon Vitti,"January 14, 1990",7G02,24.5[46]
2,3,3,"""Homer's Odyssey""",Wes Archer,Jay Kogen & Wallace Wolodarsky,"January 21, 1990",7G03,27.5[47]
3,4,4,"""There's No Disgrace Like Home""",Gregg Vanzo & Kent Butterworth,Al Jean & Mike Reiss,"January 28, 1990",7G04,20.2[48]
4,5,5,"""Bart the General""",David Silverman,John Swartzwelder,"February 4, 1990",7G05,27.1[49]
5,6,6,"""Moaning Lisa""",Wes Archer,Al Jean & Mike Reiss,"February 11, 1990",7G06,27.4[50]
6,7,7,"""The Call of the Simpsons""",Wes Archer,John Swartzwelder,"February 18, 1990",7G09,27.6[51]
7,8,8,"""The Telltale Head""",Rich Moore,"Al Jean, Mike Reiss, Sam Simon & Matt Groening","February 25, 1990",7G07,28[52]
8,9,9,"""Life on the Fast Lane""",David Silverman,John Swartzwelder,"March 18, 1990",7G11,33.5[53]
9,10,10,"""Homer's Night Out""",Rich Moore,Jon Vitti,"March 25, 1990",7G10,30.3[54]


## urllib

In [None]:
from urllib.request import urlopen

url = "http://olympus.realpython.org/profiles/aphrodite"
# Open the webpage; the context manager closes the connection when done
with urlopen(url) as page:
    # Extract the raw HTML (bytes) from the webpage
    html_bytes = page.read()
# Decode the bytes into text
html = html_bytes.decode("utf-8")
print(html)

# Extract the webpage title by locating the opening and closing <title> tags
open_tag, close_tag = "<title>", "</title>"
title_index = html.find(open_tag)
start_index = title_index + len(open_tag)
# Look for the closing tag only after the opening one
end_index = html.find(close_tag, start_index)
if title_index == -1 or end_index == -1:
    # str.find returns -1 when a tag is missing; slicing with that value
    # would silently produce a wrong title
    title = ""
else:
    title = html[start_index:end_index]
print(f"Title: {title}")



<html>
<head>
<title>Profile: Aphrodite</title>
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/aphrodite.gif" />
<h2>Name: Aphrodite</h2>
<br><br>
Favorite animal: Dove
<br><br>
Favorite color: Red
<br><br>
Hometown: Mount Olympus
</center>
</body>
</html>

Title: Profile: Aphrodite


In [None]:
from urllib.request import urlopen
import re

url = "http://olympus.realpython.org/profiles/dionysus"
# The context manager closes the connection once the HTML has been read
with urlopen(url) as page:
    html = page.read().decode("utf-8")
print(html)
# Find title: tolerate spaces inside the tags and any letter case (e.g. "<TITLE >")
title = re.findall(r'< *title *>(.*?)< */title[ /]*>', html, re.IGNORECASE)
print(title)
# Entries: runs of letters, digits and spaces on both sides of a colon ("key: value")
entries = re.findall(r'[a-zA-Z0-9 ]*:[a-zA-Z0-9 ]*', html)
print(entries)


<html>
<head>
<TITLE >Profile: Dionysus</title  / >
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/dionysus.jpg" />
<h2>Name: Dionysus</h2>
<img src="/static/grapes.png"><br><br>
Hometown: Mount Olympus
<br><br>
Favorite animal: Leopard <br>
<br>
Favorite Color: Wine
</center>
</body>
</html>

['Profile: Dionysus']
['Profile: Dionysus', 'Name: Dionysus', 'Hometown: Mount Olympus', 'Favorite animal: Leopard ', 'Favorite Color: Wine']


In [None]:
from urllib import request

# Remote file to download and the local path where it is stored
URL = 'https://files.rcsb.org/view/3L56.pdb'
destination = 'example_datasets/3L56.pdb'

# urlretrieve saves the file and returns a (local filename, headers) tuple
response = request.urlretrieve(URL, filename=destination)


## requests

In [None]:
import requests

url = "http://olympus.realpython.org/profiles/aphrodite"

# requests applies no timeout by default, so an unresponsive server would
# block this cell forever; give up after 10 seconds instead
page = requests.get(url, timeout=10)
# Raise an exception on HTTP errors (4xx/5xx) instead of printing an error page
page.raise_for_status()
print(page.text)


<html>
<head>
<title>Profile: Aphrodite</title>
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/aphrodite.gif" />
<h2>Name: Aphrodite</h2>
<br><br>
Favorite animal: Dove
<br><br>
Favorite color: Red
<br><br>
Hometown: Mount Olympus
</center>
</body>
</html>



In [None]:
import requests

urls = [
    "http://olympus.realpython.org/profiles/aphrodite",
    "http://olympus.realpython.org/profiles/dionysus",
]

# A Session reuses the underlying connection for requests to the same host;
# the timeout keeps an unresponsive server from blocking the cell forever
with requests.Session() as session:
    results = [session.get(url, timeout=10) for url in urls]
results


[<Response [200]>, <Response [200]>]

## BeautifulSoup

This library is used to parse HTML from web pages.

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

url = "http://olympus.realpython.org/profiles"
# Open URL and read HTML from the page; the connection is closed on exit
with urlopen(url) as page:
    html = page.read().decode("utf-8")
# Create BeautifulSoup object
soup = BeautifulSoup(html, "html.parser")
print(soup)

# Get title (the whole tag first, then only its text)
print(soup.title)
print(soup.title.string)
# Get main text, collapsing runs of newlines into a single one
text = soup.get_text()
text = re.sub(r'\n+', '\n', text)
print(text)
# Find all images
print(soup.find_all('img'))
# Get links
paths = soup.find_all('a')
print(paths)
# Keep only anchors that carry an href; indexing a bare <a> would raise KeyError
paths2 = [anchor["href"] for anchor in paths if anchor.has_attr("href")]
print(paths2)


<html>
<head>
<title>All Profiles</title>
</head>
<body bgcolor="yellow">
<center>
<br/><br/>
<h1>All Profiles:</h1>
<br/><br/>
<h2>
<a href="/profiles/aphrodite">Aphrodite</a>
<br/><br/>
<a href="/profiles/poseidon">Poseidon</a>
<br/><br/>
<a href="/profiles/dionysus">Dionysus</a>
</h2>
</center>
</body>
</html>

<title>All Profiles</title>
All Profiles

All Profiles
All Profiles:
Aphrodite
Poseidon
Dionysus

[]
[<a href="/profiles/aphrodite">Aphrodite</a>, <a href="/profiles/poseidon">Poseidon</a>, <a href="/profiles/dionysus">Dionysus</a>]
['/profiles/aphrodite', '/profiles/poseidon', '/profiles/dionysus']


In [None]:
# The Guardian news collection
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

url = 'https://www.theguardian.com/world'
# Download the page; the context manager closes the connection afterwards
with urlopen(url) as page:
    html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
# print(soup)


def _first_child_text(node):
    """Return the stripped text of the first child of *node*, or '' if there is none."""
    if node is None or not node.contents:
        return ''
    first = node.contents[0]
    # Plain text children are str subclasses; a nested tag needs get_text()
    if isinstance(first, str):
        return first.strip()
    return first.get_text(strip=True)


# Every news teaser is looked up by its container <div> class
stuff = soup.find_all('div', class_="fc-item__container")

links, headlines, texts = [], [], []
for item in stuff:
    ### LINKS
    anchor = item.find('a')
    # Skip containers without a usable link instead of crashing on None
    if anchor is None or not anchor.has_attr('href'):
        continue
    links.append(anchor['href'])
    ### HEADLINE
    headlines.append(_first_child_text(item.find('span', class_="js-headline-text")))
    ### TEXT
    texts.append(_first_child_text(item.find('div', class_="fc-item__standfirst")))

# The three lists always have equal length, so the table can be built in one go
df = pd.DataFrame({'Headline': headlines, 'Text': texts, 'Link': links})

# df.to_csv('output/TheGuardian_headlines2.csv', sep=',', index=False)

df


Unnamed: 0,Headline,Text,Link
0,Six journalists arrested over footage of South...,State broadcaster’s staff held on suspicion of...,https://www.theguardian.com/world/2023/jan/06/...
1,Killing of LGBTQ+ activist prompts outcry over...,Body of fashion designer and model Edwin Chilo...,https://www.theguardian.com/world/2023/jan/06/...
2,UN envoy calls for release of jailed journalis...,"Concerns raised over health of Pape Alé Niang,...",https://www.theguardian.com/global-development...
3,Zimbabwe court denies opposition MP pre-trial ...,"In an election year, critics of the ruling Zan...",https://www.theguardian.com/global-development...
4,Nine killed in New Year’s Eve crush in Ugandan...,Survivor says pushing began after fireworks ou...,https://www.theguardian.com/world/2023/jan/01/...
5,Blair government had misgivings about Mandela ...,Files show Downing Street felt former South Af...,https://www.theguardian.com/uk-news/2022/dec/3...
6,Jair Bolsonaro wrecked Brazil’s presidential p...,Journalist touring residence with new first la...,https://www.theguardian.com/world/2023/jan/06/...
7,Sinaloa cartel launches violent response as Me...,Ovidio Guzmán’s arrest on Thursday prompted he...,https://www.theguardian.com/world/2023/jan/05/...
8,Briton shot dead in Jamaica was victim of cont...,Jamaican police say fatal shooting of Sean Pat...,https://www.theguardian.com/world/2023/jan/05/...
9,Number of populist world leaders at 20-year low,After Bolsonaro’s defeat and Duterte’s departu...,https://www.theguardian.com/world/2023/jan/05/...


## Selenium

In [None]:
import os
import time
from urllib import request
from urllib.request import urlopen
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
# from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from timeit import default_timer as timer
import re


# URL of the fasta file
url = "https://www.ncbi.nlm.nih.gov/nuccore/NC_056059.1?report=fasta&log$=seqview&format=text&from=86393909&to=86405669"
# Location of the webdriver for google chrome (joined with os.path so the separators fit the current OS)
# NOTE(review): the '.exe' suffix looks Windows-specific - confirm the file name on other systems
chromedriver_path = os.path.join(os.getcwd(), 'modules', 'chromedriver108.exe')

# Characters allowed in a sequence line (built once, outside the loop)
nucleotides = set('tcag')

# create selenium webdriver object
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)
try:
    driver.get(url)

    # Give the webpage time to load
    # ### Explicitly state to wait for 5 seconds
    # time.sleep(5)
    ### Wait until the body element is present (at most 100 seconds)
    start = timer()
    wait = WebDriverWait(driver, 100)
    wait.until(ec.presence_of_element_located((By.XPATH, "/html/body")))
    stop = timer()
    print(f"Wait time (sec): {stop-start}")

    ### Read the whole body text
    text = driver.find_element(By.XPATH, "/html/body").text
    # print( text )
finally:
    # quit() ends the whole browser session and the chromedriver process, even
    # when loading failed; close() would only close the current window
    driver.quit()

### OPTION: Find lines that don't belong in the FASTA file
a = text.split('\n')
for i in a:
    # Header lines start with '>' and are always accepted
    if i.startswith('>'):
        continue
    # Any other line may only consist of nucleotide letters
    if not set(i.lower()).issubset(nucleotides):
        print(f'Invalid line!   {i}')

# Write the output in a file
with open('output/file.fasta', 'w', encoding='utf-8') as output:
    output.write(text)

Wait time (sec): 2.0244813000317663
Invalid!   Downloading Large Sequence: 0.00MB (COMPLETE)


In [None]:
""" Script to extract headlines and news descriptions from xxx, output them to csv with links """

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import pandas as pd
from selenium.webdriver.chrome.options import Options


website = "https://www.thesun.co.uk/sport/football"
# path = "C:/Users/evgen/Desktop/Python/chromedriver"
path = 'modules/chromedriver108'

### OPTION 1: open the browser
service = Service(executable_path=path)
driver = webdriver.Chrome(service=service)

### OPTION 2: run in the headless mode, without opening the browser
### (use these lines instead of the two lines of OPTION 1)
# options = Options()
# options.add_argument("--headless=new")
# service = Service(executable_path=path)
# driver = webdriver.Chrome(service=service, options=options)

everything, titles, subtitles, links = [], [], [], []

try:
    driver.get(website)

    # Every news teaser is wrapped in its own container <div>
    containers = driver.find_elements(by="xpath", value='//div[@class="teaser__copy-container"]')

    for container in containers:
        # Locate all the elements of a website that we are interested in.
        # find_elements returns an empty list when nothing matches, so a
        # teaser with an unexpected layout cannot abort the whole run.
        anchors = container.find_elements(by="xpath", value='./a')
        if not anchors:
            continue
        anchor = anchors[0]
        # Complete text of the teaser link
        everything.append(anchor.text)
        # Title element is ./a/h2 (not collected at the moment)
        # title = container.find_element(by="xpath", value='./a/h2').text
        # titles.append(title)
        # Subtitle element is ./a/p; store '' when the teaser has none so
        # that subtitles and links stay aligned
        paragraphs = container.find_elements(by="xpath", value='./a/p')
        subtitles.append(paragraphs[0].text if paragraphs else '')
        # Target of the teaser link
        links.append(anchor.get_attribute("href"))
finally:
    # Always end the browser session, even when scraping failed
    driver.quit()

# print(everything)
# print(titles)
# print(subtitles)
# print(links)


# Export the elements in a CSV format
my_dict = {'subtitle': subtitles, 'link': links}
df_headlines = pd.DataFrame(my_dict)
df_headlines.to_csv('output/news_headlines.csv')

In [None]:
""" Scrape news from the newspaper """


