In [2]:
from io import BytesIO

import numpy as np
import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

In [3]:
# Warm-up: call a JSON API and decode the body; only the first record is shown.
response = requests.get('https://jsonplaceholder.typicode.com/todos')
response.json()[0]

{'userId': 1, 'id': 1, 'title': 'delectus aut autem', 'completed': False}

# The url to scrape

In [4]:
# Article page used for the first scraping example.
url = 'https://www.reuters.com/article/us-shazam-m-a-apple-eu/eu-clears-apples-purchase-of-shazam-idUSKCN1LM1TZ'

In [5]:
# Echo the URL to confirm what is about to be requested.
url

'https://www.reuters.com/article/us-shazam-m-a-apple-eu/eu-clears-apples-purchase-of-shazam-idUSKCN1LM1TZ'

# Get the response

In [8]:
# Download the page; the repr of the Response shows the HTTP status code.
response = requests.get(url)
response

<Response [200]>

In [9]:
# The article URL serves HTML, not JSON, so decoding is expected to fail.
# Catch the error and display it so "Restart & Run All" is not interrupted;
# the JSON decode errors raised by requests are ValueError subclasses.
try:
    response.json()
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [10]:
# Raw body of the response, as bytes.
response.content

b'<!DOCTYPE html><html lang="en"><head><meta name="viewport" content="width=device-width"/><meta charSet="utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=edge"/><meta http-equiv="x-dns-prefetch-control" content="on"/><meta name="robots" content="index, follow"/><meta name="twitter:site" content="@Reuters"/><meta name="twitter:creator" content="@Reuters"/><meta name="twitter:card" content="summary_large_image"/><meta name="msapplication-TileColor" content="#ff8000"/><meta name="msapplication-config" content="none"/><meta name="theme-color" content="#ffffff"/><meta property="article:publisher" content="https://www.facebook.com/Reuters"/><link rel="dns-prefetch" href="//s1.reutersmedia.net"/><link rel="dns-prefetch" href="//s2.reutersmedia.net"/><link rel="dns-prefetch" href="//s3.reutersmedia.net"/><link rel="dns-prefetch" href="//s4.reutersmedia.net"/><link rel="dns-prefetch" href="//static.reuters.com"/><link rel="dns-prefetch" href="//www.googletagservices.com"/><link rel="dns-

In [11]:
# Confirm that `.content` is a bytes object.
type(response.content)

bytes

In [12]:
# Same body decoded to a str.
response.text

'<!DOCTYPE html><html lang="en"><head><meta name="viewport" content="width=device-width"/><meta charSet="utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=edge"/><meta http-equiv="x-dns-prefetch-control" content="on"/><meta name="robots" content="index, follow"/><meta name="twitter:site" content="@Reuters"/><meta name="twitter:creator" content="@Reuters"/><meta name="twitter:card" content="summary_large_image"/><meta name="msapplication-TileColor" content="#ff8000"/><meta name="msapplication-config" content="none"/><meta name="theme-color" content="#ffffff"/><meta property="article:publisher" content="https://www.facebook.com/Reuters"/><link rel="dns-prefetch" href="//s1.reutersmedia.net"/><link rel="dns-prefetch" href="//s2.reutersmedia.net"/><link rel="dns-prefetch" href="//s3.reutersmedia.net"/><link rel="dns-prefetch" href="//s4.reutersmedia.net"/><link rel="dns-prefetch" href="//static.reuters.com"/><link rel="dns-prefetch" href="//www.googletagservices.com"/><link rel="dns-p

# Get the html

In [None]:
# Keep the raw bytes of the page; this is what gets handed to BeautifulSoup.
html = response.content
html

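If `bs4` is not installed, run `%pip install beautifulsoup4` (the `%pip` magic installs into the environment of the running kernel, unlike `!pip install beautifulsoup4`).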

https://groups.google.com/forum/#!topic/beautifulsoup/rfyHGk0UjKU

In [None]:
from bs4 import BeautifulSoup

# Do the soup

In [None]:
# Parse the HTML into a navigable tree. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers (lxml, html5lib)
# happen to be installed, and so bs4 does not emit GuessedAtParserWarning.
soup = BeautifulSoup(html, 'html.parser')
soup

In [None]:
# Quick check that the soup can be searched: all <h1> tags in the page.
soup.find_all('h1')

# With the soup, you'll be able to walk through the HTML tree. 
## Looking for tag h1

In [None]:
# find_all collects every matching tag in the document.
soup.find_all('h1')

It returns a list of tags.

In [None]:
# Index into the result to get a single tag (raises IndexError if there is no <h1>).
soup.find_all('h1')[0]

In [None]:
# find() returns only the first match (or None when nothing matches).
tag_h1 = soup.find('h1')
tag_h1

In [None]:
# Inspect what kind of object find() handed back.
type(tag_h1)

In [None]:
# The text inside the tag, without the surrounding markup.
tag_h1.text

In [None]:
# Reminder of the URL the recap cell below will request.
url

In [None]:
# Recap of the whole pipeline in one cell:
# request -> raw HTML bytes -> soup -> first <h1> -> its text.
response = requests.get(url)
# Fail loudly on a 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
html = response.content
# Explicit parser: same tree on every machine and no GuessedAtParserWarning.
soup = BeautifulSoup(html, 'html.parser')
tag_h1 = soup.find('h1')
tag_h1.text

## Looking for tag a

In [None]:
# All anchor tags in the page. Note that find_all returns a collection of
# tags, not a single tag.
tag_a = soup.find_all('a')
tag_a

In [None]:
# `tag_a` is the collection returned by find_all, which has no `.text`
# attribute of its own; read the text of each tag individually.
[tag.text for tag in tag_a]

In [None]:
# The find_all result has no `.attrs` either; collect the href of every tag.
# `.get` gives None for anchors without an href instead of raising KeyError.
[tag.attrs.get('href') for tag in tag_a]

In [None]:
from tqdm.auto import tqdm

In [None]:
# Gather the href of every <a> tag, with a progress bar over the tags.
# Anchors that have no href contribute None.
tags = soup.find_all('a')
result = [tag.attrs.get('href') for tag in tqdm(tags)]

result

In [None]:
# Every <a> tag in the document.
soup.find_all('a')

Again, a list of tags

## Searching Tags and specifying attributes

In [None]:
# First <span> whose class is `trustBadgeUrl`; `class_` is the keyword form
# of filtering on the `class` attribute.
tag_span = soup.find('span', class_='trustBadgeUrl')
tag_span

In [None]:
# Searches can be nested: look for the first <a> inside the span and read its text.
tag_span.find('a').text

In [None]:
# Same nested lookup, this time reading the link target from the tag's attribute dict.
tag_span.find('a').attrs['href']

## Getting tag attributes

In [None]:
# Rebind `tag_a` to a single tag: the first <a> inside the span found earlier.
tag_a = tag_span.find('a')
tag_a

In [None]:
# Option 1: index the attribute dict directly (KeyError if the attribute is absent).
tag_a.attrs['href']

In [None]:
# Option 2: index the tag itself, a shorthand for the attribute dict lookup.
tag_a['href']

In [None]:
# Option 3: always get a list back, whether the attribute holds one value or several.
tag_a.get_attribute_list('href')

In [None]:
# Option 4: dict-style `.get`, which yields None when the attribute is missing.
tag_a.get('href')

## Searching for a list of tags

In [None]:
# Passing a list of names matches any of them: all <h1>, <p> and <a> tags.
soup.find_all(['h1','p', 'a'])

----

# Another example

<ol>
    <li> request the url </li>
    <li> get its html </li>
    <li> convert to soup </li>
    <li> once it is a soup, we can search it for tags </li>
</ol>
    

In [None]:
# Second example: a Wikipedia page that contains an HTML table.
url = 'https://en.wikipedia.org/wiki/List_of_European_countries_by_life_expectancy'

In [None]:
# Step 1: request the URL; the Response repr shows the status code.
response = requests.get(url)
response

In [None]:
# Step 2: take the raw HTML bytes from the response.
html = response.content

In [None]:
# Step 3: turn the HTML into a soup. Naming the parser keeps the parsed tree
# identical across environments and avoids bs4's GuessedAtParserWarning.
soup = BeautifulSoup(html, 'html.parser')
soup

## Find the `table` tag whose `class=sortable wikitable`

In [None]:
# Select the first table carrying both CSS classes. A CSS selector matches
# regardless of the order the classes appear in the HTML, whereas comparing
# against the literal string 'sortable wikitable' only matches that exact
# class attribute value.
table = soup.select_one('table.sortable.wikitable')
# Fail here with a clear message rather than with an AttributeError on None
# in a later cell.
assert table is not None, 'no table with classes "sortable" and "wikitable" found'

## get the table header `<th>`

In [None]:
# Column names: the stripped text of every <th> cell in the table.
headers = [th.text.strip() for th in table.find_all('th')]

## get the table data `<td>`

In [None]:
# Cell values: the stripped text of every <td>, as one flat list in document order.
data = [td.text.strip() for td in table.find_all('td')]


In [None]:
# Fold the flat list of cells back into rows. The column count comes from the
# header row instead of a hard-coded 3, and integer division gives the row
# count directly as an int. reshape raises ValueError if the number of cells
# is not a multiple of the number of columns.
ncols = len(headers)
nrows = len(data) // ncols

reshaped_array = np.array(data).reshape(nrows, ncols)

## Build a DataFrame from the headers and the reshaped data

In [None]:
# Combine the reshaped cells with the header names into a DataFrame.
pd.DataFrame(reshaped_array, columns=headers)

# Pandas to the rescue!

In [None]:
# read_html returns a list of DataFrames, one per table it finds; take the first.
pd.read_html(url)[0]

In [None]:
# Narrow the result to tables whose attributes match `attrs`.
# NOTE(review): the class value is presumably compared as written, so a table
# marked up as "wikitable sortable" may not match — confirm against the page.
pd.read_html(url, attrs={'class' : 'sortable wikitable'})

--------

# Some problems that may occur

## Forbidden requests

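Some sites refuse the request that `pd.read_html` makes when it is given a URL. Pandas can also `read_html` from HTML content you have already downloaded yourself: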

In [None]:
# Let pandas download the page itself.
# NOTE(review): given the "Forbidden requests" heading, this call is presumably
# expected to be rejected by the site — confirm.
url = 'https://www.hltv.org/results'
pd.read_html(url)

In [None]:
# Download the page with requests and let pandas parse the result. The bytes
# are wrapped in BytesIO because passing literal HTML strings/bytes to
# read_html is deprecated; a file-like object is the supported form.
# All tables found on the page are stacked into one DataFrame.
pd.concat(pd.read_html(BytesIO(requests.get(url).content)))

## Headers to fake the browser you're using

In [None]:
# Request the page with the default headers that requests sends.
url = 'https://www.hybrid-analysis.com/recent-submissions?filter=file&sort=^timestamp'
response = requests.get(url)
response

In [None]:
# Same request with default headers, this time looking at the <h1> tags of
# whatever page comes back, for comparison with the request that sends a
# browser User-Agent.
url = 'https://www.hybrid-analysis.com/recent-submissions?filter=file&sort=^timestamp'
response = requests.get(url)
# Explicit parser so the comparison does not depend on the installed parsers.
soup = BeautifulSoup(response.content, 'html.parser')
soup.find_all('h1')

In [None]:
# Ask httpbin's /headers endpoint to report the request headers it received,
# to see what requests sends by default.
requests.get('http://httpbin.org/headers').json()

In [None]:
# Override the User-Agent and check the headers reported back by the same endpoint.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'}
requests.get('http://httpbin.org/headers', headers=headers).json()

In [None]:
# Repeat the request while presenting a full browser User-Agent string, then
# look at the <h1> tags of the page returned this time.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
response = requests.get(url, headers=headers)
# Explicit parser: consistent tree across environments, no GuessedAtParserWarning.
soup = BeautifulSoup(response.content, 'html.parser')
soup.find_all('h1')